├── projects ├── mmdet3d_plugin │ ├── core │ │ ├── __init__.py │ │ └── bbox │ │ │ ├── coders │ │ │ ├── __init__.py │ │ │ └── multi_task_bbox_coder.py │ │ │ ├── assigners │ │ │ ├── __init__.py │ │ │ └── hungarian_assigner_3d.py │ │ │ ├── match_costs │ │ │ ├── __init__.py │ │ │ └── match_cost.py │ │ │ └── util.py │ ├── mmcv_custom │ │ ├── ops │ │ │ ├── __init__.py │ │ │ └── voxel │ │ │ │ ├── __init__.py │ │ │ │ └── spconv_voxelize.py │ │ ├── runner │ │ │ ├── __init__.py │ │ │ └── hooks │ │ │ │ ├── __init__.py │ │ │ │ └── optimizer.py │ │ └── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ └── custom_nuscenes_dataset.py │ ├── models │ │ ├── detectors │ │ │ ├── __init__.py │ │ │ └── fstr.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── cmt_transformer.py │ │ │ └── petr_transformer.py │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ └── voxelnext.py │ │ ├── __init__.py │ │ └── dense_heads │ │ │ ├── __init__.py │ │ │ └── fstr_head.py │ └── __init__.py └── configs │ └── lidar │ ├── fstr_voxel0075_cbgs_20e.py │ ├── fstr_large_voxel0075_cbgs_20e.py │ └── fstr_xlarge_voxel0050_cbgs_20e.py ├── tools ├── dist_train.sh ├── dist_test.sh ├── test.py └── train.py ├── .gitignore ├── README.md └── LICENSE /projects/mmdet3d_plugin/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .voxel import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/__init__.py: -------------------------------------------------------------------------------- 1 | from .hooks import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | from .runner import * 2 | from .ops import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .custom_nuscenes_dataset import CustomNuScenesDataset -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/voxel/__init__.py: -------------------------------------------------------------------------------- 1 | from .spconv_voxelize import SPConvVoxelization -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimizer import CustomFp16OptimizerHook -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .fstr import FSTRDetector 2 | __all__ = ['FSTRDetector'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .cmt_transformer import * 2 | from .petr_transformer import * 3 | 
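The plugin `__init__.py` files above and below exist so that a single import of `projects.mmdet3d_plugin` registers every custom dataset, detector, head, backbone, hook and match cost with the mmdet/mmdet3d registries. A minimal sketch of how a config can trigger that import, assuming mmcv's `custom_imports` mechanism (the shipped configs are not reproduced here and may instead rely on the `PYTHONPATH` exported in `tools/dist_train.sh`):

```python
# Hypothetical config fragment (illustrative only; not one of the shipped configs).
# Importing the plugin package runs the register_module() decorators, so types such as
# 'FSTRDetector', 'VoxelNextEncoder', 'FSTRHead' and 'CustomNuScenesDataset' can then
# be referenced by name elsewhere in the config.
custom_imports = dict(
    imports=['projects.mmdet3d_plugin'],
    allow_failed_imports=False,
)
```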
-------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .voxelnext import VoxelNextEncoder 2 | __all__ = ['VoxelNextEncoder'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/__init__.py: -------------------------------------------------------------------------------- 1 | from .multi_task_bbox_coder import MultiTaskBBoxCoder 2 | 3 | __all__ = ['MultiTaskBBoxCoder'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/__init__.py: -------------------------------------------------------------------------------- 1 | from .hungarian_assigner_3d import HungarianAssigner3D 2 | 3 | __all__ = ['HungarianAssigner3D'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * 2 | from .detectors import * 3 | from .dense_heads import * 4 | from .utils import * 5 | 6 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .fstr_head import ( 2 | FSTRHead, 3 | SeparateTaskHead, 4 | ) 5 | 6 | __all__ = ['SeparateTaskHead', 'FSTRHead'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from .core.bbox.assigners import * 2 | from .core.bbox.coders import * 3 | from .core.bbox.match_costs import BBox3DL1Cost 4 | from .datasets import * 5 | from .mmcv_custom import * 6 | from .models import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py: -------------------------------------------------------------------------------- 1 | from mmdet.core.bbox.match_costs import build_match_cost 2 | from .match_cost import BBox3DL1Cost, BBoxBEVL1Cost, IoU3DCost 3 | 4 | __all__ = ['build_match_cost', 'BBox3DL1Cost', 'BBoxBEVL1Cost', 'IoU3DCost'] 5 | -------------------------------------------------------------------------------- /tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --seed 0 \ 20 | --launcher pytorch ${@:3} 21 | -------------------------------------------------------------------------------- /tools/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29500} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | 
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from mmcv.runner.hooks.optimizer import Fp16OptimizerHook 6 | from mmcv.runner.hooks import HOOKS 7 | 8 | 9 | @HOOKS.register_module() 10 | class CustomFp16OptimizerHook(Fp16OptimizerHook): 11 | 12 | def __init__(self, 13 | custom_fp16={}, 14 | *args, 15 | **kwargs): 16 | super(CustomFp16OptimizerHook, self).__init__(*args, **kwargs) 17 | self.custom_fp16 = custom_fp16 18 | 19 | def before_run(self, runner) -> None: 20 | super().before_run(runner) 21 | for module_name, v in self.custom_fp16.items(): 22 | runner.model.module._modules[module_name].fp16_enabled = v 23 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 3 | 4 | 5 | @MATCH_COST.register_module() 6 | class BBox3DL1Cost(object): 7 | """BBox3DL1Cost. 8 | Args: 9 | weight (int | float, optional): loss_weight 10 | """ 11 | 12 | def __init__(self, weight=1.): 13 | self.weight = weight 14 | 15 | def __call__(self, bbox_pred, gt_bboxes): 16 | """ 17 | Args: 18 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 19 | (cx, cy, w, h), which are all in range [0, 1]. Shape 20 | [num_query, 4]. 21 | gt_bboxes (Tensor): Ground truth boxes with normalized 22 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 
23 | Returns: 24 | torch.Tensor: bbox_cost value with weight 25 | """ 26 | bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) 27 | return bbox_cost * self.weight 28 | 29 | 30 | @MATCH_COST.register_module() 31 | class BBoxBEVL1Cost(object): 32 | def __init__(self, weight): 33 | self.weight = weight 34 | 35 | def __call__(self, bboxes, gt_bboxes, pc_range): 36 | pc_start = bboxes.new(pc_range[0:2]) 37 | pc_range = bboxes.new(pc_range[3:5]) - bboxes.new(pc_range[0:2]) 38 | # normalize the box center to [0, 1] 39 | normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range 40 | normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range 41 | reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1) 42 | return reg_cost * self.weight 43 | 44 | 45 | @MATCH_COST.register_module() 46 | class IoU3DCost(object): 47 | def __init__(self, weight): 48 | self.weight = weight 49 | 50 | def __call__(self, iou): 51 | iou_cost = - iou 52 | return iou_cost * self.weight -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.ipynb 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | tmp/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | .figs 30 | 31 | mmdetection3d/ 32 | mmdetection3d 33 | mmdet3d 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | hostfile.txt 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .env 94 | .venv 95 | env/ 96 | venv/ 97 | ENV/ 98 | env.bak/ 99 | venv.bak/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | 114 | # cython generated cpp 115 | data 116 | ckpts 117 | .vscode 118 | .idea 119 | 120 | # custom 121 | nuscenes_gt_database 122 | nuscenes_unified_gt_database 123 | work_dirs 124 | *.pkl 125 | *.pkl.json 126 | *.log.json 127 | work_dirs/ 128 | exps/ 129 | *~ 130 | mmdet3d/.mim 131 | 132 | # Pytorch 133 | *.pth 134 | 135 | 136 | # demo 137 | figs 138 | 139 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import numpy as np 4 | import mmdet3d 5 | from mmdet3d.core import limit_period 6 | 7 | 8 | def normalize_bbox(bboxes, pc_range=None): 9 | 10 | cx = bboxes[..., 0:1] 11 | cy = bboxes[..., 1:2] 12 | cz = bboxes[..., 2:3] 13 | w = bboxes[..., 3:4].log() 14 | l = bboxes[..., 4:5].log() 15 | h = bboxes[..., 5:6].log() 16 | 17 | rot = bboxes[..., 6:7] 18 | if bboxes.size(-1) > 7: 19 | vx = bboxes[..., 7:8] 20 | vy = bboxes[..., 8:9] 21 | normalized_bboxes = torch.cat( 22 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 23 | ) 24 | else: 25 | normalized_bboxes = torch.cat( 26 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 27 | ) 28 | return normalized_bboxes 29 | 30 | 31 | def denormalize_bbox(normalized_bboxes, pc_range=None): 32 | # rotation 33 | rot_sine = normalized_bboxes[..., 6:7] 34 | 35 | rot_cosine = normalized_bboxes[..., 7:8] 36 | rot = torch.atan2(rot_sine, rot_cosine) 37 | 38 | # center in the bev 39 | cx = normalized_bboxes[..., 0:1] 40 | cy = normalized_bboxes[..., 1:2] 41 | cz = normalized_bboxes[..., 4:5] 42 | 43 | # size 44 | w = normalized_bboxes[..., 2:3] 45 | l = normalized_bboxes[..., 3:4] 46 | h = normalized_bboxes[..., 5:6] 47 | 48 | w = w.exp() 49 | l = l.exp() 50 | h = h.exp() 51 | 52 | if normalized_bboxes.size(-1) > 8: 53 | # velocity 54 | vx = normalized_bboxes[..., 8:9] 55 | vy = normalized_bboxes[..., 9:10] 56 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) 57 | else: 58 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) 59 | return denormalized_bboxes 60 | 61 | 62 | def bbox3d_mapping_back(bboxes, rot_degree, scale_factor, flip_horizontal, flip_vertical): 63 | """Map bboxes from testing scale to original image scale. 
64 | 65 | Args: 66 | bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. 67 | scale_factor (float): Scale factor. 68 | flip_horizontal (bool): Whether to flip horizontally. 69 | flip_vertical (bool): Whether to flip vertically. 70 | 71 | Returns: 72 | :obj:`BaseInstance3DBoxes`: Boxes mapped back. 73 | """ 74 | new_bboxes = bboxes.clone() 75 | if flip_horizontal: 76 | new_bboxes.flip('horizontal') 77 | if flip_vertical: 78 | new_bboxes.flip('vertical') 79 | new_bboxes.scale(1 / scale_factor) 80 | new_bboxes.rotate(-rot_degree) 81 | 82 | return new_bboxes -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/voxel/spconv_voxelize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import numpy as np 4 | from torch import nn 5 | from spconv.pytorch.utils import PointToVoxel # spconv-cu111 2.1.21 6 | import torch 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import _pair 9 | 10 | 11 | class SPConvVoxelization(nn.Module): 12 | def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels, num_point_features, device=torch.device("cuda")): 13 | super().__init__() 14 | assert len(voxel_size) == 3 15 | assert len(point_cloud_range) == 6 16 | self.voxel_size = np.array(voxel_size) 17 | self.point_cloud_range = np.array(point_cloud_range) 18 | self.max_num_points = max_num_points 19 | self.num_point_features = num_point_features 20 | self.device = device 21 | if isinstance(max_voxels, tuple): 22 | self.max_voxels = max_voxels 23 | else: 24 | self.max_voxels = _pair(max_voxels) 25 | self.voxel_generator = PointToVoxel( 26 | vsize_xyz=voxel_size, 27 | coors_range_xyz=point_cloud_range, 28 | max_num_points_per_voxel=max_num_points, 29 | max_num_voxels=self.max_voxels[0], 30 | num_point_features=num_point_features, 31 | device=device, 32 | ) 33 | grid_size = (self.point_cloud_range[3:6] - self.point_cloud_range[0:3]) / np.array(voxel_size) 34 | self.grid_size = np.round(grid_size).astype(np.int64) 35 | 36 | def train(self, mode: bool = True): 37 | if mode: 38 | self.voxel_generator = PointToVoxel( 39 | vsize_xyz=self.voxel_size.tolist(), 40 | coors_range_xyz=self.point_cloud_range.tolist(), 41 | max_num_points_per_voxel=self.max_num_points, 42 | max_num_voxels=self.max_voxels[0], 43 | num_point_features=self.num_point_features, 44 | device=self.device, 45 | ) 46 | else: 47 | self.voxel_generator = PointToVoxel( 48 | vsize_xyz=self.voxel_size.tolist(), 49 | coors_range_xyz=self.point_cloud_range.tolist(), 50 | max_num_points_per_voxel=self.max_num_points, 51 | max_num_voxels=self.max_voxels[1], 52 | num_point_features=self.num_point_features, 53 | device=self.device, 54 | ) 55 | 56 | return super().train(mode) 57 | 58 | def forward(self, points): 59 | voxel_output = self.voxel_generator(points) 60 | voxels, coordinates, num_points = voxel_output 61 | return torch.clone(voxels), torch.clone(coordinates), torch.clone(num_points) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + '(' 65 | tmpstr += 'voxel_size=' + str(self.voxel_size) 66 | tmpstr += ', point_cloud_range=' + str(self.point_cloud_range) 67 | tmpstr += ', max_num_points=' + str(self.max_num_points) 68 | tmpstr += ', max_voxels=' + str(self.max_voxels) 69 | tmpstr += ', num_point_features=' + str(self.num_point_features) 70 | tmpstr += ')' 71 | return tmpstr 72 | 
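A short usage sketch of `SPConvVoxelization` follows; the voxel size, range and caps are illustrative assumptions (the 0.075 m value only mirrors the `fstr_voxel0075_*` config naming), not values taken from the shipped configs:

```python
# Illustrative only: parameter values below are assumptions, not the shipped config values.
import torch
from projects.mmdet3d_plugin import SPConvVoxelization

voxelizer = SPConvVoxelization(
    voxel_size=[0.075, 0.075, 0.2],
    point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
    max_num_points=10,
    max_voxels=(120000, 160000),  # (train, test) caps; train()/eval() rebuild the generator accordingly
    num_point_features=5,
)

points = torch.rand(20000, 5, device='cuda')   # e.g. x, y, z, intensity, sweep time offset
voxels, coors, num_points = voxelizer(points)  # per-voxel point features, voxel coordinates, point counts
```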
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Fully Sparse Transformer 3D Detector for LiDAR Point Cloud
 3 | 
 4 | [Paper](https://ieeexplore.ieee.org/document/10302363), [nuScenes LeaderBoard](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Lidar)
 5 | 
 6 | 
 7 | 
 8 | All statistics are measured on a single Tesla A100 GPU using the best models from the official repositories. Some sparse modules in the model are supported.
 9 | 
10 | 
11 | FSTR is a fully sparse LiDAR-based detector that achieves a better accuracy-efficiency trade-off compared with other popular LiDAR-based detectors. A lightweight DETR-like framework with a single decoder layer is designed for LiDAR-only detection, which obtains **73.6%** NDS (**FSTR-XLarge with TTA**) on the nuScenes benchmark and **31.5%** CDS (**FSTR-Large**) on the Argoverse2 validation set.
12 | 
13 | ## Currently Supported Features
14 | - [x] Support nuScenes dataset
15 | - [ ] Support Argoverse2 dataset
16 | ## Preparation
17 | 
18 | * Environments
19 | Python == 3.8 \
20 | CUDA == 11.1 \
21 | pytorch == 1.9.0 \
22 | mmcv-full == 1.6.0 \
23 | mmdet == 2.24.0 \
24 | mmsegmentation == 0.29.1 \
25 | mmdet3d == 1.0.0rc5 \
26 | [flash-attn](https://github.com/HazyResearch/flash-attention) == 0.2.2 \
27 | [Spconv-plus](https://github.com/dvlab-research/spconv-plus) == 2.1.21
28 | 
29 | * Data
30 | Follow the [mmdet3d data preparation guide](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/en/data_preparation.md) to process the nuScenes dataset.
31 | 
32 | ## Train & inference
33 | ```bash
34 | # train
35 | bash tools/dist_train.sh /path_to_your_config 8
36 | # inference
37 | bash tools/dist_test.sh /path_to_your_config /path_to_your_pth 8 --eval bbox
38 | ```
39 | ## Main Results
40 | Results on nuScenes **val set**. The default batch size is 2 on each GPU. FPS is evaluated on a single Tesla A100 GPU. (15e + 5e means the last 5 epochs should be trained without [GT sampling](https://github.com/Poley97/FSTR/blob/master/projects/configs/lidar/fstr_voxel0075_cbgs_20e.py.py#L33-L69).)
41 | 
42 | | Config | mAP | NDS | Schedule | Inference FPS |
43 | |:--------:|:----------:|:---------:|:--------:|:--------:|
44 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) | 64.2% | 69.1% | 15e+5e | 15.4 |
45 | | [FSTR-Large](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) | 65.5% | 70.3% | 15e+5e | 9.5 |
46 | 
47 | 
48 | Results on nuScenes **test set**. To reproduce our result, replace `ann_file=data_root + '/nuscenes_infos_train.pkl'` in the [training config](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) with `ann_file=[data_root + '/nuscenes_infos_train.pkl', data_root + '/nuscenes_infos_val.pkl']` (a minimal config snippet is given at the end of this README):
49 | 
50 | | Config | mAP | NDS | Schedule | Inference FPS |
51 | |:--------:|:----------:|:---------:|:--------:|:--------:|
52 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) | 66.2% | 70.4% | 15e+5e | 15.4 |
53 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) + TTA | 67.6% | 71.5% | 15e+5e | - |
54 | | [FSTR-Large](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) + TTA | 69.5% | 73.0% | 15e+5e | - |
55 | | [FSTR-XLarge](./projects/configs/lidar/fstr_xlarge_voxel0050_cbgs_20e.py) + TTA | 70.2% | 73.5% | 15e+5e | - |
56 | 
57 | ## Citation
58 | If you find our FSTR helpful in your research, please consider citing:
59 | ```bibtex
60 | @article{zhang2023fully,
61 |   title={Fully Sparse Transformer 3D Detector for LiDAR Point Cloud},
62 |   author={Zhang, Diankun and Zheng, Zhijie and Niu, Haoyu and Wang, Xueqing and Liu, Xiaojun},
63 |   journal={IEEE Transactions on Geoscience and Remote Sensing},
64 |   year={2023},
65 |   publisher={IEEE}
66 | }
67 | ```
68 | 
69 | ## Contact
70 | If you have any questions, feel free to open an issue or contact us at zhangdiankun19@mails.ucas.edu.cn or tanfeiyang@megvii.com.
71 | 
72 | ## Acknowledgement
73 | Parts of our code refer to the recent work [CMT](https://github.com/junjie18/CMT).
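The config change referenced in the test-set note above amounts to the following fragment. This is a sketch only: the `data_root` value and the nesting assume the common CBGSDataset-wrapped mmdet3d layout and are not copied verbatim from the shipped configs.

```python
# Sketch of the ann_file change for test-set training; the nesting below is an assumption
# (typical CBGS-wrapped dataset config), not an excerpt from the shipped configs.
data_root = 'data/nuscenes'  # assumed dataset path

data = dict(
    train=dict(
        dataset=dict(
            ann_file=[
                data_root + '/nuscenes_infos_train.pkl',
                data_root + '/nuscenes_infos_val.pkl',
            ],
        ),
    ),
)
```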
74 | 
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/datasets/custom_nuscenes_dataset.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------
 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved.
 3 | # ------------------------------------------------------------------------
 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d)
 5 | # Copyright (c) 2021 Wang, Yue
 6 | # ------------------------------------------------------------------------
 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
 8 | # Copyright (c) OpenMMLab. All rights reserved.
 9 | # ------------------------------------------------------------------------
10 | 
11 | import numpy as np
12 | from mmdet.datasets import DATASETS
13 | from mmdet3d.datasets import NuScenesDataset
14 | 
15 | 
16 | @DATASETS.register_module()
17 | class CustomNuScenesDataset(NuScenesDataset):
18 |     r"""NuScenes Dataset.
19 | 
20 |     This dataset only adds camera intrinsics and extrinsics to the results.
21 |     """
22 | 
23 |     def __init__(self, *args, return_gt_info=False, **kwargs):
24 |         super(CustomNuScenesDataset, self).__init__(*args, **kwargs)
25 |         self.return_gt_info = return_gt_info
26 | 
27 |     def get_data_info(self, index):
28 |         """Get data info according to the given index.
29 | 
30 |         Args:
31 |             index (int): Index of the sample data to get.
32 | 
33 |         Returns:
34 |             dict: Data information that will be passed to the data \
35 |                 preprocessing pipelines. It includes the following keys:
36 | 
37 |                 - sample_idx (str): Sample index.
38 |                 - pts_filename (str): Filename of point clouds.
39 |                 - sweeps (list[dict]): Infos of sweeps.
40 |                 - timestamp (float): Sample timestamp.
41 |                 - img_filename (str, optional): Image filename.
42 |                 - lidar2img (list[np.ndarray], optional): Transformations \
43 |                     from lidar to different cameras.
44 |                 - ann_info (dict): Annotation info.
45 | """ 46 | info = self.data_infos[index] 47 | # standard protocal modified from SECOND.Pytorch 48 | input_dict = dict( 49 | sample_idx=info['token'], 50 | pts_filename=info['lidar_path'], 51 | sweeps=info['sweeps'], 52 | timestamp=info['timestamp'] / 1e6, 53 | img_sweeps=None if 'img_sweeps' not in info else info['img_sweeps'], 54 | radar_info=None if 'radars' not in info else info['radars'] 55 | ) 56 | 57 | if self.return_gt_info: 58 | input_dict['info'] = info 59 | 60 | if self.modality['use_camera']: 61 | image_paths = [] 62 | lidar2img_rts = [] 63 | lidar2cam_rts = [] 64 | cam_intrinsics = [] 65 | img_timestamp = [] 66 | for cam_type, cam_info in info['cams'].items(): 67 | img_timestamp.append(cam_info['timestamp'] / 1e6) 68 | image_paths.append(cam_info['data_path']) 69 | # obtain lidar to image transformation matrix 70 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 71 | lidar2cam_t = cam_info[ 72 | 'sensor2lidar_translation'] @ lidar2cam_r.T 73 | lidar2cam_rt = np.eye(4) 74 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 75 | lidar2cam_rt[3, :3] = -lidar2cam_t 76 | intrinsic = cam_info['cam_intrinsic'] 77 | viewpad = np.eye(4) 78 | viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic 79 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 80 | lidar2img_rts.append(lidar2img_rt) 81 | 82 | cam_intrinsics.append(viewpad) 83 | lidar2cam_rts.append(lidar2cam_rt.T) 84 | 85 | input_dict.update( 86 | dict( 87 | img_timestamp=img_timestamp, 88 | img_filename=image_paths, 89 | lidar2img=lidar2img_rts, 90 | cam_intrinsic=cam_intrinsics, 91 | lidar2cam=lidar2cam_rts, 92 | )) 93 | 94 | if not self.test_mode: 95 | annos = self.get_ann_info(index) 96 | input_dict['ann_info'] = annos 97 | 98 | return input_dict 99 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/multi_task_bbox_coder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) 5 | # Copyright (c) OpenMMLab. All rights reserved. 6 | # ------------------------------------------------------------------------ 7 | 8 | import torch 9 | 10 | from mmdet.core.bbox import BaseBBoxCoder 11 | from mmdet.core.bbox.builder import BBOX_CODERS 12 | from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox 13 | 14 | 15 | @BBOX_CODERS.register_module() 16 | class MultiTaskBBoxCoder(BaseBBoxCoder): 17 | """Bbox coder for NMS-free detector. 18 | Args: 19 | pc_range (list[float]): Range of point cloud. 20 | post_center_range (list[float]): Limit of the center. 21 | Default: None. 22 | max_num (int): Max number to be kept. Default: 100. 23 | score_threshold (float): Threshold to filter boxes based on score. 24 | Default: None. 25 | code_size (int): Code size of bboxes. 
Default: 9 26 | """ 27 | 28 | def __init__(self, 29 | pc_range, 30 | voxel_size=None, 31 | post_center_range=None, 32 | max_num=100, 33 | score_threshold=None, 34 | num_classes=10): 35 | 36 | self.pc_range = pc_range 37 | self.voxel_size = voxel_size 38 | self.post_center_range = post_center_range 39 | self.max_num = max_num 40 | self.score_threshold = score_threshold 41 | self.num_classes = num_classes 42 | 43 | def encode(self): 44 | pass 45 | 46 | def decode_single(self, cls_scores, bbox_preds, task_ids): 47 | """Decode bboxes. 48 | Args: 49 | cls_scores (Tensor): Outputs from the classification head, \ 50 | shape [num_query, cls_out_channels]. Note \ 51 | cls_out_channels should includes background. 52 | bbox_preds (Tensor): Outputs from the regression \ 53 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 54 | Shape [num_query, 9]. 55 | Returns: 56 | list[dict]: Decoded boxes. 57 | """ 58 | max_num = self.max_num 59 | num_query = cls_scores.shape[0] 60 | 61 | cls_scores = cls_scores.sigmoid() 62 | scores, indexs = cls_scores.view(-1).topk(max_num) 63 | labels = indexs % self.num_classes 64 | bbox_index = indexs // self.num_classes 65 | task_index = torch.gather(task_ids, 1, labels.unsqueeze(1)).squeeze() 66 | 67 | bbox_preds = bbox_preds[task_index * num_query + bbox_index] 68 | 69 | final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) 70 | final_scores = scores 71 | final_preds = labels 72 | 73 | # use score threshold 74 | if self.score_threshold is not None: 75 | thresh_mask = final_scores > self.score_threshold 76 | if self.post_center_range is not None: 77 | self.post_center_range = torch.tensor( 78 | self.post_center_range, device=scores.device) 79 | mask = (final_box_preds[..., :3] >= 80 | self.post_center_range[:3]).all(1) 81 | mask &= (final_box_preds[..., :3] <= 82 | self.post_center_range[3:]).all(1) 83 | 84 | if self.score_threshold: 85 | mask &= thresh_mask 86 | 87 | boxes3d = final_box_preds[mask] 88 | scores = final_scores[mask] 89 | labels = final_preds[mask] 90 | predictions_dict = { 91 | 'bboxes': boxes3d, 92 | 'scores': scores, 93 | 'labels': labels 94 | } 95 | 96 | else: 97 | raise NotImplementedError( 98 | 'Need to reorganize output as a batch, only ' 99 | 'support post_center_range is not None for now!') 100 | return predictions_dict 101 | 102 | def decode(self, preds_dicts): 103 | """Decode bboxes. 104 | Args: 105 | all_cls_scores (Tensor): Outputs from the classification head, \ 106 | shape [nb_dec, bs, num_query, cls_out_channels]. Note \ 107 | cls_out_channels should includes background. 108 | all_bbox_preds (Tensor): Sigmoid outputs from the regression \ 109 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 110 | Shape [nb_dec, bs, num_query, 9]. 111 | Returns: 112 | list[dict]: Decoded boxes. 
113 | """ 114 | task_num = len(preds_dicts) 115 | 116 | pred_bbox_list, pred_logits_list, task_ids_list = [], [], [] 117 | for task_id in range(task_num): 118 | task_pred_dict = preds_dicts[task_id][0] 119 | task_pred_bbox = torch.cat( 120 | (task_pred_dict['center'][-1], task_pred_dict['height'][-1], 121 | task_pred_dict['dim'][-1], task_pred_dict['rot'][-1], 122 | task_pred_dict['vel'][-1]), 123 | dim=-1 124 | ) 125 | task_pred_logits = task_pred_dict['cls_logits'][-1] 126 | pred_bbox_list.append(task_pred_bbox) 127 | pred_logits_list.append(task_pred_logits) 128 | 129 | task_ids = task_pred_logits.new_ones(task_pred_logits.shape).int() * task_id 130 | task_ids_list.append(task_ids) 131 | 132 | 133 | all_pred_logits = torch.cat(pred_logits_list, dim=-1) # bs * nq * 10 134 | all_pred_bbox = torch.cat(pred_bbox_list, dim=1) # bs * (task nq) * 10 135 | all_task_ids = torch.cat(task_ids_list, dim=-1) # bs * nq * 10 136 | 137 | batch_size = all_pred_logits.shape[0] 138 | predictions_list = [] 139 | for i in range(batch_size): 140 | predictions_list.append( 141 | self.decode_single(all_pred_logits[i], all_pred_bbox[i], all_task_ids[i])) 142 | return predictions_list -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn.init import ( 7 | xavier_uniform_, 8 | constant_, 9 | xavier_normal_ 10 | ) 11 | from torch.nn.functional import linear 12 | 13 | from einops import rearrange 14 | from mmcv.runner import auto_fp16 15 | from mmcv.runner.base_module import BaseModule 16 | 17 | from flash_attn.flash_attn_interface import flash_attn_unpadded_kvpacked_func 18 | from flash_attn.bert_padding import unpad_input, pad_input, index_first_axis 19 | 20 | 21 | def _in_projection_packed(q, k, v, w, b = None): 22 | w_q, w_k, w_v = w.chunk(3) 23 | if b is None: 24 | b_q = b_k = b_v = None 25 | else: 26 | b_q, b_k, b_v = b.chunk(3) 27 | return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v) 28 | 29 | 30 | class FlashAttention(nn.Module): 31 | """Implement the scaled dot product attention with softmax. 32 | Arguments 33 | --------- 34 | softmax_scale: The temperature to use for the softmax attention. 35 | (default: 1/sqrt(d_keys) where d_keys is computed at 36 | runtime) 37 | attention_dropout: The dropout rate to apply to the attention 38 | (default: 0.1) 39 | """ 40 | def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): 41 | super().__init__() 42 | self.softmax_scale = softmax_scale 43 | self.dropout_p = attention_dropout 44 | self.fp16_enabled = True 45 | 46 | @auto_fp16(apply_to=('q', 'kv'), out_fp32=True) 47 | def forward(self, q, kv, 48 | causal=False, 49 | key_padding_mask=None): 50 | """Implements the multihead softmax attention. 51 | Arguments 52 | --------- 53 | q: The tensor containing the query. (B, T, H, D) 54 | kv: The tensor containing the key, and value. 
(B, S, 2, H, D) 55 | key_padding_mask: a bool tensor of shape (B, S) 56 | """ 57 | assert q.dtype in [torch.float16, torch.bfloat16] and kv.dtype in [torch.float16, torch.bfloat16] 58 | assert q.is_cuda and kv.is_cuda 59 | assert q.shape[0] == kv.shape[0] and q.shape[-2] == kv.shape[-2] and q.shape[-1] == kv.shape[-1] 60 | 61 | batch_size = q.shape[0] 62 | seqlen_q, seqlen_k = q.shape[1], kv.shape[1] 63 | if key_padding_mask is None: 64 | q, kv = rearrange(q, 'b s ... -> (b s) ...'), rearrange(kv, 'b s ... -> (b s) ...') 65 | max_sq, max_sk = seqlen_q, seqlen_k 66 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, 67 | device=q.device) 68 | cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, 69 | device=kv.device) 70 | output = flash_attn_unpadded_kvpacked_func( 71 | q, kv, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, 72 | self.dropout_p if self.training else 0.0, 73 | softmax_scale=self.softmax_scale, causal=causal 74 | ) 75 | output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) 76 | else: 77 | nheads = kv.shape[-2] 78 | q = rearrange(q, 'b s ... -> (b s) ...') 79 | max_sq = seqlen_q 80 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, 81 | device=q.device) 82 | x = rearrange(kv, 'b s two h d -> b s (two h d)') 83 | x_unpad, indices, cu_seqlens_k, max_sk = unpad_input(x, key_padding_mask) 84 | x_unpad = rearrange(x_unpad, 'nnz (two h d) -> nnz two h d', two=2, h=nheads) 85 | output_unpad = flash_attn_unpadded_kvpacked_func( 86 | q, x_unpad, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, 87 | self.dropout_p if self.training else 0.0, 88 | softmax_scale=self.softmax_scale, causal=causal 89 | ) 90 | output = rearrange(output_unpad, '(b s) ... -> b s ...', b=batch_size) 91 | 92 | return output, None 93 | 94 | 95 | class FlashMHA(nn.Module): 96 | 97 | def __init__(self, embed_dim, num_heads, bias=True, batch_first=True, attention_dropout=0.0, 98 | causal=False, device=None, dtype=None, **kwargs) -> None: 99 | assert batch_first 100 | factory_kwargs = {'device': device, 'dtype': dtype} 101 | super().__init__() 102 | self.embed_dim = embed_dim 103 | self.causal = causal 104 | self.bias = bias 105 | 106 | self.num_heads = num_heads 107 | assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" 108 | self.head_dim = self.embed_dim // num_heads 109 | assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" 110 | 111 | self.in_proj_weight = nn.Parameter(torch.empty((3 * embed_dim, embed_dim))) 112 | if bias: 113 | self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim)) 114 | else: 115 | self.register_parameter('in_proj_bias', None) 116 | self.inner_attn = FlashAttention(attention_dropout=attention_dropout, **factory_kwargs) 117 | self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) 118 | self._reset_parameters() 119 | 120 | def _reset_parameters(self) -> None: 121 | xavier_uniform_(self.in_proj_weight) 122 | if self.in_proj_bias is not None: 123 | constant_(self.in_proj_bias, 0.) 124 | constant_(self.out_proj.bias, 0.) 
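    # Usage sketch (illustrative assumption, not taken from the original source):
    # FlashMHA consumes (batch, seqlen, embed_dim) CUDA tensors; the inner FlashAttention
    # casts q/kv to fp16 via auto_fp16 and returns fp32, so the surrounding module can
    # stay in fp32.
    #     mha = FlashMHA(embed_dim=256, num_heads=8).cuda()
    #     out, _ = mha(q, k, v)   # q: (B, Tq, 256); k, v: (B, Tk, 256)
    # Constraint from __init__: head_dim = embed_dim // num_heads must be a multiple
    # of 8 and at most 128.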
125 | 126 | def forward(self, q, k, v, key_padding_mask=None): 127 | """x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) 128 | key_padding_mask: bool tensor of shape (batch, seqlen) 129 | """ 130 | # q, k, v = self.Wq(q), self.Wk(k), self.Wv(v) 131 | q, k, v = _in_projection_packed(q, k, v, self.in_proj_weight, self.in_proj_bias) 132 | q = rearrange(q, 'b s (h d) -> b s h d', h=self.num_heads) 133 | k = rearrange(k, 'b s (h d) -> b s h d', h=self.num_heads) 134 | v = rearrange(v, 'b s (h d) -> b s h d', h=self.num_heads) 135 | kv = torch.stack([k, v], dim=2) 136 | 137 | context, attn_weights = self.inner_attn(q, kv, key_padding_mask=key_padding_mask, causal=self.causal) 138 | return self.out_proj(rearrange(context, 'b s h d -> b s (h d)')), attn_weights 139 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import torch 12 | 13 | from mmdet.core.bbox.builder import BBOX_ASSIGNERS 14 | from mmdet.core.bbox.assigners import AssignResult 15 | from mmdet.core.bbox.assigners import BaseAssigner 16 | from mmdet.core.bbox.match_costs import build_match_cost 17 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 18 | from mmdet.core.bbox.iou_calculators import build_iou_calculator 19 | from mmdet.models.utils.transformer import inverse_sigmoid 20 | from scipy.optimize import linear_sum_assignment 21 | 22 | from projects.mmdet3d_plugin.core.bbox.util import ( 23 | normalize_bbox, 24 | denormalize_bbox 25 | ) 26 | 27 | 28 | @BBOX_ASSIGNERS.register_module() 29 | class HungarianAssigner3D(BaseAssigner): 30 | """Computes one-to-one matching between predictions and ground truth. 31 | This class computes an assignment between the targets and the predictions 32 | based on the costs. The costs are weighted sum of three components: 33 | classification cost, regression L1 cost and regression iou cost. The 34 | targets don't include the no_object, so generally there are more 35 | predictions than targets. After the one-to-one matching, the un-matched 36 | are treated as backgrounds. Thus each query prediction will be assigned 37 | with `0` or a positive integer indicating the ground truth index: 38 | - 0: negative sample, no assigned gt 39 | - positive integer: positive sample, index (1-based) of assigned gt 40 | Args: 41 | cls_weight (int | float, optional): The scale factor for classification 42 | cost. Default 1.0. 43 | bbox_weight (int | float, optional): The scale factor for regression 44 | L1 cost. Default 1.0. 45 | iou_weight (int | float, optional): The scale factor for regression 46 | iou cost. Default 1.0. 47 | iou_calculator (dict | optional): The config for the iou calculation. 48 | Default type `BboxOverlaps2D`. 
49 | iou_mode (str | optional): "iou" (intersection over union), "iof" 50 | (intersection over foreground), or "giou" (generalized 51 | intersection over union). Default "giou". 52 | """ 53 | 54 | def __init__(self, 55 | cls_cost=dict(type='ClassificationCost', weight=1.), 56 | reg_cost=dict(type='BBoxL1Cost', weight=1.0), 57 | iou_cost=dict(type='IoUCost', weight=0.0), 58 | pc_range=None, 59 | code_weights=None): 60 | self.cls_cost = build_match_cost(cls_cost) 61 | self.reg_cost = build_match_cost(reg_cost) 62 | self.iou_cost = build_match_cost(iou_cost) 63 | self.pc_range = pc_range 64 | self.code_weights = code_weights 65 | if self.code_weights: 66 | self.code_weights = torch.tensor(self.code_weights)[None, :].cuda() 67 | 68 | def assign(self, 69 | bbox_pred, 70 | cls_pred, 71 | gt_bboxes, 72 | gt_labels, 73 | gt_bboxes_ignore=None, 74 | eps=1e-7, 75 | code_weights=None): 76 | """Computes one-to-one matching based on the weighted costs. 77 | This method assign each query prediction to a ground truth or 78 | background. The `assigned_gt_inds` with -1 means don't care, 79 | 0 means negative sample, and positive number is the index (1-based) 80 | of assigned gt. 81 | The assignment is done in the following steps, the order matters. 82 | 1. assign every prediction to -1 83 | 2. compute the weighted costs 84 | 3. do Hungarian matching on CPU based on the costs 85 | 4. assign all to 0 (background) first, then for each matched pair 86 | between predictions and gts, treat this prediction as foreground 87 | and assign the corresponding gt index (plus 1) to it. 88 | Args: 89 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 90 | (cx, cy, w, h), which are all in range [0, 1]. Shape 91 | [num_query, 4]. 92 | cls_pred (Tensor): Predicted classification logits, shape 93 | [num_query, num_class]. 94 | gt_bboxes (Tensor): Ground truth boxes with unnormalized 95 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 96 | gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 97 | gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are 98 | labelled as `ignored`. Default None. 99 | eps (int | float, optional): A value added to the denominator for 100 | numerical stability. Default 1e-7. 101 | Returns: 102 | :obj:`AssignResult`: The assigned result. 103 | """ 104 | assert gt_bboxes_ignore is None, \ 105 | 'Only case when gt_bboxes_ignore is None is supported.' 106 | num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) 107 | 108 | # 1. assign -1 by default 109 | assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), 110 | -1, 111 | dtype=torch.long) 112 | assigned_labels = bbox_pred.new_full((num_bboxes, ), 113 | -1, 114 | dtype=torch.long) 115 | if num_gts == 0 or num_bboxes == 0: 116 | # No ground truth or boxes, return empty assignment 117 | if num_gts == 0: 118 | # No ground truth, assign all to background 119 | assigned_gt_inds[:] = 0 120 | return AssignResult( 121 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 122 | 123 | # 2. compute the weighted costs 124 | # classification and bboxcost. 
125 | cls_cost = self.cls_cost(cls_pred, gt_labels) 126 | # regression L1 cost 127 | normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) 128 | 129 | if self.code_weights is not None: 130 | bbox_pred = bbox_pred * self.code_weights 131 | normalized_gt_bboxes = normalized_gt_bboxes * self.code_weights 132 | 133 | reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) 134 | 135 | # weighted sum of above two costs 136 | cost = cls_cost + reg_cost 137 | 138 | # 3. do Hungarian matching on CPU using linear_sum_assignment 139 | cost = cost.detach().cpu() 140 | if linear_sum_assignment is None: 141 | raise ImportError('Please run "pip install scipy" ' 142 | 'to install scipy first.') 143 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost) 144 | matched_row_inds = torch.from_numpy(matched_row_inds).to( 145 | bbox_pred.device) 146 | matched_col_inds = torch.from_numpy(matched_col_inds).to( 147 | bbox_pred.device) 148 | 149 | # 4. assign backgrounds and foregrounds 150 | # assign all indices to backgrounds first 151 | assigned_gt_inds[:] = 0 152 | # assign foregrounds based on matching results 153 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 154 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 155 | return AssignResult( 156 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 157 | 158 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/fstr.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 5 | # Copyright (c) OpenMMLab. All rights reserved. 
6 | # ------------------------------------------------------------------------ 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | from mmcv.runner import force_fp32 12 | from mmdet.models import DETECTORS 13 | from mmdet3d.core import bbox3d2result 14 | from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector 15 | 16 | from projects.mmdet3d_plugin import SPConvVoxelization 17 | 18 | @DETECTORS.register_module() 19 | class FSTRDetector(MVXTwoStageDetector): 20 | 21 | def __init__(self, 22 | **kwargs): 23 | pts_voxel_cfg = kwargs.get('pts_voxel_layer', None) 24 | kwargs['pts_voxel_layer'] = None 25 | super(FSTRDetector, self).__init__(**kwargs) 26 | if pts_voxel_cfg: 27 | self.pts_voxel_layer = SPConvVoxelization(**pts_voxel_cfg) 28 | 29 | def init_weights(self): 30 | """Initialize model weights.""" 31 | super(FSTRDetector, self).init_weights() 32 | 33 | def extract_feat(self, points, img_metas): 34 | """Extract features from images and points.""" 35 | pts_feats = self.extract_pts_feat(points, img_metas) 36 | return pts_feats 37 | 38 | @force_fp32(apply_to=('pts')) 39 | def extract_pts_feat(self, pts, img_metas): 40 | """Extract features of points.""" 41 | if not self.with_pts_bbox: 42 | return None 43 | if pts is None: 44 | return None 45 | voxels, num_points, coors = self.voxelize(pts) 46 | voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, 47 | ) 48 | batch_size = coors[-1, 0] + 1 49 | x = self.pts_middle_encoder(voxel_features, coors, batch_size) 50 | return x 51 | 52 | @torch.no_grad() 53 | @force_fp32() 54 | def voxelize(self, points): 55 | """Apply dynamic voxelization to points. 56 | 57 | Args: 58 | points (list[torch.Tensor]): Points of each sample. 59 | 60 | Returns: 61 | tuple[torch.Tensor]: Concatenated points, number of points 62 | per voxel, and coordinates. 63 | """ 64 | voxels, coors, num_points = [], [], [] 65 | for res in points: 66 | res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) 67 | voxels.append(res_voxels) 68 | coors.append(res_coors) 69 | num_points.append(res_num_points) 70 | voxels = torch.cat(voxels, dim=0) 71 | num_points = torch.cat(num_points, dim=0) 72 | coors_batch = [] 73 | for i, coor in enumerate(coors): 74 | coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) 75 | coors_batch.append(coor_pad) 76 | coors_batch = torch.cat(coors_batch, dim=0) 77 | return voxels, num_points, coors_batch 78 | 79 | def forward_train(self, 80 | points=None, 81 | img_metas=None, 82 | gt_bboxes_3d=None, 83 | gt_labels_3d=None, 84 | gt_labels=None, 85 | gt_bboxes=None, 86 | proposals=None, 87 | gt_bboxes_ignore=None, 88 | **kwargs): 89 | """Forward training function. 90 | 91 | Args: 92 | points (list[torch.Tensor], optional): Points of each sample. 93 | Defaults to None. 94 | img_metas (list[dict], optional): Meta information of each sample. 95 | Defaults to None. 96 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): 97 | Ground truth 3D boxes. Defaults to None. 98 | gt_labels_3d (list[torch.Tensor], optional): Ground truth labels 99 | of 3D boxes. Defaults to None. 100 | gt_labels (list[torch.Tensor], optional): Ground truth labels 101 | of 2D boxes in images. Defaults to None. 102 | gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in 103 | images. Defaults to None. 104 | img (torch.Tensor optional): Images of each sample with shape 105 | (N, C, H, W). Defaults to None. 106 | proposals ([list[torch.Tensor], optional): Predicted proposals 107 | used for training Fast RCNN. 
Defaults to None. 108 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 109 | 2D boxes in images to be ignored. Defaults to None. 110 | 111 | Returns: 112 | dict: Losses of different branches. 113 | """ 114 | # nvtx.range_push('forward') 115 | # nvtx.range_push('voxel_backbone') 116 | pts_feats = self.extract_feat( 117 | points=points, img_metas=img_metas) 118 | # nvtx.range_pop() 119 | # nvtx.range_push('fstr_head') 120 | losses = dict() 121 | if pts_feats : 122 | losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, 123 | gt_labels_3d, img_metas, 124 | gt_bboxes_ignore) 125 | losses.update(losses_pts) 126 | # nvtx.range_pop() 127 | # nvtx.range_pop() 128 | return losses 129 | 130 | @force_fp32(apply_to=('pts_feats')) 131 | def forward_pts_train(self, 132 | pts_feats, 133 | gt_bboxes_3d, 134 | gt_labels_3d, 135 | img_metas, 136 | gt_bboxes_ignore=None, 137 | ): 138 | """Forward function for point cloud branch. 139 | 140 | Args: 141 | pts_feats (list[torch.Tensor]): Features of point cloud branch 142 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth 143 | boxes for each sample. 144 | gt_labels_3d (list[torch.Tensor]): Ground truth labels for 145 | boxes of each sampole 146 | img_metas (list[dict]): Meta information of samples. 147 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 148 | boxes to be ignored. Defaults to None. 149 | 150 | Returns: 151 | dict: Losses of each branch. 152 | """ 153 | outs = self.pts_bbox_head(pts_feats, img_metas) 154 | loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] 155 | losses = self.pts_bbox_head.loss(*loss_inputs) 156 | return losses 157 | 158 | def forward_test(self, 159 | points=None, 160 | img_metas=None, 161 | **kwargs): 162 | """ 163 | Args: 164 | points (list[torch.Tensor]): the outer list indicates test-time 165 | augmentations and inner torch.Tensor should have a shape NxC, 166 | which contains all points in the batch. 167 | img_metas (list[list[dict]]): the outer list indicates test-time 168 | augs (multiscale, flip, etc.) and the inner list indicates 169 | images in a batch 170 | img (list[torch.Tensor], optional): the outer 171 | list indicates test-time augmentations and inner 172 | torch.Tensor should have a shape NxCxHxW, which contains 173 | all images in the batch. Defaults to None. 
174 | """ 175 | if points is None: 176 | points = [None] 177 | for var, name in [(points, 'points'), (img_metas, 'img_metas')]: 178 | if not isinstance(var, list): 179 | raise TypeError('{} must be a list, but got {}'.format( 180 | name, type(var))) 181 | 182 | num_augs = len(points) 183 | if num_augs != len(img_metas): 184 | raise ValueError( 185 | 'num of augmentations ({}) != num of image meta ({})'.format( 186 | len(points), len(img_metas))) 187 | 188 | if num_augs == 1: 189 | return self.simple_test(points[0], img_metas[0],**kwargs) 190 | else: 191 | return self.aug_test(points, img_metas, **kwargs) 192 | 193 | @force_fp32(apply_to=('x')) 194 | def simple_test_pts(self, x, img_metas, rescale=False): 195 | """Test function of point cloud branch.""" 196 | outs = self.pts_bbox_head(x, img_metas) 197 | bbox_list = self.pts_bbox_head.get_bboxes( 198 | outs, img_metas, rescale=rescale) 199 | bbox_results = [ 200 | bbox3d2result(bboxes, scores, labels) 201 | for bboxes, scores, labels in bbox_list 202 | ] 203 | return bbox_results 204 | 205 | def simple_test(self, points, img_metas, rescale=False): 206 | """Test function without augmentaiton.""" 207 | 208 | pts_feats = self.extract_feat( 209 | points, img_metas=img_metas) 210 | bbox_list = [dict() for i in range(len(img_metas))] 211 | if self.with_pts_bbox: 212 | bbox_pts = self.simple_test_pts( 213 | pts_feats, img_metas, rescale=rescale) 214 | for result_dict, pts_bbox in zip(bbox_list, bbox_pts): 215 | result_dict['pts_bbox'] = pts_bbox 216 | return bbox_list 217 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/voxelnext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmcv.runner import auto_fp16 3 | from torch import nn as nn 4 | 5 | from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule 6 | # from mmdet3d.ops import spconv as spconv 7 | from mmdet3d.models.builder import MIDDLE_ENCODERS 8 | import torch 9 | from mmcv.cnn import build_conv_layer, build_norm_layer 10 | from torch import nn 11 | 12 | # from mmdet3d.ops import spconv 13 | import spconv.pytorch as spconv 14 | from mmdet.models.backbones.resnet import BasicBlock, Bottleneck 15 | 16 | 17 | @MIDDLE_ENCODERS.register_module() 18 | class VoxelNextEncoder(nn.Module): 19 | r"""Sparse encoder for SECOND and Part-A2. 20 | 21 | Args: 22 | in_channels (int): The number of input channels. 23 | sparse_shape (list[int]): The sparse shape of input tensor. 24 | order (list[str]): Order of conv module. Defaults to ('conv', 25 | 'norm', 'act'). 26 | norm_cfg (dict): Config of normalization layer. Defaults to 27 | dict(type='BN1d', eps=1e-3, momentum=0.01). 28 | base_channels (int): Out channels for conv_input layer. 29 | Defaults to 16. 30 | output_channels (int): Out channels for conv_out layer. 31 | Defaults to 128. 32 | encoder_channels (tuple[tuple[int]]): 33 | Convolutional channels of each encode block. 34 | encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. 35 | Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). 36 | block_type (str): Type of the block to use. Defaults to 'conv_module'. 
37 | """ 38 | 39 | def __init__(self, 40 | in_channels, 41 | sparse_shape, 42 | order=('conv', 'norm', 'act'), 43 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 44 | base_channels=16, 45 | output_channels=128, 46 | encoder_channels=((16, 16), (32, 32, 32), (64, 64, 64), (128, 128, 128),(128, 128, 128),(128, 128, 128)), 47 | encoder_paddings=((1, 1 ), (1, 1, 1), (1, 1, 1), (1, 1, 1),(1, 1, 1),(1, 1, 1)), 48 | sparse_conv_kernel = (3, 3, 3, 3, 3), 49 | block_type='basicblock'): 50 | super().__init__() 51 | assert block_type in ['conv_module', 'basicblock'] 52 | self.sparse_shape = sparse_shape 53 | self.in_channels = in_channels 54 | self.order = order 55 | self.base_channels = base_channels 56 | self.output_channels = output_channels 57 | self.encoder_channels = encoder_channels 58 | self.encoder_paddings = encoder_paddings 59 | self.stage_num = len(self.encoder_channels) 60 | self.sparse_conv_kernel = sparse_conv_kernel 61 | self.fp16_enabled = False 62 | # Spconv init all weight on its own 63 | 64 | assert isinstance(order, tuple) and len(order) == 3 65 | assert set(order) == {'conv', 'norm', 'act'} 66 | 67 | if self.order[0] != 'conv': # pre activate 68 | self.conv_input = make_sparse_convmodule( 69 | in_channels, 70 | self.base_channels, 71 | 3, 72 | norm_cfg=norm_cfg, 73 | padding=1, 74 | indice_key='subm1', 75 | conv_type='SubMConv3d', 76 | order=('conv', )) 77 | else: # post activate 78 | self.conv_input = make_sparse_convmodule( 79 | in_channels, 80 | self.base_channels, 81 | 3, 82 | norm_cfg=norm_cfg, 83 | padding=1, 84 | indice_key='subm1', 85 | conv_type='SubMConv3d') 86 | 87 | encoder_out_channels = self.make_encoder_layers( 88 | make_sparse_convmodule, 89 | norm_cfg, 90 | self.base_channels, 91 | block_type=block_type) 92 | 93 | self.conv_out = make_sparse_convmodule( 94 | encoder_out_channels, 95 | self.output_channels, 96 | kernel_size=3, 97 | stride=1, 98 | norm_cfg=norm_cfg, 99 | padding=1, 100 | indice_key='spconv_down2', 101 | conv_type='SparseConv2d') 102 | 103 | self.shared_out = make_sparse_convmodule( 104 | self.output_channels, 105 | self.output_channels, 106 | kernel_size=3, 107 | stride=1, 108 | norm_cfg=norm_cfg, 109 | padding=1, 110 | indice_key='spconv_out', 111 | conv_type='SubMConv2d') 112 | 113 | @auto_fp16(apply_to=('voxel_features', )) 114 | def forward(self, voxel_features, coors, batch_size): 115 | """Forward of SparseEncoder. 116 | 117 | Args: 118 | voxel_features (torch.float32): Voxel features in shape (N, C). 119 | coors (torch.int32): Coordinates in shape (N, 4), \ 120 | the columns in the order of (batch_idx, z_idx, y_idx, x_idx). 121 | batch_size (int): Batch size. 122 | 123 | Returns: 124 | dict: Backbone features. 
125 | """ 126 | coors = coors.int() 127 | input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, 128 | self.sparse_shape, 129 | batch_size) 130 | x = self.conv_input(input_sp_tensor) 131 | 132 | encode_features = [] 133 | for encoder_layer in self.encoder_layers: 134 | x = encoder_layer(x) 135 | encode_features.append(x) 136 | 137 | encode_features[4].indices[:, 1:] *= 2 138 | encode_features[5].indices[:, 1:] *= 4 139 | encode_features[3] = encode_features[3].replace_feature(torch.cat([encode_features[3].features, encode_features[4].features, encode_features[5].features])) 140 | encode_features[3].indices = torch.cat([ encode_features[3].indices, encode_features[4].indices, encode_features[5].indices]) 141 | 142 | out = self.bev_out(encode_features[3]) 143 | out = self.conv_out(out) 144 | out = self.shared_out(out) 145 | 146 | 147 | return [out] 148 | 149 | def bev_out(self, x_conv): 150 | features_cat = x_conv.features 151 | indices_cat = x_conv.indices[:, [0, 2, 3]] 152 | spatial_shape = x_conv.spatial_shape[1:] 153 | 154 | indices_unique, _inv = torch.unique(indices_cat, dim=0, return_inverse=True) 155 | features_unique = features_cat.new_zeros((indices_unique.shape[0], features_cat.shape[1])) 156 | features_unique.index_add_(0, _inv, features_cat) 157 | 158 | x_out = spconv.SparseConvTensor( 159 | features=features_unique, 160 | indices=indices_unique, 161 | spatial_shape=spatial_shape, 162 | batch_size=x_conv.batch_size 163 | ) 164 | return x_out 165 | def make_encoder_layers(self, 166 | make_block, 167 | norm_cfg, 168 | in_channels, 169 | block_type='conv_module', 170 | conv_cfg=dict(type='SubMConv3d')): 171 | """make encoder layers using sparse convs. 172 | 173 | Args: 174 | make_block (method): A bounded function to build blocks. 175 | norm_cfg (dict[str]): Config of normalization layer. 176 | in_channels (int): The number of encoder input channels. 177 | block_type (str): Type of the block to use. Defaults to 178 | 'conv_module'. 179 | conv_cfg (dict): Config of conv layer. Defaults to 180 | dict(type='SubMConv3d'). 181 | 182 | Returns: 183 | int: The number of encoder output channels. 
184 | """ 185 | assert block_type in ['conv_module', 'basicblock'] 186 | self.encoder_layers = spconv.SparseSequential() 187 | 188 | for i, blocks in enumerate(self.encoder_channels): 189 | blocks_list = [] 190 | for j, out_channels in enumerate(tuple(blocks)): 191 | padding = tuple(self.encoder_paddings[i])[j] 192 | # each stage started with a spconv layer 193 | # except the first stage 194 | if i != 0 and j == 0 and block_type == 'conv_module': 195 | blocks_list.append( 196 | make_block( 197 | in_channels, 198 | out_channels, 199 | 3, 200 | norm_cfg=norm_cfg, 201 | stride=2, 202 | padding=padding, 203 | indice_key=f'spconv{i + 1}', 204 | conv_type='SparseConv3d')) 205 | elif block_type == 'basicblock': 206 | if j == 0 and len(blocks) > 2: 207 | blocks_list.append( 208 | make_block( 209 | in_channels, 210 | out_channels, 211 | self.sparse_conv_kernel[i - 1], 212 | norm_cfg=norm_cfg, 213 | stride=2, 214 | padding=int(self.sparse_conv_kernel[i - 1]//2), 215 | indice_key=f'spconv{i + 1}', 216 | conv_type='SparseConv3d')) 217 | else: 218 | blocks_list.append( 219 | SparseBasicBlock( 220 | out_channels, 221 | out_channels, 222 | norm_cfg=norm_cfg, 223 | conv_cfg=conv_cfg)) 224 | else: 225 | blocks_list.append( 226 | make_block( 227 | in_channels, 228 | out_channels, 229 | 3, 230 | norm_cfg=norm_cfg, 231 | padding=padding, 232 | indice_key=f'subm{i + 1}', 233 | conv_type='SubMConv3d')) 234 | in_channels = out_channels 235 | stage_name = f'encoder_layer{i + 1}' 236 | stage_layers = spconv.SparseSequential(*blocks_list) 237 | self.encoder_layers.add_module(stage_name, stage_layers) 238 | return out_channels -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CMT 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 
36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | Copyright (c) 2023 Megvii Inc. All rights reserved. 181 | 182 | Licensed under the Apache License, Version 2.0 (the "License"); 183 | you may not use this file except in compliance with the License. 184 | You may obtain a copy of the License at 185 | 186 | http://www.apache.org/licenses/LICENSE-2.0 187 | 188 | Unless required by applicable law or agreed to in writing, software 189 | distributed under the License is distributed on an "AS IS" BASIS, 190 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 191 | See the License for the specific language governing permissions and 192 | limitations under the License. 
193 | -------------------------------------------------------------------------------- /projects/configs/lidar/fstr_voxel0075_cbgs_20e.py: -------------------------------------------------------------------------------- 1 | plugin=True 2 | plugin_dir='projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.075, 0.075, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=20) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=False, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | train_pipeline = [ 21 | dict( 22 | type='LoadPointsFromFile', 23 | coord_type='LIDAR', 24 | load_dim=5, 25 | use_dim=[0, 1, 2, 3, 4], 26 | ), 27 | dict( 28 | type='LoadPointsFromMultiSweeps', 29 | sweeps_num=10, 30 | use_dim=[0, 1, 2, 3, 4], 31 | ), 32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 33 | dict( 34 | type='ObjectSample', 35 | db_sampler=dict( 36 | data_root='data/nuscenes/', 37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl', 38 | rate=1.0, 39 | prepare=dict( 40 | filter_by_difficulty=[-1], 41 | filter_by_min_points=dict( 42 | car=5, 43 | truck=5, 44 | bus=5, 45 | trailer=5, 46 | construction_vehicle=5, 47 | traffic_cone=5, 48 | barrier=5, 49 | motorcycle=5, 50 | bicycle=5, 51 | pedestrian=5)), 52 | classes=class_names, 53 | sample_groups=dict( 54 | car=2, 55 | truck=3, 56 | construction_vehicle=7, 57 | bus=4, 58 | trailer=6, 59 | barrier=2, 60 | motorcycle=6, 61 | bicycle=6, 62 | pedestrian=2, 63 | traffic_cone=2), 64 | points_loader=dict( 65 | type='LoadPointsFromFile', 66 | coord_type='LIDAR', 67 | load_dim=5, 68 | use_dim=[0, 1, 2, 3, 4], 69 | ))), 70 | dict( 71 | type='GlobalRotScaleTrans', 72 | rot_range=[-0.3925 * 2, 0.3925 * 2], 73 | scale_ratio_range=[0.9, 1.1], 74 | translation_std=[0.5, 0.5, 0.5]), 75 | dict( 76 | type='RandomFlip3D', 77 | sync_2d=False, 78 | flip_ratio_bev_horizontal=0.5, 79 | flip_ratio_bev_vertical=0.5), 80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 82 | dict(type='ObjectNameFilter', classes=class_names), 83 | dict(type='PointShuffle'), 84 | dict(type='DefaultFormatBundle3D', class_names=class_names), 85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'], 86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 87 | 'depth2img', 'cam2img', 'pad_shape', 88 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 92 | 'transformation_3d_flow', 'rot_degree', 93 | 'gt_bboxes_3d', 'gt_labels_3d')) 94 | ] 95 | test_pipeline = [ 96 | dict( 97 | type='LoadPointsFromFile', 98 | coord_type='LIDAR', 99 | load_dim=5, 100 | use_dim=[0, 1, 2, 3, 4], 101 | ), 102 | dict( 103 | type='LoadPointsFromMultiSweeps', 104 | sweeps_num=10, 105 | use_dim=[0, 1, 2, 3, 4], 106 | ), 107 | dict( 108 | type='MultiScaleFlipAug3D', 109 | img_scale=(1333, 800), 110 | pts_scale_ratio=1, 111 | flip=False, 112 | transforms=[ 113 | dict( 114 | type='GlobalRotScaleTrans', 115 | rot_range=[0, 0], 116 | scale_ratio_range=[1.0, 1.0], 117 | translation_std=[0, 0, 
0]), 118 | dict(type='RandomFlip3D'), 119 | dict( 120 | type='DefaultFormatBundle3D', 121 | class_names=class_names, 122 | with_label=False), 123 | dict(type='Collect3D', keys=['points']) 124 | ]) 125 | ] 126 | data = dict( 127 | samples_per_gpu=2, 128 | workers_per_gpu=4, 129 | train=dict( 130 | type='CBGSDataset', 131 | dataset=dict( 132 | type=dataset_type, 133 | data_root=data_root, 134 | ann_file=data_root + '/nuscenes_infos_train.pkl', 135 | load_interval=1, 136 | pipeline=train_pipeline, 137 | classes=class_names, 138 | modality=input_modality, 139 | test_mode=False, 140 | box_type_3d='LiDAR')), 141 | val=dict( 142 | type=dataset_type, 143 | data_root=data_root, 144 | ann_file=data_root + '/nuscenes_infos_val.pkl', 145 | load_interval=1, 146 | pipeline=test_pipeline, 147 | classes=class_names, 148 | modality=input_modality, 149 | test_mode=True, 150 | box_type_3d='LiDAR'), 151 | test=dict( 152 | type=dataset_type, 153 | data_root=data_root, 154 | ann_file=data_root + '/nuscenes_infos_val.pkl', 155 | load_interval=1, 156 | pipeline=test_pipeline, 157 | classes=class_names, 158 | modality=input_modality, 159 | test_mode=True, 160 | box_type_3d='LiDAR')) 161 | model = dict( 162 | type='FSTRDetector', 163 | pts_voxel_layer=dict( 164 | num_point_features=5, 165 | max_num_points=10, 166 | voxel_size=voxel_size, 167 | max_voxels=(120000, 160000), 168 | point_cloud_range=point_cloud_range), 169 | pts_voxel_encoder=dict( 170 | type='HardSimpleVFE', 171 | num_features=5, 172 | ), 173 | pts_middle_encoder=dict( 174 | type='VoxelNextEncoder', 175 | in_channels=5, 176 | sparse_shape=[41, 1440, 1440], 177 | base_channels=16, 178 | output_channels=128, 179 | order=('conv', 'norm', 'act'), 180 | block_type='basicblock'), 181 | 182 | pts_bbox_head=dict( 183 | type='FSTRHead', 184 | in_channels=128, 185 | hidden_dim=256, 186 | downsample_scale=8, 187 | num_query=500, 188 | num_init_query=200, 189 | init_dn_query = False, 190 | init_learnable_query = False, 191 | init_query_topk = 1, 192 | init_query_radius = 1, 193 | gauusian_dn_sampling=False, 194 | noise_mean = 0.5, 195 | noise_std = 0.125, 196 | max_sparse_token_per_sample = 10000, 197 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 198 | tasks=[ 199 | dict(num_class=10, class_names=[ 200 | 'car', 'truck', 'construction_vehicle', 201 | 'bus', 'trailer', 'barrier', 202 | 'motorcycle', 'bicycle', 203 | 'pedestrian', 'traffic_cone' 204 | ]), 205 | ], 206 | bbox_coder=dict( 207 | type='MultiTaskBBoxCoder', 208 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 209 | pc_range=point_cloud_range, 210 | max_num=300, 211 | voxel_size=voxel_size, 212 | num_classes=10), 213 | separate_head=dict( 214 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3), 215 | transformer=dict( 216 | type='FSTRTransformer', 217 | decoder=dict( 218 | type='PETRTransformerDecoder', 219 | return_intermediate=True, 220 | num_layers=1, 221 | transformerlayers=dict( 222 | type='PETRTransformerDecoderLayer', 223 | attn_cfgs=[ 224 | dict( 225 | type='MultiheadAttention', 226 | embed_dims=256, 227 | num_heads=8, 228 | dropout=0.1), 229 | dict( 230 | type='PETRMultiheadFlashAttention', 231 | embed_dims=256, 232 | num_heads=8, 233 | dropout=0.1), 234 | ], 235 | ffn_cfgs=dict( 236 | type='FFN', 237 | embed_dims=256, 238 | feedforward_channels=1024, 239 | num_fcs=2, 240 | ffn_drop=0., 241 | act_cfg=dict(type='ReLU', inplace=True), 242 | ), 243 | 244 | feedforward_channels=1024, #unused 245 | operation_order=('self_attn', 'norm', 
'cross_attn', 'norm', 246 | 'ffn', 'norm')), 247 | )), 248 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 249 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 250 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), 251 | ), 252 | train_cfg=dict( 253 | pts=dict( 254 | dataset='nuScenes', 255 | assigner=dict( 256 | type='HungarianAssigner3D', 257 | cls_cost=dict(type='FocalLossCost', weight=2.0), 258 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 259 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 260 | pc_range=point_cloud_range, 261 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 262 | ), 263 | pos_weight=-1, 264 | gaussian_overlap=0.1, 265 | min_radius=2, 266 | grid_size=[1440, 1440, 40], # [x_len, y_len, 1] 267 | voxel_size=voxel_size, 268 | out_size_factor=out_size_factor, 269 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 270 | point_cloud_range=point_cloud_range)), 271 | test_cfg=dict( 272 | pts=dict( 273 | dataset='nuScenes', 274 | grid_size=[1440, 1440, 40], 275 | out_size_factor=out_size_factor, 276 | pc_range=point_cloud_range[0:2], 277 | voxel_size=voxel_size[:2], 278 | nms_type=None, 279 | nms_thr=0.1, 280 | use_rotate_nms=True, 281 | max_num=300 282 | ))) 283 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu 284 | optimizer_config = dict( 285 | type='CustomFp16OptimizerHook', 286 | loss_scale=512., 287 | grad_clip=dict(max_norm=35, norm_type=2), 288 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 289 | lr_config = dict( 290 | policy='cyclic', 291 | target_ratio=(8, 0.0001), 292 | cyclic_times=1, 293 | step_ratio_up=0.4) 294 | momentum_config = dict( 295 | policy='cyclic', 296 | target_ratio=(0.8947368421052632, 1), 297 | cyclic_times=1, 298 | step_ratio_up=0.4) 299 | total_epochs = 20 300 | checkpoint_config = dict(interval=1) 301 | evaluation = dict(interval=5, pipeline=test_pipeline) 302 | log_config = dict( 303 | interval=50, 304 | hooks=[dict(type='TextLoggerHook'), 305 | dict(type='TensorboardLoggerHook')]) 306 | dist_params = dict(backend='nccl') 307 | log_level = 'INFO' 308 | work_dir = None 309 | load_from = None 310 | resume_from = None 311 | workflow = [('train', 1)] 312 | gpu_ids = range(0, 8) 313 | 314 | -------------------------------------------------------------------------------- /tools/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | import warnings 5 | 6 | import mmcv 7 | import torch 8 | from mmcv import Config, DictAction 9 | from mmcv.cnn import fuse_conv_bn 10 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 11 | from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, 12 | wrap_fp16_model) 13 | 14 | import mmdet 15 | from mmdet3d.apis import single_gpu_test 16 | from mmdet3d.datasets import build_dataloader, build_dataset 17 | from mmdet3d.models import build_model 18 | from mmdet.apis import multi_gpu_test, set_random_seed 19 | from mmdet.datasets import replace_ImageToTensor 20 | 21 | if mmdet.__version__ > '2.23.0': 22 | # If mmdet version > 2.23.0, setup_multi_processes would be imported and 23 | # used from mmdet instead of mmdet3d. 
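    # (setup_multi_processes configures the multiprocessing start method and
    # limits OMP/MKL/OpenCV threading for dataloader workers; newer mmdet
    # releases ship it directly, hence the version switch here.)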
24 | from mmdet.utils import setup_multi_processes 25 | else: 26 | from mmdet3d.utils import setup_multi_processes 27 | 28 | try: 29 | # If mmdet version > 2.23.0, compat_cfg would be imported and 30 | # used from mmdet instead of mmdet3d. 31 | from mmdet.utils import compat_cfg 32 | except ImportError: 33 | from mmdet3d.utils import compat_cfg 34 | 35 | 36 | def parse_args(): 37 | parser = argparse.ArgumentParser( 38 | description='MMDet test (and eval) a model') 39 | parser.add_argument('config', help='test config file path') 40 | parser.add_argument('checkpoint', help='checkpoint file') 41 | parser.add_argument('--out', help='output result file in pickle format') 42 | parser.add_argument( 43 | '--fuse-conv-bn', 44 | action='store_true', 45 | help='Whether to fuse conv and bn, this will slightly increase' 46 | 'the inference speed') 47 | parser.add_argument( 48 | '--gpu-ids', 49 | type=int, 50 | nargs='+', 51 | help='(Deprecated, please use --gpu-id) ids of gpus to use ' 52 | '(only applicable to non-distributed training)') 53 | parser.add_argument( 54 | '--gpu-id', 55 | type=int, 56 | default=0, 57 | help='id of gpu to use ' 58 | '(only applicable to non-distributed testing)') 59 | parser.add_argument( 60 | '--format-only', 61 | action='store_true', 62 | help='Format the output results without perform evaluation. It is' 63 | 'useful when you want to format the result to a specific format and ' 64 | 'submit it to the test server') 65 | parser.add_argument( 66 | '--eval', 67 | type=str, 68 | nargs='+', 69 | help='evaluation metrics, which depends on the dataset, e.g., "bbox",' 70 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 71 | parser.add_argument('--show', action='store_true', help='show results') 72 | parser.add_argument( 73 | '--show-dir', help='directory where results will be saved') 74 | parser.add_argument( 75 | '--gpu-collect', 76 | action='store_true', 77 | help='whether to use gpu to collect results.') 78 | parser.add_argument( 79 | '--tmpdir', 80 | help='tmp directory used for collecting results from multiple ' 81 | 'workers, available when gpu-collect is not specified') 82 | parser.add_argument('--seed', type=int, default=0, help='random seed') 83 | parser.add_argument( 84 | '--deterministic', 85 | action='store_true', 86 | help='whether to set deterministic options for CUDNN backend.') 87 | parser.add_argument( 88 | '--cfg-options', 89 | nargs='+', 90 | action=DictAction, 91 | help='override some settings in the used config, the key-value pair ' 92 | 'in xxx=yyy format will be merged into config file. If the value to ' 93 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 94 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 95 | 'Note that the quotation marks are necessary and that no white space ' 96 | 'is allowed.') 97 | parser.add_argument( 98 | '--options', 99 | nargs='+', 100 | action=DictAction, 101 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 102 | 'format will be kwargs for dataset.evaluate() function (deprecate), ' 103 | 'change to --eval-options instead.') 104 | parser.add_argument( 105 | '--eval-options', 106 | nargs='+', 107 | action=DictAction, 108 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 109 | 'format will be kwargs for dataset.evaluate() function') 110 | parser.add_argument( 111 | '--launcher', 112 | choices=['none', 'pytorch', 'slurm', 'mpi'], 113 | default='none', 114 | help='job launcher') 115 | parser.add_argument('--local_rank', type=int, default=0) 116 | args = parser.parse_args() 117 | if 'LOCAL_RANK' not in os.environ: 118 | os.environ['LOCAL_RANK'] = str(args.local_rank) 119 | 120 | if args.options and args.eval_options: 121 | raise ValueError( 122 | '--options and --eval-options cannot be both specified, ' 123 | '--options is deprecated in favor of --eval-options') 124 | if args.options: 125 | warnings.warn('--options is deprecated in favor of --eval-options') 126 | args.eval_options = args.options 127 | return args 128 | 129 | 130 | def main(): 131 | args = parse_args() 132 | 133 | assert args.out or args.eval or args.format_only or args.show \ 134 | or args.show_dir, \ 135 | ('Please specify at least one operation (save/eval/format/show the ' 136 | 'results / save the results) with the argument "--out", "--eval"' 137 | ', "--format-only", "--show" or "--show-dir"') 138 | 139 | if args.eval and args.format_only: 140 | raise ValueError('--eval and --format_only cannot be both specified') 141 | 142 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 143 | raise ValueError('The output file must be a pkl file.') 144 | 145 | cfg = Config.fromfile(args.config) 146 | if args.cfg_options is not None: 147 | cfg.merge_from_dict(args.cfg_options) 148 | 149 | # import modules from string list. 150 | if cfg.get('custom_imports', None): 151 | from mmcv.utils import import_modules_from_strings 152 | import_modules_from_strings(**cfg['custom_imports']) 153 | 154 | # import modules from plguin/xx, registry will be updated 155 | if hasattr(cfg, 'plugin'): 156 | if cfg.plugin: 157 | import importlib 158 | if hasattr(cfg, 'plugin_dir'): 159 | plugin_dir = cfg.plugin_dir 160 | _module_dir = os.path.dirname(plugin_dir) 161 | _module_dir = _module_dir.split('/') 162 | _module_path = _module_dir[0] 163 | 164 | for m in _module_dir[1:]: 165 | _module_path = _module_path + '.' + m 166 | print(_module_path) 167 | plg_lib = importlib.import_module(_module_path) 168 | else: 169 | # import dir is the dirpath for the config file 170 | _module_dir = os.path.dirname(args.config) 171 | _module_dir = _module_dir.split('/') 172 | _module_path = _module_dir[0] 173 | for m in _module_dir[1:]: 174 | _module_path = _module_path + '.' 
+ m 175 | print(_module_path) 176 | plg_lib = importlib.import_module(_module_path) 177 | 178 | cfg = compat_cfg(cfg) 179 | 180 | # set multi-process settings 181 | setup_multi_processes(cfg) 182 | 183 | # set cudnn_benchmark 184 | if cfg.get('cudnn_benchmark', False): 185 | torch.backends.cudnn.benchmark = True 186 | 187 | cfg.model.pretrained = None 188 | 189 | if args.gpu_ids is not None: 190 | cfg.gpu_ids = args.gpu_ids[0:1] 191 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 192 | 'Because we only support single GPU mode in ' 193 | 'non-distributed testing. Use the first GPU ' 194 | 'in `gpu_ids` now.') 195 | else: 196 | cfg.gpu_ids = [args.gpu_id] 197 | 198 | # init distributed env first, since logger depends on the dist info. 199 | if args.launcher == 'none': 200 | distributed = False 201 | else: 202 | distributed = True 203 | init_dist(args.launcher, **cfg.dist_params) 204 | 205 | test_dataloader_default_args = dict( 206 | samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False) 207 | 208 | # in case the test dataset is concatenated 209 | if isinstance(cfg.data.test, dict): 210 | cfg.data.test.test_mode = True 211 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: 212 | # Replace 'ImageToTensor' to 'DefaultFormatBundle' 213 | cfg.data.test.pipeline = replace_ImageToTensor( 214 | cfg.data.test.pipeline) 215 | elif isinstance(cfg.data.test, list): 216 | for ds_cfg in cfg.data.test: 217 | ds_cfg.test_mode = True 218 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: 219 | for ds_cfg in cfg.data.test: 220 | ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) 221 | 222 | test_loader_cfg = { 223 | **test_dataloader_default_args, 224 | **cfg.data.get('test_dataloader', {}) 225 | } 226 | 227 | # set random seeds 228 | if args.seed is not None: 229 | set_random_seed(args.seed, deterministic=args.deterministic) 230 | 231 | # build the dataloader 232 | dataset = build_dataset(cfg.data.test) 233 | data_loader = build_dataloader(dataset, **test_loader_cfg) 234 | 235 | # build the model and load checkpoint 236 | cfg.model.train_cfg = None 237 | model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) 238 | fp16_cfg = cfg.get('fp16', None) 239 | if fp16_cfg is not None: 240 | wrap_fp16_model(model) 241 | checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') 242 | if args.fuse_conv_bn: 243 | model = fuse_conv_bn(model) 244 | # old versions did not save class info in checkpoints, this walkaround is 245 | # for backward compatibility 246 | if 'CLASSES' in checkpoint.get('meta', {}): 247 | model.CLASSES = checkpoint['meta']['CLASSES'] 248 | else: 249 | model.CLASSES = dataset.CLASSES 250 | # palette for visualization in segmentation tasks 251 | if 'PALETTE' in checkpoint.get('meta', {}): 252 | model.PALETTE = checkpoint['meta']['PALETTE'] 253 | elif hasattr(dataset, 'PALETTE'): 254 | # segmentation dataset has `PALETTE` attribute 255 | model.PALETTE = dataset.PALETTE 256 | 257 | if not distributed: 258 | model = MMDataParallel(model, device_ids=cfg.gpu_ids) 259 | outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) 260 | else: 261 | model = MMDistributedDataParallel( 262 | model.cuda(), 263 | device_ids=[torch.cuda.current_device()], 264 | broadcast_buffers=False) 265 | outputs = multi_gpu_test(model, data_loader, args.tmpdir, 266 | args.gpu_collect) 267 | 268 | rank, _ = get_dist_info() 269 | if rank == 0: 270 | if args.out: 271 | print(f'\nwriting results to {args.out}') 272 | mmcv.dump(outputs, 
args.out) 273 | kwargs = {} if args.eval_options is None else args.eval_options 274 | if args.format_only: 275 | dataset.format_results(outputs, **kwargs) 276 | if args.eval: 277 | eval_kwargs = cfg.get('evaluation', {}).copy() 278 | # hard-code way to remove EvalHook args 279 | for key in [ 280 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 281 | 'rule' 282 | ]: 283 | eval_kwargs.pop(key, None) 284 | eval_kwargs.update(dict(metric=args.eval, **kwargs)) 285 | print(dataset.evaluate(outputs, **eval_kwargs)) 286 | 287 | 288 | if __name__ == '__main__': 289 | main() 290 | -------------------------------------------------------------------------------- /projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py: -------------------------------------------------------------------------------- 1 | plugin=True 2 | plugin_dir='projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.075, 0.075, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=20) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=False, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | train_pipeline = [ 21 | dict( 22 | type='LoadPointsFromFile', 23 | coord_type='LIDAR', 24 | load_dim=5, 25 | use_dim=[0, 1, 2, 3, 4], 26 | ), 27 | dict( 28 | type='LoadPointsFromMultiSweeps', 29 | sweeps_num=10, 30 | use_dim=[0, 1, 2, 3, 4], 31 | ), 32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 33 | dict( 34 | type='ObjectSample', 35 | db_sampler=dict( 36 | data_root='data/nuscenes/', 37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl', 38 | rate=1.0, 39 | prepare=dict( 40 | filter_by_difficulty=[-1], 41 | filter_by_min_points=dict( 42 | car=5, 43 | truck=5, 44 | bus=5, 45 | trailer=5, 46 | construction_vehicle=5, 47 | traffic_cone=5, 48 | barrier=5, 49 | motorcycle=5, 50 | bicycle=5, 51 | pedestrian=5)), 52 | classes=class_names, 53 | sample_groups=dict( 54 | car=2, 55 | truck=3, 56 | construction_vehicle=7, 57 | bus=4, 58 | trailer=6, 59 | barrier=2, 60 | motorcycle=6, 61 | bicycle=6, 62 | pedestrian=2, 63 | traffic_cone=2), 64 | points_loader=dict( 65 | type='LoadPointsFromFile', 66 | coord_type='LIDAR', 67 | load_dim=5, 68 | use_dim=[0, 1, 2, 3, 4], 69 | ))), 70 | dict( 71 | type='GlobalRotScaleTrans', 72 | rot_range=[-0.3925 * 2, 0.3925 * 2], 73 | scale_ratio_range=[0.9, 1.1], 74 | translation_std=[0.5, 0.5, 0.5]), 75 | dict( 76 | type='RandomFlip3D', 77 | sync_2d=False, 78 | flip_ratio_bev_horizontal=0.5, 79 | flip_ratio_bev_vertical=0.5), 80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 82 | dict(type='ObjectNameFilter', classes=class_names), 83 | dict(type='PointShuffle'), 84 | dict(type='DefaultFormatBundle3D', class_names=class_names), 85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'], 86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 87 | 'depth2img', 'cam2img', 'pad_shape', 88 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 92 | 'transformation_3d_flow', 
'rot_degree', 93 | 'gt_bboxes_3d', 'gt_labels_3d')) 94 | ] 95 | test_pipeline = [ 96 | dict( 97 | type='LoadPointsFromFile', 98 | coord_type='LIDAR', 99 | load_dim=5, 100 | use_dim=[0, 1, 2, 3, 4], 101 | ), 102 | dict( 103 | type='LoadPointsFromMultiSweeps', 104 | sweeps_num=10, 105 | use_dim=[0, 1, 2, 3, 4], 106 | ), 107 | dict( 108 | type='MultiScaleFlipAug3D', 109 | img_scale=(1333, 800), 110 | pts_scale_ratio=1, 111 | flip=False, 112 | transforms=[ 113 | dict( 114 | type='GlobalRotScaleTrans', 115 | rot_range=[0, 0], 116 | scale_ratio_range=[1.0, 1.0], 117 | translation_std=[0, 0, 0]), 118 | dict(type='RandomFlip3D'), 119 | dict( 120 | type='DefaultFormatBundle3D', 121 | class_names=class_names, 122 | with_label=False), 123 | dict(type='Collect3D', keys=['points']) 124 | ]) 125 | ] 126 | data = dict( 127 | samples_per_gpu=2, 128 | workers_per_gpu=4, 129 | train=dict( 130 | type='CBGSDataset', 131 | dataset=dict( 132 | type=dataset_type, 133 | data_root=data_root, 134 | ann_file=data_root + '/nuscenes_infos_train.pkl', 135 | load_interval=1, 136 | pipeline=train_pipeline, 137 | classes=class_names, 138 | modality=input_modality, 139 | test_mode=False, 140 | box_type_3d='LiDAR')), 141 | val=dict( 142 | type=dataset_type, 143 | data_root=data_root, 144 | ann_file=data_root + '/nuscenes_infos_val.pkl', 145 | load_interval=1, 146 | pipeline=test_pipeline, 147 | classes=class_names, 148 | modality=input_modality, 149 | test_mode=True, 150 | box_type_3d='LiDAR'), 151 | test=dict( 152 | type=dataset_type, 153 | data_root=data_root, 154 | ann_file=data_root + '/nuscenes_infos_val.pkl', 155 | load_interval=1, 156 | pipeline=test_pipeline, 157 | classes=class_names, 158 | modality=input_modality, 159 | test_mode=True, 160 | box_type_3d='LiDAR')) 161 | model = dict( 162 | type='FSTRDetector', 163 | pts_voxel_layer=dict( 164 | num_point_features=5, 165 | max_num_points=10, 166 | voxel_size=voxel_size, 167 | max_voxels=(120000, 160000), 168 | point_cloud_range=point_cloud_range), 169 | pts_voxel_encoder=dict( 170 | type='HardSimpleVFE', 171 | num_features=5, 172 | ), 173 | pts_middle_encoder=dict( 174 | type='VoxelNextEncoder', 175 | in_channels=5, 176 | sparse_shape=[41, 1440, 1440], 177 | base_channels=32, 178 | output_channels=256, 179 | encoder_channels=((32, 32), (64, 64, 64), (128, 128, 128), (256, 256, 256),(256, 256, 256),(256, 256, 256)), 180 | sparse_conv_kernel = (5, 3, 3, 3, 3), 181 | order=('conv', 'norm', 'act'), 182 | block_type='basicblock'), 183 | 184 | pts_bbox_head=dict( 185 | type='FSTRHead', 186 | in_channels=256, 187 | hidden_dim=256, 188 | downsample_scale=8, 189 | num_query=500, 190 | num_init_query=200, 191 | init_dn_query = False, 192 | init_learnable_query = False, 193 | init_query_topk = 1, 194 | init_query_radius = 1, 195 | gauusian_dn_sampling=False, 196 | noise_mean = 0.5, 197 | noise_std = 0.125, 198 | max_sparse_token_per_sample = 10000, 199 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 200 | tasks=[ 201 | dict(num_class=10, class_names=[ 202 | 'car', 'truck', 'construction_vehicle', 203 | 'bus', 'trailer', 'barrier', 204 | 'motorcycle', 'bicycle', 205 | 'pedestrian', 'traffic_cone' 206 | ]), 207 | ], 208 | bbox_coder=dict( 209 | type='MultiTaskBBoxCoder', 210 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 211 | pc_range=point_cloud_range, 212 | max_num=300, 213 | voxel_size=voxel_size, 214 | num_classes=10), 215 | separate_head=dict( 216 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3), 217 | 
transformer=dict( 218 | type='FSTRTransformer', 219 | decoder=dict( 220 | type='PETRTransformerDecoder', 221 | return_intermediate=True, 222 | num_layers=1, 223 | transformerlayers=dict( 224 | type='PETRTransformerDecoderLayer', 225 | attn_cfgs=[ 226 | dict( 227 | type='MultiheadAttention', 228 | embed_dims=256, 229 | num_heads=8, 230 | dropout=0.1), 231 | dict( 232 | type='PETRMultiheadFlashAttention', 233 | embed_dims=256, 234 | num_heads=8, 235 | dropout=0.1), 236 | ], 237 | ffn_cfgs=dict( 238 | type='FFN', 239 | embed_dims=256, 240 | feedforward_channels=1024, 241 | num_fcs=2, 242 | ffn_drop=0., 243 | act_cfg=dict(type='ReLU', inplace=True), 244 | ), 245 | 246 | feedforward_channels=1024, #unused 247 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 248 | 'ffn', 'norm')), 249 | )), 250 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 251 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 252 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), 253 | ), 254 | train_cfg=dict( 255 | pts=dict( 256 | dataset='nuScenes', 257 | assigner=dict( 258 | type='HungarianAssigner3D', 259 | cls_cost=dict(type='FocalLossCost', weight=2.0), 260 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 261 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 262 | pc_range=point_cloud_range, 263 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 264 | ), 265 | pos_weight=-1, 266 | gaussian_overlap=0.1, 267 | min_radius=2, 268 | grid_size=[1440, 1440, 40], # [x_len, y_len, 1] 269 | voxel_size=voxel_size, 270 | out_size_factor=out_size_factor, 271 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 272 | point_cloud_range=point_cloud_range)), 273 | test_cfg=dict( 274 | pts=dict( 275 | dataset='nuScenes', 276 | grid_size=[1440, 1440, 40], 277 | out_size_factor=out_size_factor, 278 | pc_range=point_cloud_range[0:2], 279 | voxel_size=voxel_size[:2], 280 | nms_type=None, 281 | nms_thr=0.1, 282 | use_rotate_nms=True, 283 | max_num=300 284 | ))) 285 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu 286 | optimizer_config = dict( 287 | type='CustomFp16OptimizerHook', 288 | loss_scale=512., 289 | grad_clip=dict(max_norm=35, norm_type=2), 290 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 291 | lr_config = dict( 292 | policy='cyclic', 293 | target_ratio=(8, 0.0001), 294 | cyclic_times=1, 295 | step_ratio_up=0.4) 296 | momentum_config = dict( 297 | policy='cyclic', 298 | target_ratio=(0.8947368421052632, 1), 299 | cyclic_times=1, 300 | step_ratio_up=0.4) 301 | total_epochs = 20 302 | checkpoint_config = dict(interval=1) 303 | evaluation = dict(interval=5, pipeline=test_pipeline) 304 | log_config = dict( 305 | interval=50, 306 | hooks=[dict(type='TextLoggerHook'), 307 | dict(type='TensorboardLoggerHook')]) 308 | dist_params = dict(backend='nccl') 309 | log_level = 'INFO' 310 | work_dir = None 311 | load_from = None 312 | resume_from = None 313 | workflow = [('train', 1)] 314 | gpu_ids = range(0, 8) 315 | 316 | -------------------------------------------------------------------------------- /projects/configs/lidar/fstr_xlarge_voxel0050_cbgs_20e.py: -------------------------------------------------------------------------------- 1 | plugin=True 2 | plugin_dir='projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 
54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.050, 0.050, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=20) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=False, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | train_pipeline = [ 21 | dict( 22 | type='LoadPointsFromFile', 23 | coord_type='LIDAR', 24 | load_dim=5, 25 | use_dim=[0, 1, 2, 3, 4], 26 | ), 27 | dict( 28 | type='LoadPointsFromMultiSweeps', 29 | sweeps_num=10, 30 | use_dim=[0, 1, 2, 3, 4], 31 | ), 32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 33 | dict( 34 | type='ObjectSample', 35 | db_sampler=dict( 36 | data_root='data/nuscenes/', 37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl', 38 | rate=1.0, 39 | prepare=dict( 40 | filter_by_difficulty=[-1], 41 | filter_by_min_points=dict( 42 | car=5, 43 | truck=5, 44 | bus=5, 45 | trailer=5, 46 | construction_vehicle=5, 47 | traffic_cone=5, 48 | barrier=5, 49 | motorcycle=5, 50 | bicycle=5, 51 | pedestrian=5)), 52 | classes=class_names, 53 | sample_groups=dict( 54 | car=2, 55 | truck=3, 56 | construction_vehicle=7, 57 | bus=4, 58 | trailer=6, 59 | barrier=2, 60 | motorcycle=6, 61 | bicycle=6, 62 | pedestrian=2, 63 | traffic_cone=2), 64 | points_loader=dict( 65 | type='LoadPointsFromFile', 66 | coord_type='LIDAR', 67 | load_dim=5, 68 | use_dim=[0, 1, 2, 3, 4], 69 | ))), 70 | dict( 71 | type='GlobalRotScaleTrans', 72 | rot_range=[-0.3925 * 2, 0.3925 * 2], 73 | scale_ratio_range=[0.9, 1.1], 74 | translation_std=[0.5, 0.5, 0.5]), 75 | dict( 76 | type='RandomFlip3D', 77 | sync_2d=False, 78 | flip_ratio_bev_horizontal=0.5, 79 | flip_ratio_bev_vertical=0.5), 80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 82 | dict(type='ObjectNameFilter', classes=class_names), 83 | dict(type='PointShuffle'), 84 | dict(type='DefaultFormatBundle3D', class_names=class_names), 85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'], 86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 87 | 'depth2img', 'cam2img', 'pad_shape', 88 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 92 | 'transformation_3d_flow', 'rot_degree', 93 | 'gt_bboxes_3d', 'gt_labels_3d')) 94 | ] 95 | test_pipeline = [ 96 | dict( 97 | type='LoadPointsFromFile', 98 | coord_type='LIDAR', 99 | load_dim=5, 100 | use_dim=[0, 1, 2, 3, 4], 101 | ), 102 | dict( 103 | type='LoadPointsFromMultiSweeps', 104 | sweeps_num=10, 105 | use_dim=[0, 1, 2, 3, 4], 106 | ), 107 | dict( 108 | type='MultiScaleFlipAug3D', 109 | img_scale=(1333, 800), 110 | pts_scale_ratio=1, 111 | flip=False, 112 | transforms=[ 113 | dict( 114 | type='GlobalRotScaleTrans', 115 | rot_range=[0, 0], 116 | scale_ratio_range=[1.0, 1.0], 117 | translation_std=[0, 0, 0]), 118 | dict(type='RandomFlip3D'), 119 | dict( 120 | type='DefaultFormatBundle3D', 121 | class_names=class_names, 122 | with_label=False), 123 | dict(type='Collect3D', keys=['points']) 124 | ]) 125 | ] 126 | data = dict( 127 | samples_per_gpu=2, 128 | workers_per_gpu=4, 129 | train=dict( 130 | type='CBGSDataset', 131 | 
dataset=dict( 132 | type=dataset_type, 133 | data_root=data_root, 134 | ann_file=data_root + '/nuscenes_infos_train.pkl', 135 | load_interval=1, 136 | pipeline=train_pipeline, 137 | classes=class_names, 138 | modality=input_modality, 139 | test_mode=False, 140 | box_type_3d='LiDAR')), 141 | val=dict( 142 | type=dataset_type, 143 | data_root=data_root, 144 | ann_file=data_root + '/nuscenes_infos_val.pkl', 145 | load_interval=1, 146 | pipeline=test_pipeline, 147 | classes=class_names, 148 | modality=input_modality, 149 | test_mode=True, 150 | box_type_3d='LiDAR'), 151 | test=dict( 152 | type=dataset_type, 153 | data_root=data_root, 154 | ann_file=data_root + '/nuscenes_infos_val.pkl', 155 | load_interval=1, 156 | pipeline=test_pipeline, 157 | classes=class_names, 158 | modality=input_modality, 159 | test_mode=True, 160 | box_type_3d='LiDAR')) 161 | model = dict( 162 | type='FSTRDetector', 163 | pts_voxel_layer=dict( 164 | num_point_features=5, 165 | max_num_points=10, 166 | voxel_size=voxel_size, 167 | max_voxels=(120000, 160000), 168 | point_cloud_range=point_cloud_range), 169 | pts_voxel_encoder=dict( 170 | type='HardSimpleVFE', 171 | num_features=5, 172 | ), 173 | pts_middle_encoder=dict( 174 | type='VoxelNextEncoder', 175 | in_channels=5, 176 | sparse_shape=[41, 2160, 2160], 177 | base_channels=32, 178 | output_channels=256, 179 | encoder_channels=((32, 32), (64, 64, 64, 64), (128, 128, 128, 128, 128), (256, 256, 256, 256, 256, 256, 256),(256, 256, 256, 256),(256, 256, 256, 256)), 180 | encoder_paddings=((1, 1 ), (1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1, 1, 1),(1, 1, 1, 1),(1, 1, 1, 1)), 181 | sparse_conv_kernel = (5, 3, 3, 3, 3), 182 | order=('conv', 'norm', 'act'), 183 | block_type='basicblock'), 184 | 185 | pts_bbox_head=dict( 186 | type='FSTRHead', 187 | in_channels=256, 188 | hidden_dim=256, 189 | downsample_scale=8, 190 | num_query=500, 191 | num_init_query=200, 192 | init_dn_query = False, 193 | init_learnable_query = False, 194 | init_query_topk = 1, 195 | init_query_radius = 1, 196 | gauusian_dn_sampling=False, 197 | noise_mean = 0.5, 198 | noise_std = 0.125, 199 | max_sparse_token_per_sample = 10000, 200 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 201 | tasks=[ 202 | dict(num_class=10, class_names=[ 203 | 'car', 'truck', 'construction_vehicle', 204 | 'bus', 'trailer', 'barrier', 205 | 'motorcycle', 'bicycle', 206 | 'pedestrian', 'traffic_cone' 207 | ]), 208 | ], 209 | bbox_coder=dict( 210 | type='MultiTaskBBoxCoder', 211 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 212 | pc_range=point_cloud_range, 213 | max_num=300, 214 | voxel_size=voxel_size, 215 | num_classes=10), 216 | separate_head=dict( 217 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3), 218 | transformer=dict( 219 | type='FSTRTransformer', 220 | decoder=dict( 221 | type='PETRTransformerDecoder', 222 | return_intermediate=True, 223 | num_layers=1, 224 | transformerlayers=dict( 225 | type='PETRTransformerDecoderLayer', 226 | attn_cfgs=[ 227 | dict( 228 | type='MultiheadAttention', 229 | embed_dims=256, 230 | num_heads=8, 231 | dropout=0.1), 232 | dict( 233 | type='PETRMultiheadFlashAttention', 234 | embed_dims=256, 235 | num_heads=8, 236 | dropout=0.1), 237 | ], 238 | ffn_cfgs=dict( 239 | type='FFN', 240 | embed_dims=256, 241 | feedforward_channels=1024, 242 | num_fcs=2, 243 | ffn_drop=0., 244 | act_cfg=dict(type='ReLU', inplace=True), 245 | ), 246 | 247 | feedforward_channels=1024, #unused 248 | operation_order=('self_attn', 'norm', 
'cross_attn', 'norm', 249 | 'ffn', 'norm')), 250 | )), 251 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 252 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 253 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), 254 | ), 255 | train_cfg=dict( 256 | pts=dict( 257 | dataset='nuScenes', 258 | assigner=dict( 259 | type='HungarianAssigner3D', 260 | cls_cost=dict(type='FocalLossCost', weight=2.0), 261 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 262 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 263 | pc_range=point_cloud_range, 264 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 265 | ), 266 | pos_weight=-1, 267 | gaussian_overlap=0.1, 268 | min_radius=2, 269 | grid_size=[2160, 2160, 40], # [x_len, y_len, 1] 270 | voxel_size=voxel_size, 271 | out_size_factor=out_size_factor, 272 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 273 | point_cloud_range=point_cloud_range)), 274 | test_cfg=dict( 275 | pts=dict( 276 | dataset='nuScenes', 277 | grid_size=[2160, 2160, 40], 278 | out_size_factor=out_size_factor, 279 | pc_range=point_cloud_range[0:2], 280 | voxel_size=voxel_size[:2], 281 | nms_type=None, 282 | nms_thr=0.1, 283 | use_rotate_nms=True, 284 | max_num=300 285 | ))) 286 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu 287 | optimizer_config = dict( 288 | type='CustomFp16OptimizerHook', 289 | loss_scale=512., 290 | grad_clip=dict(max_norm=35, norm_type=2), 291 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 292 | lr_config = dict( 293 | policy='cyclic', 294 | target_ratio=(8, 0.0001), 295 | cyclic_times=1, 296 | step_ratio_up=0.4) 297 | momentum_config = dict( 298 | policy='cyclic', 299 | target_ratio=(0.8947368421052632, 1), 300 | cyclic_times=1, 301 | step_ratio_up=0.4) 302 | total_epochs = 20 303 | checkpoint_config = dict(interval=1) 304 | evaluation = dict(interval=5, pipeline=test_pipeline) 305 | log_config = dict( 306 | interval=50, 307 | hooks=[dict(type='TextLoggerHook'), 308 | dict(type='TensorboardLoggerHook')]) 309 | dist_params = dict(backend='nccl') 310 | log_level = 'INFO' 311 | work_dir = None 312 | load_from = None 313 | resume_from = None 314 | workflow = [('train', 1)] 315 | gpu_ids = range(0, 8) 316 | 317 | -------------------------------------------------------------------------------- /tools/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
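# Example usage (illustrative; the work-dir and GPU count are placeholders):
#   single GPU: python tools/train.py projects/configs/lidar/fstr_voxel0075_cbgs_20e.py --work-dir work_dirs/fstr
#   multi GPU:  bash tools/dist_train.sh projects/configs/lidar/fstr_voxel0075_cbgs_20e.py 8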
2 | from __future__ import division 3 | import argparse 4 | import copy 5 | import os 6 | import time 7 | import warnings 8 | from os import path as osp 9 | 10 | import mmcv 11 | import torch 12 | import torch.distributed as dist 13 | from mmcv import Config, DictAction 14 | from mmcv.runner import get_dist_info, init_dist 15 | 16 | from mmdet import __version__ as mmdet_version 17 | from mmdet3d import __version__ as mmdet3d_version 18 | from mmdet3d.apis import init_random_seed, train_model 19 | from mmdet3d.datasets import build_dataset 20 | from mmdet3d.models import build_model 21 | from mmdet3d.utils import collect_env, get_root_logger 22 | from mmdet.apis import set_random_seed 23 | from mmseg import __version__ as mmseg_version 24 | 25 | try: 26 | # If mmdet version > 2.20.0, setup_multi_processes would be imported and 27 | # used from mmdet instead of mmdet3d. 28 | from mmdet.utils import setup_multi_processes 29 | except ImportError: 30 | from mmdet3d.utils import setup_multi_processes 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser(description='Train a detector') 35 | parser.add_argument('config', help='train config file path') 36 | parser.add_argument('--work-dir', help='the dir to save logs and models') 37 | parser.add_argument( 38 | '--resume-from', help='the checkpoint file to resume from') 39 | parser.add_argument( 40 | '--auto-resume', 41 | action='store_true', 42 | help='resume from the latest checkpoint automatically') 43 | parser.add_argument( 44 | '--no-validate', 45 | action='store_true', 46 | help='whether not to evaluate the checkpoint during training') 47 | group_gpus = parser.add_mutually_exclusive_group() 48 | group_gpus.add_argument( 49 | '--gpus', 50 | type=int, 51 | help='(Deprecated, please use --gpu-id) number of gpus to use ' 52 | '(only applicable to non-distributed training)') 53 | group_gpus.add_argument( 54 | '--gpu-ids', 55 | type=int, 56 | nargs='+', 57 | help='(Deprecated, please use --gpu-id) ids of gpus to use ' 58 | '(only applicable to non-distributed training)') 59 | group_gpus.add_argument( 60 | '--gpu-id', 61 | type=int, 62 | default=0, 63 | help='number of gpus to use ' 64 | '(only applicable to non-distributed training)') 65 | parser.add_argument('--seed', type=int, default=0, help='random seed') 66 | parser.add_argument( 67 | '--diff-seed', 68 | action='store_true', 69 | help='Whether or not set different seeds for different ranks') 70 | parser.add_argument( 71 | '--deterministic', 72 | action='store_true', 73 | help='whether to set deterministic options for CUDNN backend.') 74 | parser.add_argument( 75 | '--options', 76 | nargs='+', 77 | action=DictAction, 78 | help='override some settings in the used config, the key-value pair ' 79 | 'in xxx=yyy format will be merged into config file (deprecate), ' 80 | 'change to --cfg-options instead.') 81 | parser.add_argument( 82 | '--cfg-options', 83 | nargs='+', 84 | action=DictAction, 85 | help='override some settings in the used config, the key-value pair ' 86 | 'in xxx=yyy format will be merged into config file. If the value to ' 87 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 88 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 89 | 'Note that the quotation marks are necessary and that no white space ' 90 | 'is allowed.') 91 | parser.add_argument( 92 | '--launcher', 93 | choices=['none', 'pytorch', 'slurm', 'mpi'], 94 | default='none', 95 | help='job launcher') 96 | parser.add_argument('--local_rank', type=int, default=0) 97 | parser.add_argument( 98 | '--autoscale-lr', 99 | action='store_true', 100 | help='automatically scale lr with the number of gpus') 101 | args = parser.parse_args() 102 | if 'LOCAL_RANK' not in os.environ: 103 | os.environ['LOCAL_RANK'] = str(args.local_rank) 104 | 105 | if args.options and args.cfg_options: 106 | raise ValueError( 107 | '--options and --cfg-options cannot be both specified, ' 108 | '--options is deprecated in favor of --cfg-options') 109 | if args.options: 110 | warnings.warn('--options is deprecated in favor of --cfg-options') 111 | args.cfg_options = args.options 112 | 113 | return args 114 | 115 | 116 | def main(): 117 | args = parse_args() 118 | 119 | cfg = Config.fromfile(args.config) 120 | if args.cfg_options is not None: 121 | cfg.merge_from_dict(args.cfg_options) 122 | 123 | # set multi-process settings 124 | setup_multi_processes(cfg) 125 | 126 | if cfg.get('custom_imports', None): 127 | from mmcv.utils import import_modules_from_strings 128 | import_modules_from_strings(**cfg['custom_imports']) 129 | 130 | # import modules from plguin/xx, registry will be updated 131 | if hasattr(cfg, 'plugin'): 132 | if cfg.plugin: 133 | import importlib 134 | if hasattr(cfg, 'plugin_dir'): 135 | plugin_dir = cfg.plugin_dir 136 | _module_dir = os.path.dirname(plugin_dir) 137 | _module_dir = _module_dir.split('/') 138 | _module_path = _module_dir[0] 139 | 140 | for m in _module_dir[1:]: 141 | _module_path = _module_path + '.' + m 142 | print(_module_path) 143 | plg_lib = importlib.import_module(_module_path) 144 | else: 145 | # import dir is the dirpath for the config file 146 | _module_dir = os.path.dirname(args.config) 147 | _module_dir = _module_dir.split('/') 148 | _module_path = _module_dir[0] 149 | for m in _module_dir[1:]: 150 | _module_path = _module_path + '.' + m 151 | print(_module_path) 152 | plg_lib = importlib.import_module(_module_path) 153 | 154 | plg_lib = importlib.import_module('mmdetection3d.mmdet3d') 155 | 156 | # set cudnn_benchmark 157 | if cfg.get('cudnn_benchmark', False): 158 | torch.backends.cudnn.benchmark = True 159 | 160 | # work_dir is determined in this priority: CLI > segment in file > filename 161 | if args.work_dir is not None: 162 | # update configs according to CLI args if args.work_dir is not None 163 | cfg.work_dir = args.work_dir 164 | elif cfg.get('work_dir', None) is None: 165 | # use config filename as default work_dir if cfg.work_dir is None 166 | cfg.work_dir = osp.join('./work_dirs', 167 | osp.splitext(osp.basename(args.config))[0]) 168 | if args.resume_from is not None: 169 | cfg.resume_from = args.resume_from 170 | 171 | if args.auto_resume: 172 | cfg.auto_resume = args.auto_resume 173 | warnings.warn('`--auto-resume` is only supported when mmdet' 174 | 'version >= 2.20.0 for 3D detection model or' 175 | 'mmsegmentation verision >= 0.21.0 for 3D' 176 | 'segmentation model') 177 | 178 | if args.gpus is not None: 179 | cfg.gpu_ids = range(1) 180 | warnings.warn('`--gpus` is deprecated because we only support ' 181 | 'single GPU mode in non-distributed training. 
' 182 | 'Use `gpus=1` now.') 183 | if args.gpu_ids is not None: 184 | cfg.gpu_ids = args.gpu_ids[0:1] 185 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 186 | 'Because we only support single GPU mode in ' 187 | 'non-distributed training. Use the first GPU ' 188 | 'in `gpu_ids` now.') 189 | if args.gpus is None and args.gpu_ids is None: 190 | cfg.gpu_ids = [args.gpu_id] 191 | 192 | if args.autoscale_lr: 193 | # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) 194 | cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 195 | 196 | # init distributed env first, since logger depends on the dist info. 197 | if args.launcher == 'none': 198 | distributed = False 199 | else: 200 | distributed = True 201 | init_dist(args.launcher, **cfg.dist_params) 202 | # re-set gpu_ids with distributed training mode 203 | _, world_size = get_dist_info() 204 | cfg.gpu_ids = range(world_size) 205 | 206 | # create work_dir 207 | mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) 208 | # dump config 209 | cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) 210 | # init the logger before other steps 211 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 212 | log_file = osp.join(cfg.work_dir, f'{timestamp}.log') 213 | # specify logger name, if we still use 'mmdet', the output info will be 214 | # filtered and won't be saved in the log_file 215 | # TODO: ugly workaround to judge whether we are training det or seg model 216 | if cfg.model.type in ['EncoderDecoder3D']: 217 | logger_name = 'mmseg' 218 | else: 219 | logger_name = 'mmdet' 220 | logger = get_root_logger( 221 | log_file=log_file, log_level=cfg.log_level, name=logger_name) 222 | 223 | # init the meta dict to record some important information such as 224 | # environment info and seed, which will be logged 225 | meta = dict() 226 | # log env info 227 | env_info_dict = collect_env() 228 | env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) 229 | dash_line = '-' * 60 + '\n' 230 | logger.info('Environment info:\n' + dash_line + env_info + '\n' + 231 | dash_line) 232 | meta['env_info'] = env_info 233 | meta['config'] = cfg.pretty_text 234 | 235 | # log some basic info 236 | logger.info(f'Distributed training: {distributed}') 237 | logger.info(f'Config:\n{cfg.pretty_text}') 238 | 239 | # set random seeds 240 | seed = init_random_seed(args.seed) 241 | seed = seed + dist.get_rank() if args.diff_seed else seed 242 | logger.info(f'Set random seed to {seed}, ' 243 | f'deterministic: {args.deterministic}') 244 | set_random_seed(seed, deterministic=args.deterministic) 245 | cfg.seed = seed 246 | meta['seed'] = seed 247 | meta['exp_name'] = osp.basename(args.config) 248 | 249 | model = build_model( 250 | cfg.model, 251 | train_cfg=cfg.get('train_cfg'), 252 | test_cfg=cfg.get('test_cfg')) 253 | model.init_weights() 254 | 255 | logger.info(f'Model:\n{model}') 256 | datasets = [build_dataset(cfg.data.train)] 257 | if len(cfg.workflow) == 2: 258 | val_dataset = copy.deepcopy(cfg.data.val) 259 | # in case we use a dataset wrapper 260 | if 'dataset' in cfg.data.train: 261 | val_dataset.pipeline = cfg.data.train.dataset.pipeline 262 | else: 263 | val_dataset.pipeline = cfg.data.train.pipeline 264 | # set test_mode=False here in deep copied config 265 | # which do not affect AP/AR calc ulation later 266 | # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa 267 | val_dataset.test_mode = False 268 | 
datasets.append(build_dataset(val_dataset)) 269 | if cfg.checkpoint_config is not None: 270 | # save mmdet version, config file content and class names in 271 | # checkpoints as meta data 272 | cfg.checkpoint_config.meta = dict( 273 | mmdet_version=mmdet_version, 274 | mmseg_version=mmseg_version, 275 | mmdet3d_version=mmdet3d_version, 276 | config=cfg.pretty_text, 277 | CLASSES=datasets[0].CLASSES, 278 | PALETTE=datasets[0].PALETTE # for segmentors 279 | if hasattr(datasets[0], 'PALETTE') else None) 280 | # add an attribute for visualization convenience 281 | model.CLASSES = datasets[0].CLASSES 282 | train_model( 283 | model, 284 | datasets, 285 | cfg, 286 | distributed=distributed, 287 | validate=(not args.no_validate), 288 | timestamp=timestamp, 289 | meta=meta) 290 | 291 | 292 | if __name__ == '__main__': 293 | main() -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/cmt_transformer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import math 12 | import copy 13 | import warnings 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | import torch.utils.checkpoint as cp 18 | 19 | from typing import Sequence 20 | from einops import rearrange 21 | from mmcv.cnn.bricks.drop import build_dropout 22 | from mmcv.runner.base_module import BaseModule 23 | from mmcv.cnn.bricks.transformer import ( 24 | BaseTransformerLayer, 25 | TransformerLayerSequence, 26 | build_transformer_layer_sequence 27 | ) 28 | from mmcv.cnn import ( 29 | build_activation_layer, 30 | build_conv_layer, 31 | build_norm_layer, 32 | xavier_init 33 | ) 34 | from mmcv.cnn.bricks.registry import ( 35 | ATTENTION, 36 | TRANSFORMER_LAYER, 37 | TRANSFORMER_LAYER_SEQUENCE 38 | ) 39 | from mmcv.utils import ( 40 | ConfigDict, 41 | build_from_cfg, 42 | deprecated_api_warning, 43 | to_2tuple 44 | ) 45 | from mmdet.models.utils.builder import TRANSFORMER 46 | 47 | 48 | @TRANSFORMER.register_module() 49 | class CmtTransformer(BaseModule): 50 | """Implements the DETR transformer. 51 | Following the official DETR implementation, this module copy-paste 52 | from torch.nn.Transformer with modifications: 53 | * positional encodings are passed in MultiheadAttention 54 | * extra LN at the end of encoder is removed 55 | * decoder returns a stack of activations from all decoding layers 56 | See `paper: End-to-End Object Detection with Transformers 57 | `_ for details. 58 | Args: 59 | encoder (`mmcv.ConfigDict` | Dict): Config of 60 | TransformerEncoder. Defaults to None. 61 | decoder ((`mmcv.ConfigDict` | Dict)): Config of 62 | TransformerDecoder. Defaults to None 63 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 64 | Defaults to None. 
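        Example:
            A minimal, illustrative config for building this module through the
            ``TRANSFORMER`` registry. The decoder/attention types are the ones
            registered in this plugin's ``petr_transformer.py``; the layer count
            and dimensions below are placeholders, not values taken from the
            released configs::

                transformer = dict(
                    type='CmtTransformer',
                    decoder=dict(
                        type='PETRTransformerDecoder',
                        return_intermediate=True,
                        num_layers=6,
                        transformerlayers=dict(
                            type='PETRTransformerDecoderLayer',
                            attn_cfgs=[
                                dict(type='PETRMultiheadAttention',
                                     embed_dims=256, num_heads=8, attn_drop=0.1),
                                dict(type='PETRMultiheadAttention',
                                     embed_dims=256, num_heads=8, attn_drop=0.1),
                            ],
                            feedforward_channels=1024,
                            ffn_dropout=0.1,
                            operation_order=('self_attn', 'norm', 'cross_attn',
                                             'norm', 'ffn', 'norm'))))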
65 | """ 66 | 67 | def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False): 68 | super(CmtTransformer, self).__init__(init_cfg=init_cfg) 69 | if encoder is not None: 70 | self.encoder = build_transformer_layer_sequence(encoder) 71 | else: 72 | self.encoder = None 73 | self.decoder = build_transformer_layer_sequence(decoder) 74 | self.embed_dims = self.decoder.embed_dims 75 | self.cross = cross 76 | 77 | def init_weights(self): 78 | # follow the official DETR to init parameters 79 | for m in self.modules(): 80 | if hasattr(m, 'weight') and m.weight.dim() > 1: 81 | xavier_init(m, distribution='uniform') 82 | self._is_init = True 83 | 84 | def forward(self, x, x_img, query_embed, bev_pos_embed, rv_pos_embed, attn_masks=None, reg_branch=None): 85 | """Forward function for `Transformer`. 86 | Args: 87 | x (Tensor): Input query with shape [bs, c, h, w] where 88 | c = embed_dims. 89 | mask (Tensor): The key_padding_mask used for encoder and decoder, 90 | with shape [bs, h, w]. 91 | query_embed (Tensor): The query embedding for decoder, with shape 92 | [num_query, c]. 93 | pos_embed (Tensor): The positional encoding for encoder and 94 | decoder, with the same shape as `x`. 95 | Returns: 96 | tuple[Tensor]: results of decoder containing the following tensor. 97 | - out_dec: Output from decoder. If return_intermediate_dec \ 98 | is True output has shape [num_dec_layers, bs, 99 | num_query, embed_dims], else has shape [1, bs, \ 100 | num_query, embed_dims]. 101 | - memory: Output results from encoder, with shape \ 102 | [bs, embed_dims, h, w]. 103 | """ 104 | bs, c, h, w = x.shape 105 | bev_memory = rearrange(x, "bs c h w -> (h w) bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 106 | rv_memory = rearrange(x_img, "(bs v) c h w -> (v h w) bs c", bs=bs) 107 | bev_pos_embed = bev_pos_embed.unsqueeze(1).repeat(1, bs, 1) # [bs, n, c, h, w] -> [n*h*w, bs, c] 108 | rv_pos_embed = rearrange(rv_pos_embed, "(bs v) h w c -> (v h w) bs c", bs=bs) 109 | 110 | memory, pos_embed = torch.cat([bev_memory, rv_memory], dim=0), torch.cat([bev_pos_embed, rv_pos_embed], dim=0) 111 | query_embed = query_embed.transpose(0, 1) # [num_query, dim] -> [num_query, bs, dim] 112 | mask = memory.new_zeros(bs, memory.shape[0]) # [bs, n, h, w] -> [bs, n*h*w] 113 | 114 | target = torch.zeros_like(query_embed) 115 | # out_dec: [num_layers, num_query, bs, dim] 116 | out_dec = self.decoder( 117 | query=target, 118 | key=memory, 119 | value=memory, 120 | key_pos=pos_embed, 121 | query_pos=query_embed, 122 | key_padding_mask=mask, 123 | attn_masks=[attn_masks, None], 124 | reg_branch=reg_branch, 125 | ) 126 | out_dec = out_dec.transpose(1, 2) 127 | return out_dec, memory 128 | 129 | 130 | @TRANSFORMER.register_module() 131 | class CmtLidarTransformer(BaseModule): 132 | """Implements the DETR transformer. 133 | Following the official DETR implementation, this module copy-paste 134 | from torch.nn.Transformer with modifications: 135 | * positional encodings are passed in MultiheadAttention 136 | * extra LN at the end of encoder is removed 137 | * decoder returns a stack of activations from all decoding layers 138 | See `paper: End-to-End Object Detection with Transformers 139 | `_ for details. 140 | Args: 141 | encoder (`mmcv.ConfigDict` | Dict): Config of 142 | TransformerEncoder. Defaults to None. 143 | decoder ((`mmcv.ConfigDict` | Dict)): Config of 144 | TransformerDecoder. Defaults to None 145 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 146 | Defaults to None. 
147 | """ 148 | 149 | def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False): 150 | super(CmtLidarTransformer, self).__init__(init_cfg=init_cfg) 151 | if encoder is not None: 152 | self.encoder = build_transformer_layer_sequence(encoder) 153 | else: 154 | self.encoder = None 155 | self.decoder = build_transformer_layer_sequence(decoder) 156 | self.embed_dims = self.decoder.embed_dims 157 | self.cross = cross 158 | 159 | def init_weights(self): 160 | # follow the official DETR to init parameters 161 | for m in self.modules(): 162 | if hasattr(m, 'weight') and m.weight.dim() > 1: 163 | xavier_init(m, distribution='uniform') 164 | self._is_init = True 165 | 166 | def forward(self, x, mask, query_embed, pos_embed, attn_masks=None, reg_branch=None): 167 | """Forward function for `Transformer`. 168 | Args: 169 | x (Tensor): Input query with shape [bs, c, h, w] where 170 | c = embed_dims. 171 | mask (Tensor): The key_padding_mask used for encoder and decoder, 172 | with shape [bs, h, w]. 173 | query_embed (Tensor): The query embedding for decoder, with shape 174 | [num_query, c]. 175 | pos_embed (Tensor): The positional encoding for encoder and 176 | decoder, with the same shape as `x`. 177 | Returns: 178 | tuple[Tensor]: results of decoder containing the following tensor. 179 | - out_dec: Output from decoder. If return_intermediate_dec \ 180 | is True output has shape [num_dec_layers, bs, 181 | num_query, embed_dims], else has shape [1, bs, \ 182 | num_query, embed_dims]. 183 | - memory: Output results from encoder, with shape \ 184 | [bs, embed_dims, h, w]. 185 | """ 186 | bs, c, h, w = x.shape 187 | memory = rearrange(x, "bs c h w -> (h w) bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 188 | pos_embed = pos_embed.unsqueeze(1).repeat(1, bs, 1) # [bs, n, c, h, w] -> [n*h*w, bs, c] 189 | query_embed = query_embed.transpose(0, 1) # [num_query, dim] -> [num_query, bs, dim] 190 | mask = mask.view(bs, -1) # [bs, n, h, w] -> [bs, n*h*w] 191 | target = torch.zeros_like(query_embed) 192 | # out_dec: [num_layers, num_query, bs, dim] 193 | out_dec = self.decoder( 194 | query=target, 195 | key=memory, 196 | value=memory, 197 | key_pos=pos_embed, 198 | query_pos=query_embed, 199 | key_padding_mask=mask, 200 | attn_masks=[attn_masks, None], 201 | reg_branch=reg_branch, 202 | ) 203 | out_dec = out_dec.transpose(1, 2) 204 | return out_dec, memory 205 | 206 | 207 | 208 | @TRANSFORMER.register_module() 209 | class FSTRTransformer(CmtLidarTransformer): 210 | """Implements the DETR transformer. 211 | Following the official DETR implementation, this module copy-paste 212 | from torch.nn.Transformer with modifications: 213 | * positional encodings are passed in MultiheadAttention 214 | * extra LN at the end of encoder is removed 215 | * decoder returns a stack of activations from all decoding layers 216 | See `paper: End-to-End Object Detection with Transformers 217 | `_ for details. 218 | Args: 219 | encoder (`mmcv.ConfigDict` | Dict): Config of 220 | TransformerEncoder. Defaults to None. 221 | decoder ((`mmcv.ConfigDict` | Dict)): Config of 222 | TransformerDecoder. Defaults to None 223 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 224 | Defaults to None. 225 | """ 226 | 227 | def __init__(self, **kwargs): 228 | super(FSTRTransformer, self).__init__(**kwargs) 229 | 230 | def forward(self, x, query_embed, bev_pos_embed, attn_masks=None, bev_key_padding_mask=None, reg_branch=None, target = None): 231 | """Forward function for `Transformer`. 
232 | Args: 233 | x (Tensor): Input query with shape [bs, c, h, w] where 234 | c = embed_dims. 235 | mask (Tensor): The key_padding_mask used for encoder and decoder, 236 | with shape [bs, h, w]. 237 | query_embed (Tensor): The query embedding for decoder, with shape 238 | [num_query, c]. 239 | pos_embed (Tensor): The positional encoding for encoder and 240 | decoder, with the same shape as `x`. 241 | Returns: 242 | tuple[Tensor]: results of decoder containing the following tensor. 243 | - out_dec: Output from decoder. If return_intermediate_dec \ 244 | is True output has shape [num_dec_layers, bs, 245 | num_query, embed_dims], else has shape [1, bs, \ 246 | num_query, embed_dims]. 247 | - memory: Output results from encoder, with shape \ 248 | [bs, embed_dims, h, w]. 249 | """ 250 | bs, n, c = x.shape 251 | bev_memory = rearrange(x, "bs n c -> n bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 252 | bev_pos_embed = rearrange(bev_pos_embed, "bs n c -> n bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 253 | 254 | memory, pos_embed = bev_memory, bev_pos_embed 255 | query_embed = query_embed.transpose(0, 1) # [bs, num_query, dim] -> [num_query, bs, dim] 256 | 257 | if bev_key_padding_mask is None: 258 | mask = memory.new_zeros(bs, memory.shape[0]) # [bs, n, h, w] -> [bs, n*h*w] 259 | else: 260 | mask = bev_key_padding_mask 261 | 262 | assert target is not None 263 | out_dec = self.decoder( 264 | query=target, 265 | key=memory, 266 | value=memory, 267 | key_pos=pos_embed, 268 | query_pos=query_embed, 269 | key_padding_mask=mask, 270 | attn_masks=[attn_masks, None], 271 | reg_branch=reg_branch, 272 | ) 273 | out_dec = out_dec.transpose(1, 2) 274 | return out_dec, memory -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/petr_transformer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import copy 3 | import warnings 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.utils.checkpoint as cp 8 | 9 | from einops import rearrange 10 | from mmcv.cnn.bricks.drop import build_dropout 11 | from mmcv.runner.base_module import BaseModule 12 | 13 | from mmcv.cnn.bricks.transformer import ( 14 | BaseTransformerLayer, 15 | TransformerLayerSequence, 16 | build_transformer_layer_sequence 17 | ) 18 | from mmcv.cnn import ( 19 | build_activation_layer, 20 | build_conv_layer, 21 | build_norm_layer, 22 | xavier_init 23 | ) 24 | from mmcv.cnn.bricks.registry import ( 25 | ATTENTION,TRANSFORMER_LAYER, 26 | TRANSFORMER_LAYER_SEQUENCE 27 | ) 28 | from mmcv.utils import ( 29 | ConfigDict, 30 | build_from_cfg, 31 | deprecated_api_warning, 32 | to_2tuple 33 | ) 34 | from mmdet.models.utils.builder import TRANSFORMER 35 | 36 | 37 | @ATTENTION.register_module() 38 | class PETRMultiheadAttention(BaseModule): 39 | """A wrapper for ``torch.nn.MultiheadAttention``. 40 | This module implements MultiheadAttention with identity connection, 41 | and positional encoding is also passed as input. 42 | Args: 43 | embed_dims (int): The embedding dimension. 44 | num_heads (int): Parallel attention heads. 45 | attn_drop (float): A Dropout layer on attn_output_weights. 46 | Default: 0.0. 47 | proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. 48 | Default: 0.0. 49 | dropout_layer (obj:`ConfigDict`): The dropout_layer used 50 | when adding the shortcut. 51 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 52 | Default: None. 
53 | batch_first (bool): When it is True, Key, Query and Value are shape of 54 | (batch, n, embed_dim), otherwise (n, batch, embed_dim). 55 | Default to False. 56 | """ 57 | 58 | def __init__(self, 59 | embed_dims, 60 | num_heads, 61 | attn_drop=0., 62 | proj_drop=0., 63 | dropout_layer=dict(type='Dropout', drop_prob=0.), 64 | init_cfg=None, 65 | batch_first=False, 66 | **kwargs): 67 | super(PETRMultiheadAttention, self).__init__(init_cfg) 68 | if 'dropout' in kwargs: 69 | warnings.warn( 70 | 'The arguments `dropout` in MultiheadAttention ' 71 | 'has been deprecated, now you can separately ' 72 | 'set `attn_drop`(float), proj_drop(float), ' 73 | 'and `dropout_layer`(dict) ', DeprecationWarning) 74 | attn_drop = kwargs['dropout'] 75 | dropout_layer['drop_prob'] = kwargs.pop('dropout') 76 | 77 | self.embed_dims = embed_dims 78 | self.num_heads = num_heads 79 | self.batch_first = batch_first 80 | 81 | self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, 82 | **kwargs) 83 | 84 | self.proj_drop = nn.Dropout(proj_drop) 85 | self.dropout_layer = build_dropout( 86 | dropout_layer) if dropout_layer else nn.Identity() 87 | 88 | @deprecated_api_warning({'residual': 'identity'}, 89 | cls_name='MultiheadAttention') 90 | def forward(self, 91 | query, 92 | key=None, 93 | value=None, 94 | identity=None, 95 | query_pos=None, 96 | key_pos=None, 97 | attn_mask=None, 98 | key_padding_mask=None, 99 | **kwargs): 100 | """Forward function for `MultiheadAttention`. 101 | **kwargs allow passing a more general data flow when combining 102 | with other operations in `transformerlayer`. 103 | Args: 104 | query (Tensor): The input query with shape [num_queries, bs, 105 | embed_dims] if self.batch_first is False, else 106 | [bs, num_queries embed_dims]. 107 | key (Tensor): The key tensor with shape [num_keys, bs, 108 | embed_dims] if self.batch_first is False, else 109 | [bs, num_keys, embed_dims] . 110 | If None, the ``query`` will be used. Defaults to None. 111 | value (Tensor): The value tensor with same shape as `key`. 112 | Same in `nn.MultiheadAttention.forward`. Defaults to None. 113 | If None, the `key` will be used. 114 | identity (Tensor): This tensor, with the same shape as x, 115 | will be used for the identity link. 116 | If None, `x` will be used. Defaults to None. 117 | query_pos (Tensor): The positional encoding for query, with 118 | the same shape as `x`. If not None, it will 119 | be added to `x` before forward function. Defaults to None. 120 | key_pos (Tensor): The positional encoding for `key`, with the 121 | same shape as `key`. Defaults to None. If not None, it will 122 | be added to `key` before forward function. If None, and 123 | `query_pos` has the same shape as `key`, then `query_pos` 124 | will be used for `key_pos`. Defaults to None. 125 | attn_mask (Tensor): ByteTensor mask with shape [num_queries, 126 | num_keys]. Same in `nn.MultiheadAttention.forward`. 127 | Defaults to None. 128 | key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. 129 | Defaults to None. 130 | Returns: 131 | Tensor: forwarded results with shape 132 | [num_queries, bs, embed_dims] 133 | if self.batch_first is False, else 134 | [bs, num_queries embed_dims]. 
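        Example:
            A minimal sketch of the default behaviour (shapes are illustrative).
            When only ``query`` and ``query_pos`` are given, ``key``/``value``
            fall back to ``query`` and ``key_pos`` reuses ``query_pos``, so the
            call degenerates to self-attention with a residual connection::

                >>> import torch
                >>> self_attn = PETRMultiheadAttention(embed_dims=256, num_heads=8)
                >>> q = torch.randn(900, 2, 256)      # [num_queries, bs, embed_dims]
                >>> q_pos = torch.randn(900, 2, 256)
                >>> out = self_attn(q, query_pos=q_pos)
                >>> out.shape
                torch.Size([900, 2, 256])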
135 | """ 136 | 137 | if key is None: 138 | key = query 139 | if value is None: 140 | value = key 141 | if identity is None: 142 | identity = query 143 | if key_pos is None: 144 | if query_pos is not None: 145 | # use query_pos if key_pos is not available 146 | if query_pos.shape == key.shape: 147 | key_pos = query_pos 148 | else: 149 | warnings.warn(f'position encoding of key is' 150 | f'missing in {self.__class__.__name__}.') 151 | if query_pos is not None: 152 | query = query + query_pos 153 | if key_pos is not None: 154 | key = key + key_pos 155 | 156 | # Because the dataflow('key', 'query', 'value') of 157 | # ``torch.nn.MultiheadAttention`` is (num_query, batch, 158 | # embed_dims), We should adjust the shape of dataflow from 159 | # batch_first (batch, num_query, embed_dims) to num_query_first 160 | # (num_query ,batch, embed_dims), and recover ``attn_output`` 161 | # from num_query_first to batch_first. 162 | if self.batch_first: 163 | query = query.transpose(0, 1) 164 | key = key.transpose(0, 1) 165 | value = value.transpose(0, 1) 166 | 167 | out = self.attn( 168 | query=query, 169 | key=key, 170 | value=value, 171 | attn_mask=attn_mask, 172 | key_padding_mask=key_padding_mask)[0] 173 | 174 | if self.batch_first: 175 | out = out.transpose(0, 1) 176 | 177 | return identity + self.dropout_layer(self.proj_drop(out)) 178 | 179 | 180 | from .attention import FlashMHA 181 | 182 | @ATTENTION.register_module() 183 | class PETRMultiheadFlashAttention(BaseModule): 184 | """A wrapper for ``torch.nn.MultiheadAttention``. 185 | This module implements MultiheadAttention with identity connection, 186 | and positional encoding is also passed as input. 187 | Args: 188 | embed_dims (int): The embedding dimension. 189 | num_heads (int): Parallel attention heads. 190 | attn_drop (float): A Dropout layer on attn_output_weights. 191 | Default: 0.0. 192 | proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. 193 | Default: 0.0. 194 | dropout_layer (obj:`ConfigDict`): The dropout_layer used 195 | when adding the shortcut. 196 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 197 | Default: None. 198 | batch_first (bool): When it is True, Key, Query and Value are shape of 199 | (batch, n, embed_dim), otherwise (n, batch, embed_dim). 200 | Default to False. 
201 | """ 202 | 203 | def __init__(self, 204 | embed_dims, 205 | num_heads, 206 | attn_drop=0., 207 | proj_drop=0., 208 | dropout_layer=dict(type='Dropout', drop_prob=0.), 209 | init_cfg=None, 210 | batch_first=True, 211 | **kwargs): 212 | super(PETRMultiheadFlashAttention, self).__init__(init_cfg) 213 | if 'dropout' in kwargs: 214 | warnings.warn( 215 | 'The arguments `dropout` in MultiheadAttention ' 216 | 'has been deprecated, now you can separately ' 217 | 'set `attn_drop`(float), proj_drop(float), ' 218 | 'and `dropout_layer`(dict) ', DeprecationWarning) 219 | attn_drop = kwargs['dropout'] 220 | dropout_layer['drop_prob'] = kwargs.pop('dropout') 221 | 222 | self.embed_dims = embed_dims 223 | self.num_heads = num_heads 224 | self.batch_first = True 225 | 226 | self.attn = FlashMHA(embed_dims, num_heads, attn_drop, dtype=torch.float16, device='cuda', 227 | **kwargs) 228 | 229 | self.proj_drop = nn.Dropout(proj_drop) 230 | self.dropout_layer = build_dropout( 231 | dropout_layer) if dropout_layer else nn.Identity() 232 | 233 | @deprecated_api_warning({'residual': 'identity'}, 234 | cls_name='MultiheadAttention') 235 | def forward(self, 236 | query, 237 | key=None, 238 | value=None, 239 | identity=None, 240 | query_pos=None, 241 | key_pos=None, 242 | attn_mask=None, 243 | key_padding_mask=None, 244 | **kwargs): 245 | """Forward function for `MultiheadAttention`. 246 | **kwargs allow passing a more general data flow when combining 247 | with other operations in `transformerlayer`. 248 | Args: 249 | query (Tensor): The input query with shape [num_queries, bs, 250 | embed_dims] if self.batch_first is False, else 251 | [bs, num_queries embed_dims]. 252 | key (Tensor): The key tensor with shape [num_keys, bs, 253 | embed_dims] if self.batch_first is False, else 254 | [bs, num_keys, embed_dims] . 255 | If None, the ``query`` will be used. Defaults to None. 256 | value (Tensor): The value tensor with same shape as `key`. 257 | Same in `nn.MultiheadAttention.forward`. Defaults to None. 258 | If None, the `key` will be used. 259 | identity (Tensor): This tensor, with the same shape as x, 260 | will be used for the identity link. 261 | If None, `x` will be used. Defaults to None. 262 | query_pos (Tensor): The positional encoding for query, with 263 | the same shape as `x`. If not None, it will 264 | be added to `x` before forward function. Defaults to None. 265 | key_pos (Tensor): The positional encoding for `key`, with the 266 | same shape as `key`. Defaults to None. If not None, it will 267 | be added to `key` before forward function. If None, and 268 | `query_pos` has the same shape as `key`, then `query_pos` 269 | will be used for `key_pos`. Defaults to None. 270 | attn_mask (Tensor): ByteTensor mask with shape [num_queries, 271 | num_keys]. Same in `nn.MultiheadAttention.forward`. 272 | Defaults to None. 273 | key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. 274 | Defaults to None. 275 | Returns: 276 | Tensor: forwarded results with shape 277 | [num_queries, bs, embed_dims] 278 | if self.batch_first is False, else 279 | [bs, num_queries embed_dims]. 
280 | """ 281 | 282 | if key is None: 283 | key = query 284 | if value is None: 285 | value = key 286 | if identity is None: 287 | identity = query 288 | if key_pos is None: 289 | if query_pos is not None: 290 | # use query_pos if key_pos is not available 291 | if query_pos.shape == key.shape: 292 | key_pos = query_pos 293 | else: 294 | warnings.warn(f'position encoding of key is' 295 | f'missing in {self.__class__.__name__}.') 296 | if query_pos is not None: 297 | query = query + query_pos 298 | if key_pos is not None: 299 | key = key + key_pos 300 | 301 | # Because the dataflow('key', 'query', 'value') of 302 | # ``torch.nn.MultiheadAttention`` is (num_query, batch, 303 | # embed_dims), We should adjust the shape of dataflow from 304 | # batch_first (batch, num_query, embed_dims) to num_query_first 305 | # (num_query ,batch, embed_dims), and recover ``attn_output`` 306 | # from num_query_first to batch_first. 307 | if self.batch_first: 308 | query = query.transpose(0, 1) 309 | key = key.transpose(0, 1) 310 | value = value.transpose(0, 1) 311 | 312 | out = self.attn( 313 | q=query, 314 | k=key, 315 | v=value, 316 | key_padding_mask=None)[0] 317 | 318 | if self.batch_first: 319 | out = out.transpose(0, 1) 320 | 321 | return identity + self.dropout_layer(self.proj_drop(out)) 322 | 323 | 324 | @TRANSFORMER_LAYER_SEQUENCE.register_module() 325 | class PETRTransformerDecoder(TransformerLayerSequence): 326 | """Implements the decoder in DETR transformer. 327 | Args: 328 | return_intermediate (bool): Whether to return intermediate outputs. 329 | post_norm_cfg (dict): Config of last normalization layer. Default: 330 | `LN`. 331 | """ 332 | 333 | def __init__(self, 334 | *args, 335 | post_norm_cfg=dict(type='LN'), 336 | return_intermediate=False, 337 | **kwargs): 338 | 339 | super(PETRTransformerDecoder, self).__init__(*args, **kwargs) 340 | self.return_intermediate = return_intermediate 341 | if post_norm_cfg is not None: 342 | self.post_norm = build_norm_layer(post_norm_cfg, 343 | self.embed_dims)[1] 344 | else: 345 | self.post_norm = None 346 | 347 | def forward(self, query, *args, **kwargs): 348 | """Forward function for `TransformerDecoder`. 349 | Args: 350 | query (Tensor): Input query with shape 351 | `(num_query, bs, embed_dims)`. 352 | Returns: 353 | Tensor: Results with shape [1, num_query, bs, embed_dims] when 354 | return_intermediate is `False`, otherwise it has shape 355 | [num_layers, num_query, bs, embed_dims]. 356 | """ 357 | if not self.return_intermediate: 358 | x = super().forward(query, *args, **kwargs) 359 | if self.post_norm: 360 | x = self.post_norm(x)[None] 361 | return x 362 | 363 | intermediate = [] 364 | for layer in self.layers: 365 | query = layer(query, *args, **kwargs) 366 | if self.return_intermediate: 367 | if self.post_norm is not None: 368 | intermediate.append(self.post_norm(query)) 369 | else: 370 | intermediate.append(query) 371 | return torch.stack(intermediate) 372 | 373 | 374 | @TRANSFORMER_LAYER.register_module() 375 | class PETRTransformerDecoderLayer(BaseTransformerLayer): 376 | """Implements decoder layer in DETR transformer. 377 | Args: 378 | attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): 379 | Configs for self_attention or cross_attention, the order 380 | should be consistent with it in `operation_order`. If it is 381 | a dict, it would be expand to the number of attention in 382 | `operation_order`. 383 | feedforward_channels (int): The hidden dimension for FFNs. 
384 | ffn_dropout (float): Probability of an element to be zeroed 385 | in ffn. Default 0.0. 386 | operation_order (tuple[str]): The execution order of operation 387 | in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). 388 | Default:None 389 | act_cfg (dict): The activation config for FFNs. Default: `LN` 390 | norm_cfg (dict): Config dict for normalization layer. 391 | Default: `LN`. 392 | ffn_num_fcs (int): The number of fully-connected layers in FFNs. 393 | Default:2. 394 | """ 395 | 396 | def __init__(self, 397 | attn_cfgs, 398 | feedforward_channels, 399 | ffn_dropout=0.0, 400 | operation_order=None, 401 | act_cfg=dict(type='ReLU', inplace=True), 402 | norm_cfg=dict(type='LN'), 403 | ffn_num_fcs=2, 404 | with_cp=True, 405 | **kwargs): 406 | super(PETRTransformerDecoderLayer, self).__init__( 407 | attn_cfgs=attn_cfgs, 408 | feedforward_channels=feedforward_channels, 409 | ffn_dropout=ffn_dropout, 410 | operation_order=operation_order, 411 | act_cfg=act_cfg, 412 | norm_cfg=norm_cfg, 413 | ffn_num_fcs=ffn_num_fcs, 414 | **kwargs) 415 | assert len(operation_order) == 6 416 | assert set(operation_order) == set( 417 | ['self_attn', 'norm', 'cross_attn', 'ffn']) 418 | self.use_checkpoint = with_cp 419 | 420 | def _forward(self, 421 | query, 422 | key=None, 423 | value=None, 424 | query_pos=None, 425 | key_pos=None, 426 | attn_masks=None, 427 | query_key_padding_mask=None, 428 | key_padding_mask=None, 429 | ): 430 | """Forward function for `TransformerCoder`. 431 | Returns: 432 | Tensor: forwarded results with shape [num_query, bs, embed_dims]. 433 | """ 434 | x = super(PETRTransformerDecoderLayer, self).forward( 435 | query, 436 | key=key, 437 | value=value, 438 | query_pos=query_pos, 439 | key_pos=key_pos, 440 | attn_masks=attn_masks, 441 | query_key_padding_mask=query_key_padding_mask, 442 | key_padding_mask=key_padding_mask, 443 | ) 444 | 445 | return x 446 | 447 | def forward(self, 448 | query, 449 | key=None, 450 | value=None, 451 | query_pos=None, 452 | key_pos=None, 453 | attn_masks=None, 454 | query_key_padding_mask=None, 455 | key_padding_mask=None, 456 | **kwargs 457 | ): 458 | """Forward function for `TransformerCoder`. 459 | Returns: 460 | Tensor: forwarded results with shape [num_query, bs, embed_dims]. 461 | """ 462 | 463 | if self.use_checkpoint and self.training: 464 | x = cp.checkpoint( 465 | self._forward, 466 | query, 467 | key, 468 | value, 469 | query_pos, 470 | key_pos, 471 | attn_masks, 472 | query_key_padding_mask, 473 | key_padding_mask, 474 | ) 475 | else: 476 | x = self._forward( 477 | query, 478 | key=key, 479 | value=value, 480 | query_pos=query_pos, 481 | key_pos=key_pos, 482 | attn_masks=attn_masks, 483 | query_key_padding_mask=query_key_padding_mask, 484 | key_padding_mask=key_padding_mask 485 | ) 486 | 487 | return x 488 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/fstr_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
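# NOTE: the head defined below consumes the stacked per-layer decoder outputs
# produced by PETRTransformerDecoder with ``return_intermediate=True``. A rough
# shape sketch with illustrative sizes (6 decoder layers, batch 2, 900 queries,
# 256-dim embeddings):
#
#   decoder output        : [6, 900, 2, 256]   (num_layers, num_query, bs, C)
#   after transpose(1, 2) : [6, 2, 900, 256]   (done in the transformer wrappers)
#   SeparateTaskHead      : rearranged to [2, 6*256, 900] for grouped nn.Conv1d
#                           (one group per decoder layer), then reshaped back to
#                           [6, 2, 900, num_outputs] per regression/cls branch.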
2 | import math 3 | import copy 4 | import torch 5 | import torch.nn as nn 6 | from mmcv.cnn import build_conv_layer 7 | from mmcv.runner import BaseModule, force_fp32 8 | from mmdet.core import (build_assigner, build_sampler, multi_apply, 9 | reduce_mean, build_bbox_coder) 10 | from mmdet.models.utils import build_transformer 11 | from mmdet.models import HEADS, build_loss 12 | from mmdet.models.utils.transformer import inverse_sigmoid 13 | from mmdet3d.models.utils.clip_sigmoid import clip_sigmoid 14 | from mmdet3d.models import builder 15 | from einops import rearrange 16 | import collections 17 | 18 | from functools import reduce 19 | from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox 20 | from mmdet3d.ops import make_sparse_convmodule 21 | import spconv.pytorch as spconv 22 | from mmcv.cnn import build_conv_layer 23 | import copy 24 | from spconv.core import ConvAlgo 25 | 26 | def pos2embed(pos, num_pos_feats=128, temperature=10000): 27 | scale = 2 * math.pi 28 | pos = pos * scale 29 | dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) 30 | dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) 31 | pos_x = pos[..., 0, None] / dim_t 32 | pos_y = pos[..., 1, None] / dim_t 33 | # pos_z = pos[..., 2, None] / dim_t 34 | pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) 35 | pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) 36 | # pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()), dim=-1).flatten(-2) 37 | posemb = torch.cat((pos_y, pos_x), dim=-1) 38 | return posemb 39 | 40 | 41 | class LayerNormFunction(torch.autograd.Function): 42 | 43 | @staticmethod 44 | def forward(ctx, x, weight, bias, groups, eps): 45 | ctx.groups = groups 46 | ctx.eps = eps 47 | N, C, L = x.size() 48 | x = x.view(N, groups, C // groups, L) 49 | mu = x.mean(2, keepdim=True) 50 | var = (x - mu).pow(2).mean(2, keepdim=True) 51 | y = (x - mu) / (var + eps).sqrt() 52 | ctx.save_for_backward(y, var, weight) 53 | y = weight.view(1, C, 1) * y.view(N, C, L) + bias.view(1, C, 1) 54 | return y 55 | 56 | @staticmethod 57 | def backward(ctx, grad_output): 58 | groups = ctx.groups 59 | eps = ctx.eps 60 | 61 | N, C, L = grad_output.size() 62 | y, var, weight = ctx.saved_variables 63 | g = grad_output * weight.view(1, C, 1) 64 | g = g.view(N, groups, C//groups, L) 65 | mean_g = g.mean(dim=2, keepdim=True) 66 | mean_gy = (g * y).mean(dim=2, keepdim=True) 67 | gx = 1. / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g) 68 | return gx.view(N, C, L), (grad_output * y.view(N, C, L)).sum(dim=2).sum(dim=0), grad_output.sum(dim=2).sum( 69 | dim=0), None, None 70 | 71 | 72 | class GroupLayerNorm1d(nn.Module): 73 | 74 | def __init__(self, channels, groups=1, eps=1e-6): 75 | super(GroupLayerNorm1d, self).__init__() 76 | self.register_parameter('weight', nn.Parameter(torch.ones(channels))) 77 | self.register_parameter('bias', nn.Parameter(torch.zeros(channels))) 78 | self.groups = groups 79 | self.eps = eps 80 | 81 | def forward(self, x): 82 | return LayerNormFunction.apply(x, self.weight, self.bias, self.groups, self.eps) 83 | 84 | 85 | @HEADS.register_module() 86 | class SeparateTaskHead(BaseModule): 87 | """SeparateHead for CenterHead. 88 | 89 | Args: 90 | in_channels (int): Input channels for conv_layer. 91 | heads (dict): Conv information. 92 | head_conv (int): Output channels. 93 | Default: 64. 94 | final_kernal (int): Kernal size for the last conv layer. 95 | Deafult: 1. 
96 | init_bias (float): Initial bias. Default: -2.19. 97 | conv_cfg (dict): Config of conv layer. 98 | Default: dict(type='Conv2d') 99 | norm_cfg (dict): Config of norm layer. 100 | Default: dict(type='BN2d'). 101 | bias (str): Type of bias. Default: 'auto'. 102 | """ 103 | 104 | def __init__(self, 105 | in_channels, 106 | heads, 107 | groups=1, 108 | head_conv=64, 109 | final_kernel=1, 110 | init_bias=-2.19, 111 | init_cfg=None, 112 | **kwargs): 113 | assert init_cfg is None, 'To prevent abnormal initialization ' \ 114 | 'behavior, init_cfg is not allowed to be set' 115 | super(SeparateTaskHead, self).__init__(init_cfg=init_cfg) 116 | self.heads = heads 117 | self.groups = groups 118 | self.init_bias = init_bias 119 | for head in self.heads: 120 | classes, num_conv = self.heads[head] 121 | 122 | conv_layers = [] 123 | c_in = in_channels 124 | for i in range(num_conv - 1): 125 | conv_layers.extend([ 126 | nn.Conv1d( 127 | c_in * groups, 128 | head_conv * groups, 129 | kernel_size=final_kernel, 130 | stride=1, 131 | padding=final_kernel // 2, 132 | groups=groups, 133 | bias=False), 134 | GroupLayerNorm1d(head_conv * groups, groups=groups), 135 | nn.ReLU(inplace=True) 136 | ]) 137 | c_in = head_conv 138 | 139 | conv_layers.append( 140 | nn.Conv1d( 141 | head_conv * groups, 142 | classes * groups, 143 | kernel_size=final_kernel, 144 | stride=1, 145 | padding=final_kernel // 2, 146 | groups=groups, 147 | bias=True)) 148 | conv_layers = nn.Sequential(*conv_layers) 149 | 150 | self.__setattr__(head, conv_layers) 151 | 152 | if init_cfg is None: 153 | self.init_cfg = dict(type='Kaiming', layer='Conv1d') 154 | 155 | def init_weights(self): 156 | """Initialize weights.""" 157 | super().init_weights() 158 | for head in self.heads: 159 | if head == 'cls_logits': 160 | self.__getattr__(head)[-1].bias.data.fill_(self.init_bias) 161 | 162 | def forward(self, x): 163 | """Forward function for SepHead. 164 | 165 | Args: 166 | x (torch.Tensor): Input feature map with the shape of 167 | [N, B, query, C]. 168 | 169 | Returns: 170 | dict[str: torch.Tensor]: contains the following keys: 171 | 172 | -reg (torch.Tensor): 2D regression value with the \ 173 | shape of [N, B, query, 2]. 174 | -height (torch.Tensor): Height value with the \ 175 | shape of [N, B, query, 1]. 176 | -dim (torch.Tensor): Size value with the shape \ 177 | of [N, B, query, 3]. 178 | -rot (torch.Tensor): Rotation value with the \ 179 | shape of [N, B, query, 2]. 180 | -vel (torch.Tensor): Velocity value with the \ 181 | shape of [N, B, query, 2]. 
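        Example:
            A minimal sketch with illustrative sizes (6 decoder layers, batch
            size 2, 900 queries, 256-dim features); the ``heads`` dict here is a
            placeholder, not the full set used by ``FSTRHead``::

                >>> import torch
                >>> head = SeparateTaskHead(
                ...     in_channels=256,
                ...     heads=dict(center=(2, 2), height=(1, 2), cls_logits=(10, 2)),
                ...     groups=6)
                >>> x = torch.randn(6, 2, 900, 256)   # [N, B, query, C]
                >>> out = head(x)
                >>> out['center'].shape
                torch.Size([6, 2, 900, 2])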
182 | """ 183 | N, B, query_num, c1 = x.shape 184 | x = rearrange(x, "n b q c -> b (n c) q") 185 | ret_dict = dict() 186 | 187 | for head in self.heads: 188 | head_output = self.__getattr__(head)(x) 189 | ret_dict[head] = rearrange(head_output, "b (n c) q -> n b q c", n=N) 190 | 191 | return ret_dict 192 | 193 | 194 | 195 | @HEADS.register_module() 196 | class FSTRHead(BaseModule): 197 | "only init lidar proposal query" 198 | def __init__(self, 199 | in_channels, 200 | num_init_query = 200, 201 | num_query=900, 202 | max_sparse_token_per_sample = 10000, 203 | proposal_head_kernel = 3, 204 | hidden_dim=128, 205 | norm_bbox=True, 206 | downsample_scale=8, 207 | scalar=10, 208 | noise_scale=1.0, 209 | noise_trans=0.0, 210 | dn_weight=1.0, 211 | split=0.75, 212 | depth_num=64, 213 | nms_kernel_size=3, 214 | init_dn_query=False, 215 | init_learnable_query = False, 216 | init_query_topk = 1, 217 | init_query_radius = 1, 218 | gauusian_dn_sampling=False, 219 | noise_mean = 0.5, 220 | noise_std = 0.125, 221 | train_cfg=None, 222 | test_cfg=None, 223 | common_heads=dict( 224 | center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2) 225 | ), 226 | tasks=[ 227 | dict(num_class=1, class_names=['car']), 228 | dict(num_class=2, class_names=['truck', 'construction_vehicle']), 229 | dict(num_class=2, class_names=['bus', 'trailer']), 230 | dict(num_class=1, class_names=['barrier']), 231 | dict(num_class=2, class_names=['motorcycle', 'bicycle']), 232 | dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), 233 | ], 234 | transformer=None, 235 | bbox_coder=None, 236 | loss_cls=dict( 237 | type="FocalLoss", 238 | use_sigmoid=True, 239 | reduction="mean", 240 | gamma=2, alpha=0.25, loss_weight=1.0 241 | ), 242 | loss_bbox=dict( 243 | type="L1Loss", 244 | reduction="mean", 245 | loss_weight=0.25, 246 | ), 247 | loss_heatmap=dict( 248 | type="GuassianFocalLoss", 249 | reduction="mean" 250 | ), 251 | separate_head=dict( 252 | type='SeparateMlpHead', init_bias=-2.19, final_kernel=3), 253 | init_cfg=None, 254 | **kwargs): 255 | super(FSTRHead, self).__init__(**kwargs) 256 | 257 | 258 | self.num_classes = [len(t["class_names"]) for t in tasks] 259 | self.class_names = [t["class_names"] for t in tasks] 260 | self.hidden_dim = hidden_dim 261 | self.train_cfg = train_cfg 262 | self.test_cfg = test_cfg 263 | self.num_query = num_query 264 | self.in_channels = in_channels 265 | self.norm_bbox = norm_bbox 266 | self.downsample_scale = downsample_scale 267 | self.scalar = scalar 268 | self.bbox_noise_scale = noise_scale 269 | self.bbox_noise_trans = noise_trans 270 | self.dn_weight = dn_weight 271 | self.split = split 272 | self.depth_num = depth_num 273 | self.nms_kernel_size = nms_kernel_size 274 | self.num_proposals = num_query 275 | self.loss_cls = build_loss(loss_cls) 276 | self.loss_bbox = build_loss(loss_bbox) 277 | self.loss_heatmap = build_loss(loss_heatmap) 278 | self.bbox_coder = build_bbox_coder(bbox_coder) 279 | self.pc_range = self.bbox_coder.pc_range 280 | self.fp16_enabled = False 281 | self.init_dn_query = init_dn_query 282 | self.init_learnable_query = init_learnable_query 283 | self.gauusian_dn_sampling = gauusian_dn_sampling 284 | self.noise_mean = noise_mean 285 | self.noise_std = noise_std 286 | self.init_query_topk = init_query_topk 287 | self.init_query_radius = init_query_radius 288 | 289 | # transformer 290 | self.transformer = build_transformer(transformer) 291 | # self.reference_points = nn.Embedding(num_query, 3) 292 | self.bev_embedding = nn.Sequential( 293 | 
nn.Linear(hidden_dim * 2, hidden_dim), 294 | nn.ReLU(inplace=True), 295 | nn.Linear(hidden_dim, hidden_dim) 296 | ) 297 | 298 | # task head 299 | self.task_heads = nn.ModuleList() 300 | for num_cls in self.num_classes: 301 | heads = copy.deepcopy(common_heads) 302 | heads.update(dict(cls_logits=(num_cls, 2))) 303 | separate_head.update( 304 | in_channels=hidden_dim, 305 | heads=heads, num_cls=num_cls, 306 | groups=transformer.decoder.num_layers 307 | ) 308 | self.task_heads.append(builder.build_head(separate_head)) 309 | 310 | # assigner 311 | if train_cfg: 312 | self.assigner = build_assigner(train_cfg["assigner"]) 313 | sampler_cfg = dict(type='PseudoSampler') 314 | self.sampler = build_sampler(sampler_cfg, context=self) 315 | 316 | 317 | self.num_init_query = num_init_query 318 | assert self.num_init_query < self.num_query, "number of init query must less than number of query" 319 | self.reference_points = nn.Embedding(self.num_query - self.num_init_query, 3) 320 | self.class_encoding = nn.Sequential() 321 | self.shared_conv = make_sparse_convmodule( 322 | self.in_channels, 323 | self.hidden_dim, 324 | (3,3), 325 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 326 | padding=(1,1), 327 | indice_key='head_spconv_1', 328 | conv_type='SubMConv2d', 329 | order=('conv', 'norm', 'act')) 330 | self.sparse_maxpool_2d = spconv.SparseMaxPool2d(3, 1, 1, subm=True, algo=ConvAlgo.Native, indice_key='max_pool_head_3') 331 | self.sparse_maxpool_2d_small = spconv.SparseMaxPool2d(1, 1, 0, subm=True, algo=ConvAlgo.Native, indice_key='max_pool_head_3') 332 | self.max_sparse_token_per_sample = max_sparse_token_per_sample 333 | 334 | # for sparse heatmap 335 | self.proposal_head_kernel = proposal_head_kernel 336 | output_channels = sum(self.num_classes) 337 | num_conv = 2 338 | self.heatmap_head = nn.Sequential() 339 | fc_list = [] 340 | for k in range(num_conv - 1): 341 | fc_list.append( 342 | make_sparse_convmodule( 343 | self.hidden_dim, 344 | self.hidden_dim, 345 | self.proposal_head_kernel, 346 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 347 | padding=int(self.proposal_head_kernel//2), 348 | indice_key='head_spconv_1', 349 | conv_type='SubMConv2d', 350 | order=('conv', 'norm', 'act')), 351 | ) 352 | fc_list.append(build_conv_layer( 353 | dict(type='SubMConv2d', indice_key='hm_out'), 354 | self.hidden_dim, 355 | sum(self.num_classes), 356 | 1, 357 | stride=1, 358 | padding=0, 359 | bias=True)) 360 | 361 | 362 | self.sparse_hm_layer = nn.Sequential(*fc_list) 363 | self.sparse_hm_layer[-1].bias.data.fill_(-2.19) 364 | 365 | @property 366 | def coords_bev(self): 367 | cfg = self.train_cfg if self.train_cfg else self.test_cfg 368 | x_size, y_size = ( 369 | cfg['grid_size'][1] // self.downsample_scale, 370 | cfg['grid_size'][0] // self.downsample_scale 371 | ) 372 | meshgrid = [[0, y_size - 1, y_size], [0, x_size - 1, x_size]] 373 | batch_y, batch_x = torch.meshgrid(*[torch.linspace(it[0], it[1], it[2]) for it in meshgrid]) 374 | batch_x = (batch_x + 0.5) / x_size 375 | batch_y = (batch_y + 0.5) / y_size 376 | coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0) 377 | coord_base = coord_base.view(2, -1).transpose(1, 0) # (H*W, 2) 378 | return coord_base 379 | def init_weights(self): 380 | super(FSTRHead, self).init_weights() 381 | nn.init.uniform_(self.reference_points.weight.data, 0, 1) 382 | 383 | def _bev_query_embed(self, ref_points, img_metas): 384 | bev_embeds = self.bev_embedding(pos2embed(ref_points, num_pos_feats=self.hidden_dim)) 385 | return bev_embeds 386 | def 
forward(self, points_feats, img_metas=None): 387 | """ 388 | list([bs, c, h, w]) 389 | """ 390 | img_metas = [img_metas] 391 | return multi_apply(self.forward_single, points_feats, img_metas) 392 | 393 | def forward_single(self, x, img_metas): 394 | """ 395 | x: [bs c h w] 396 | return List(dict(head_name: [num_dec x bs x num_query * head_dim]) ) x task_num 397 | """ 398 | ret_dicts = [] 399 | batch_size = len(img_metas) 400 | x = self.shared_conv(x) 401 | x_feature = torch.zeros(*(x.features.shape),device = x.features.device) 402 | x_feature[:,:] = x.features 403 | x_batch_indices = torch.zeros(x.indices.shape[0],1,device = x.features.device) 404 | x_ind = torch.zeros(x.indices.shape[0],2,device = x.features.device) 405 | x_2dpos = torch.zeros(x.indices.shape[0],2,device = x.features.device) 406 | x_batch_indices[:,:] = x.indices[:,:1] 407 | x_ind[:,:] = x.indices[:,-2:] 408 | x_ind = x_ind.to(torch.float32) 409 | cfg = self.train_cfg if self.train_cfg else self.test_cfg 410 | y_size, x_size = x.spatial_shape 411 | x_2dpos[:,0] = (x_ind[:,1] + 0.5) / x_size 412 | x_2dpos[:,1] = (x_ind[:,0] + 0.5) / y_size 413 | batch_size = int(x.batch_size) 414 | 415 | sparse_hm = self.sparse_hm_layer(x) 416 | sparse_hm_clone = spconv.SparseConvTensor( 417 | features=sparse_hm.features.clone().detach().sigmoid(), 418 | indices=sparse_hm.indices.clone(), 419 | spatial_shape=sparse_hm.spatial_shape, 420 | batch_size=sparse_hm.batch_size 421 | ) 422 | x_hm_max = self.sparse_maxpool_2d(sparse_hm_clone, True) 423 | x_hm_max_small = self.sparse_maxpool_2d_small(sparse_hm_clone, True) 424 | 425 | 426 | selected = (x_hm_max.features == sparse_hm_clone.features) 427 | selected_small = (x_hm_max_small.features == sparse_hm_clone.features) 428 | selected[:,8] = selected_small[:,8] 429 | selected[:,9] = selected_small[:,9] 430 | 431 | score = sparse_hm_clone.features * selected 432 | score, _ = score.topk(1,dim=1) 433 | proposal_list = [] 434 | proposal_feature = [] 435 | # topk for each sample in batch 436 | for i in range(batch_size): 437 | mask = (x_batch_indices == i).squeeze(-1) 438 | sample_voxel_pos = x_2dpos[mask] 439 | sample_voxel_hm = score[mask] 440 | sample_voxel_feature = x_feature[mask] 441 | _, proposal_ind = sample_voxel_hm.topk(self.num_init_query,dim=0) 442 | proposal_list.append(sample_voxel_pos.gather(0, proposal_ind.repeat(1,2))[None,...]) 443 | proposal_feature.append(sample_voxel_feature.gather(0, proposal_ind.repeat(1,sample_voxel_feature.shape[1]))[None,...]) 444 | query_pos = torch.cat(proposal_list,dim=0) 445 | query_init_feature = torch.cat(proposal_feature,dim=0) 446 | 447 | reference_points = self.reference_points.weight 448 | reference_points = reference_points.unsqueeze(0).repeat(batch_size,1,1) 449 | 450 | init_reference_points = torch.cat([query_pos,0.5*torch.ones([*query_pos.shape[:-1],1]).to(query_pos.device)],dim=-1) 451 | 452 | reference_points = torch.cat([init_reference_points, reference_points],dim=1) 453 | 454 | reference_points, attn_mask, mask_dict = self.prepare_for_dn(batch_size, reference_points, img_metas) 455 | 456 | pad_size = mask_dict['pad_size'] if mask_dict is not None else 0 457 | 458 | target = self.get_sparse_init_query(reference_points, x_feature , x_2dpos, x_batch_indices, pad_size) 459 | 460 | bev_pos_embeds = self.bev_embedding(pos2embed(x_2dpos, num_pos_feats=self.hidden_dim)) 461 | 462 | bev_query_embeds = self.query_embed(reference_points, img_metas) 463 | query_embeds = bev_query_embeds 464 | 465 | 466 | # pad or drop 467 | 468 | batch_feature = 
torch.zeros(batch_size,self.max_sparse_token_per_sample,self.hidden_dim,device = x.features.device) 469 | batch_bevemb = torch.zeros(batch_size,self.max_sparse_token_per_sample,self.hidden_dim,device = x.features.device) 470 | 471 | for i in range(batch_size): 472 | sample_token_num = (x_batch_indices==i).sum() 473 | batch_token_num = min(sample_token_num,self.max_sparse_token_per_sample) 474 | mask = (x_batch_indices == i).squeeze(-1) 475 | sample_voxel_hm = score[mask] 476 | sample_voxel_feature = x_feature[mask] 477 | sample_voxel_bev_emb = bev_pos_embeds[mask] 478 | _, voxel_ind = sample_voxel_hm.topk(batch_token_num,dim=0) 479 | # a = sample_voxel_feature.gather(0, voxel_ind.repeat(1,sample_voxel_feature.shape[1]))[None,...] 480 | batch_feature[i][:batch_token_num] = sample_voxel_feature.gather(0, voxel_ind.repeat(1,sample_voxel_feature.shape[1])) 481 | batch_bevemb[i][:batch_token_num] = sample_voxel_bev_emb.gather(0, voxel_ind.repeat(1,sample_voxel_bev_emb.shape[1])) 482 | 483 | outs_dec, _ = self.transformer( 484 | batch_feature, query_embeds, 485 | batch_bevemb, 486 | attn_masks=attn_mask, 487 | target = target 488 | ) 489 | outs_dec = torch.nan_to_num(outs_dec) 490 | 491 | reference = inverse_sigmoid(reference_points.clone()) 492 | 493 | flag = 0 494 | for task_id, task in enumerate(self.task_heads, 0): 495 | outs = task(outs_dec) 496 | center = (outs['center'] + reference[None, :, :, :2]).sigmoid() 497 | height = (outs['height'] + reference[None, :, :, 2:3]).sigmoid() 498 | _center, _height = center.new_zeros(center.shape), height.new_zeros(height.shape) 499 | _center[..., 0:1] = center[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0] 500 | _center[..., 1:2] = center[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1] 501 | _height[..., 0:1] = height[..., 0:1] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2] 502 | outs['center'] = _center 503 | outs['height'] = _height 504 | 505 | if mask_dict and mask_dict['pad_size'] > 0: 506 | task_mask_dict = copy.deepcopy(mask_dict) 507 | class_name = self.class_names[task_id] 508 | 509 | known_lbs_bboxes_label = task_mask_dict['known_lbs_bboxes'][0] 510 | known_labels_raw = task_mask_dict['known_labels_raw'] 511 | new_lbs_bboxes_label = known_lbs_bboxes_label.new_zeros(known_lbs_bboxes_label.shape) 512 | new_lbs_bboxes_label[:] = len(class_name) 513 | new_labels_raw = known_labels_raw.new_zeros(known_labels_raw.shape) 514 | new_labels_raw[:] = len(class_name) 515 | task_masks = [ 516 | torch.where(known_lbs_bboxes_label == class_name.index(i) + flag) 517 | for i in class_name 518 | ] 519 | task_masks_raw = [ 520 | torch.where(known_labels_raw == class_name.index(i) + flag) 521 | for i in class_name 522 | ] 523 | for cname, task_mask, task_mask_raw in zip(class_name, task_masks, task_masks_raw): 524 | new_lbs_bboxes_label[task_mask] = class_name.index(cname) 525 | new_labels_raw[task_mask_raw] = class_name.index(cname) 526 | task_mask_dict['known_lbs_bboxes'] = (new_lbs_bboxes_label, task_mask_dict['known_lbs_bboxes'][1]) 527 | task_mask_dict['known_labels_raw'] = new_labels_raw 528 | flag += len(class_name) 529 | 530 | for key in list(outs.keys()): 531 | outs['dn_' + key] = outs[key][:, :, :mask_dict['pad_size'], :] 532 | outs[key] = outs[key][:, :, mask_dict['pad_size']:, :] 533 | outs['dn_mask_dict'] = task_mask_dict 534 | 535 | ret_dicts.append(outs) 536 | ret_dicts[0]['sparse_heatmap'] = sparse_hm 537 | return ret_dicts 538 | 539 | 540 | def get_sparse_init_query(self, ref_points, 
x_feature, x_2dpos , x_batch_indices, pad_size): 541 | 542 | total_range = self.pc_range[3]-self.pc_range[0] 543 | radius = self.init_query_radius 544 | diameter = (2 * radius + 1)/total_range 545 | sigma = diameter / 6 546 | # masked_gaussian = torch.exp(- distances / (2 * sigma * sigma)) 547 | query_feature_list = [] 548 | batch_size = ref_points.shape[0] 549 | 550 | for bs in range(batch_size): 551 | sample_q = ref_points[bs][:,:2] 552 | sample_mask = x_batch_indices[:,0] == bs 553 | sample_token = x_feature[sample_mask] 554 | sample_pos = x_2dpos[sample_mask] 555 | with torch.no_grad(): 556 | dis_mat = sample_q.unsqueeze(1) - sample_pos.unsqueeze(0) 557 | dis_mat = -(dis_mat ** 2).sum(-1) 558 | nearest_dis_topk,nearest_order_topk = dis_mat.topk(self.init_query_topk ,dim=1,sorted= True) 559 | gaussian_weight = torch.exp( nearest_dis_topk / (2 * sigma * sigma)) 560 | gaussian_weight_sum = torch.clip(gaussian_weight.sum(-1),1) 561 | 562 | flatten_order = nearest_order_topk.view(-1,self.init_query_topk) 563 | flatten_weight = (gaussian_weight/gaussian_weight_sum.unsqueeze(1)).view(-1,self.init_query_topk) 564 | feature = (sample_token.gather(0, flatten_order.repeat(1,sample_token.shape[1]))*flatten_weight).view(-1,self.init_query_topk,sample_token.shape[1]).sum(1).unsqueeze(0) 565 | query_feature_list.append(feature) 566 | 567 | query_feature = torch.cat(query_feature_list,dim=0) 568 | if not self.init_dn_query: 569 | query_feature[:,:pad_size,:] *=0 570 | if not self.init_learnable_query: 571 | query_feature[:,pad_size+self.num_init_query:,:] *=0 572 | query_feature = query_feature.permute(1,0,2) 573 | 574 | 575 | return query_feature 576 | 577 | 578 | def prepare_for_dn(self, batch_size, reference_points, img_metas): 579 | if self.training: 580 | targets = [torch.cat((img_meta['gt_bboxes_3d']._data.gravity_center, img_meta['gt_bboxes_3d']._data.tensor[:, 3:]),dim=1) for img_meta in img_metas ] 581 | labels = [img_meta['gt_labels_3d']._data for img_meta in img_metas ] 582 | 583 | known = [(torch.ones_like(t)).cuda() for t in labels] 584 | know_idx = known 585 | unmask_bbox = unmask_label = torch.cat(known) 586 | known_num = [t.size(0) for t in targets] 587 | labels = torch.cat([t for t in labels]) 588 | boxes = torch.cat([t for t in targets]) 589 | batch_idx = torch.cat([torch.full((t.size(0),), i) for i, t in enumerate(targets)]) 590 | 591 | known_indice = torch.nonzero(unmask_label + unmask_bbox) 592 | known_indice = known_indice.view(-1) 593 | # add noise 594 | groups = min(self.scalar, self.num_query // max(known_num)) 595 | known_indice = known_indice.repeat(groups, 1).view(-1) 596 | known_labels = labels.repeat(groups, 1).view(-1).long().to(reference_points.device) 597 | known_labels_raw = labels.repeat(groups, 1).view(-1).long().to(reference_points.device) 598 | known_bid = batch_idx.repeat(groups, 1).view(-1) 599 | known_bboxs = boxes.repeat(groups, 1).to(reference_points.device) 600 | known_bbox_center = known_bboxs[:, :3].clone() 601 | known_bbox_scale = known_bboxs[:, 3:6].clone() 602 | 603 | # known_one_hot = F.one_hot(known_labels, self.num_classes[0]).permute(1,0) 604 | # known_query_cat_encoding = self.class_encoding(known_one_hot.float().unsqueeze(0)) 605 | if self.bbox_noise_scale > 0: 606 | diff = known_bbox_scale / 2 + self.bbox_noise_trans 607 | if self.gauusian_dn_sampling: 608 | rand_prob = torch.randn_like(known_bbox_center)*self.noise_std + self.noise_mean 609 | rand_pn = torch.rand_like(known_bbox_center) 610 | p_mask = rand_pn>0.5 611 | n_mask = rand_pn<=0.5 612 
| rand_prob[n_mask] *= -1 613 | else: 614 | rand_prob = torch.rand_like(known_bbox_center) * 2 - 1.0 615 | known_bbox_center += torch.mul(rand_prob, diff) * self.bbox_noise_scale 616 | known_bbox_center[..., 0:1] = (known_bbox_center[..., 0:1] - self.pc_range[0]) / ( 617 | self.pc_range[3] - self.pc_range[0] 618 | ) 619 | known_bbox_center[..., 1:2] = (known_bbox_center[..., 1:2] - self.pc_range[1]) / ( 620 | self.pc_range[4] - self.pc_range[1] 621 | ) 622 | known_bbox_center[..., 2:3] = (known_bbox_center[..., 2:3] - self.pc_range[2]) / ( 623 | self.pc_range[5] - self.pc_range[2] 624 | ) 625 | known_bbox_center = known_bbox_center.clamp(min=0.0, max=1.0) 626 | mask = torch.norm(rand_prob, 2, 1) > self.split 627 | known_labels[mask] = sum(self.num_classes) 628 | 629 | single_pad = int(max(known_num)) 630 | pad_size = int(single_pad * groups) 631 | padding_bbox = torch.zeros(batch_size,pad_size, 3).to(reference_points.device) 632 | # padding_cls_encoding = torch.zeros(batch_size,query_cat_encoding.shape[1],pad_size).to(reference_points.device) 633 | padded_reference_points = torch.cat([padding_bbox, reference_points], dim=1) 634 | # padding_query_cat_encoding = torch.cat([padding_cls_encoding, query_cat_encoding], dim=-1) 635 | # padding_query_cat_encoding = padding_query_cat_encoding.permute(0,2,1) 636 | # known_query_cat_encoding = known_query_cat_encoding.permute(0,2,1) 637 | 638 | if len(known_num): 639 | map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3] 640 | map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(groups)]).long() 641 | if len(known_bid): 642 | padded_reference_points[(known_bid.long(), map_known_indice)] = known_bbox_center.to(reference_points.device) 643 | # padding_query_cat_encoding[(known_bid.long(), map_known_indice)] = known_query_cat_encoding 644 | 645 | # padding_query_cat_encoding = padding_query_cat_encoding.permute(0,2,1) 646 | tgt_size = pad_size + self.num_query 647 | attn_mask = torch.ones(tgt_size, tgt_size).to(reference_points.device) < 0 648 | # match query cannot see the reconstruct 649 | attn_mask[pad_size:, :pad_size] = True 650 | # reconstruct cannot see each other 651 | for i in range(groups): 652 | if i == 0: 653 | attn_mask[single_pad * i : single_pad * (i + 1), single_pad * (i + 1) : pad_size] = True 654 | if i == groups - 1: 655 | attn_mask[single_pad * i : single_pad * (i + 1), : single_pad * i] = True 656 | else: 657 | attn_mask[single_pad * i : single_pad * (i + 1), single_pad * (i + 1) : pad_size] = True 658 | attn_mask[single_pad * i : single_pad * (i + 1), : single_pad * i] = True 659 | 660 | mask_dict = { 661 | "known_indice": torch.as_tensor(known_indice).long(), 662 | "batch_idx": torch.as_tensor(batch_idx).long(), 663 | "map_known_indice": torch.as_tensor(map_known_indice).long(), 664 | "known_lbs_bboxes": (known_labels, known_bboxs), 665 | "known_labels_raw": known_labels_raw, 666 | "know_idx": know_idx, 667 | "pad_size": pad_size, 668 | } 669 | 670 | else: 671 | padded_reference_points = reference_points 672 | attn_mask = None 673 | mask_dict = None 674 | # padding_query_cat_encoding = query_cat_encoding 675 | 676 | return padded_reference_points, attn_mask, mask_dict 677 | 678 | @force_fp32(apply_to=('preds_dicts')) 679 | def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs): 680 | """"Loss function. 681 | Args: 682 | gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9) 683 | gt_labels_3d (list[Tensor]): Ground truth class indices. 
    @force_fp32(apply_to=('preds_dicts'))
    def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
        """Loss function.
        Args:
            gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
            gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
            preds_dicts (tuple[list[dict]]): nb_tasks x num_lvl
                center: (num_dec, batch_size, num_query, 2)
                height: (num_dec, batch_size, num_query, 1)
                dim: (num_dec, batch_size, num_query, 3)
                rot: (num_dec, batch_size, num_query, 2)
                vel: (num_dec, batch_size, num_query, 2)
                cls_logits: (num_dec, batch_size, num_query, task_classes)
        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        num_decoder = preds_dicts[0][0]['center'].shape[0]
        all_pred_bboxes, all_pred_logits = collections.defaultdict(list), collections.defaultdict(list)

        for task_id, preds_dict in enumerate(preds_dicts, 0):
            for dec_id in range(num_decoder):
                pred_bbox = torch.cat(
                    (preds_dict[0]['center'][dec_id], preds_dict[0]['height'][dec_id],
                     preds_dict[0]['dim'][dec_id], preds_dict[0]['rot'][dec_id],
                     preds_dict[0]['vel'][dec_id]),
                    dim=-1
                )
                all_pred_bboxes[dec_id].append(pred_bbox)
                all_pred_logits[dec_id].append(preds_dict[0]['cls_logits'][dec_id])
        all_pred_bboxes = [all_pred_bboxes[idx] for idx in range(num_decoder)]
        all_pred_logits = [all_pred_logits[idx] for idx in range(num_decoder)]

        loss_cls, loss_bbox = multi_apply(
            self.loss_single, all_pred_bboxes, all_pred_logits,
            [gt_bboxes_3d for _ in range(num_decoder)],
            [gt_labels_3d for _ in range(num_decoder)],
        )

        loss_dict = dict()
        loss_dict['loss_cls'] = loss_cls[-1]
        loss_dict['loss_bbox'] = loss_bbox[-1]

        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i in zip(loss_cls[:-1],
                                           loss_bbox[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            num_dec_layer += 1

        dn_pred_bboxes, dn_pred_logits = collections.defaultdict(list), collections.defaultdict(list)
        dn_mask_dicts = collections.defaultdict(list)
        for task_id, preds_dict in enumerate(preds_dicts, 0):
            for dec_id in range(num_decoder):
                pred_bbox = torch.cat(
                    (preds_dict[0]['dn_center'][dec_id], preds_dict[0]['dn_height'][dec_id],
                     preds_dict[0]['dn_dim'][dec_id], preds_dict[0]['dn_rot'][dec_id],
                     preds_dict[0]['dn_vel'][dec_id]),
                    dim=-1
                )
                dn_pred_bboxes[dec_id].append(pred_bbox)
                dn_pred_logits[dec_id].append(preds_dict[0]['dn_cls_logits'][dec_id])
                dn_mask_dicts[dec_id].append(preds_dict[0]['dn_mask_dict'])
        dn_pred_bboxes = [dn_pred_bboxes[idx] for idx in range(num_decoder)]
        dn_pred_logits = [dn_pred_logits[idx] for idx in range(num_decoder)]
        dn_mask_dicts = [dn_mask_dicts[idx] for idx in range(num_decoder)]
        dn_loss_cls, dn_loss_bbox = multi_apply(
            self.dn_loss_single, dn_pred_bboxes, dn_pred_logits, dn_mask_dicts
        )

        loss_dict['dn_loss_cls'] = dn_loss_cls[-1]
        loss_dict['dn_loss_bbox'] = dn_loss_bbox[-1]
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i in zip(dn_loss_cls[:-1],
                                           dn_loss_bbox[:-1]):
            loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
            num_dec_layer += 1

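        # Auxiliary heatmap supervision: the sparse heatmap predicted on the voxel
        # features (taken from `preds_dict`, i.e. the last task's prediction dict of
        # the loop above) is matched against per-sample gaussian targets built by
        # sparse_hp_target_single() below.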
        sparse_hm_voxel = preds_dict[0]['sparse_heatmap']
        spatial_shape, batch_index, voxel_indices, spatial_indices, num_voxels = self._get_voxel_infos(sparse_hm_voxel)
        voxel_hp_target = multi_apply(
            self.sparse_hp_target_single,
            gt_bboxes_3d,
            gt_labels_3d,
            num_voxels,
            spatial_indices,
        )
        # voxel_hp_target = self.sparse_hp_target_single(sparse_hm_voxel, gt_bboxes_3d,gt_labels_3d)
        # TODO: Fix bugs for hp target (incorrect when batch_size != 1)
        hp_target = [t.permute(1, 0) for t in voxel_hp_target[0]]
        hp_target = torch.cat(hp_target, dim=0)
        pred_hm = sparse_hm_voxel.features.clone()
        loss_heatmap = self.loss_heatmap(clip_sigmoid(pred_hm), hp_target, avg_factor=max(hp_target.eq(1).float().sum().item(), 1))
        # heatmap_target = torch.cat(hp_target, dim=0)
        # loss_heatmap = self.loss_heatmap(clip_sigmoid(preds_dict[0]['dense_heatmap']), heatmap_target, avg_factor=max(heatmap_target.eq(1).float().sum().item(), 1))
        loss_dict['loss_heatmap'] = loss_heatmap
        return loss_dict


    def sparse_hp_target_single(self, gt_bboxes_3d, gt_labels_3d, num_voxels, spatial_indices):
        num_max_objs = 500
        gaussian_overlap = 0.1
        min_radius = 2
        device = gt_labels_3d.device
        gt_bboxes_3d = torch.cat([gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]], dim=1).to(device)
        grid_size = torch.tensor(self.train_cfg['grid_size'])
        pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
        voxel_size = torch.tensor(self.train_cfg['voxel_size'])
        feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']  # [x_len, y_len]
        # heatmap = gt_bboxes_3d.new_zeros((self.num_classes[0], feature_map_size[1], feature_map_size[0]))


        inds = gt_bboxes_3d.new_zeros(num_max_objs).long()
        mask = gt_bboxes_3d.new_zeros(num_max_objs).long()
        heatmap = gt_bboxes_3d.new_zeros(sum(self.num_classes), num_voxels)
        x, y, z = gt_bboxes_3d[:, 0], gt_bboxes_3d[:, 1], gt_bboxes_3d[:, 2]

        coord_x = (x - self.pc_range[0]) / voxel_size[0] / self.downsample_scale
        coord_y = (y - self.pc_range[1]) / voxel_size[1] / self.downsample_scale

        spatial_shape = [self.test_cfg['grid_size'][0] / self.downsample_scale, self.test_cfg['grid_size'][1] / self.downsample_scale]
        coord_x = torch.clamp(coord_x, min=0, max=spatial_shape[1] - 0.5)  # bugfixed: 1e-6 does not work for center.int()
        coord_y = torch.clamp(coord_y, min=0, max=spatial_shape[0] - 0.5)

        center = torch.cat((coord_y[:, None], coord_x[:, None]), dim=-1)
        center_int = center.int()
        center_int_float = center_int.float()

        dx, dy, dz = gt_bboxes_3d[:, 3], gt_bboxes_3d[:, 4], gt_bboxes_3d[:, 5]
        dx = dx / voxel_size[0] / self.downsample_scale
        dy = dy / voxel_size[1] / self.downsample_scale

        radius = self.gaussian_radius(dx, dy, min_overlap=gaussian_overlap)
        radius = torch.clamp_min(radius.int(), min=min_radius)

        for k in range(min(num_max_objs, gt_bboxes_3d.shape[0])):
            if dx[k] <= 0 or dy[k] <= 0:
                continue

            if not (0 <= center_int[k][0] <= spatial_shape[1] and 0 <= center_int[k][1] <= spatial_shape[0]):
                continue

            cur_class_id = (gt_labels_3d[k]).long()
            distance = self.distance(spatial_indices, center[k])
            inds[k] = distance.argmin()
            mask[k] = 1

            # gt_center
            self.draw_gaussian_to_heatmap_voxels(heatmap[cur_class_id], distance, radius[k].item() * 1)

            # nearest
            self.draw_gaussian_to_heatmap_voxels(heatmap[cur_class_id], self.distance(spatial_indices, spatial_indices[inds[k]]), radius[k].item() * 1)

        return [heatmap]

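    # The helpers below splat class-wise gaussians onto the sparse voxel set. `distances`
    # already holds squared euclidean distances in BEV voxel units, so each voxel gets a
    # weight of exp(-d^2 / (2 * sigma^2)) with sigma = (2 * radius + 1) / 6. Illustrative
    # numbers only: radius = 2 gives sigma = 5/6, and a voxel with squared distance 4
    # receives exp(-4 / (2 * (5/6)**2)) ≈ 0.056.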
    def draw_gaussian_to_heatmap_voxels(self, heatmap, distances, radius, k=1):

        diameter = 2 * radius + 1
        sigma = diameter / 6
        masked_gaussian = torch.exp(- distances / (2 * sigma * sigma))

        torch.max(heatmap, masked_gaussian, out=heatmap)

        return heatmap

    def distance(self, voxel_indices, center):
        distances = ((voxel_indices - center.unsqueeze(0)) ** 2).sum(-1)
        return distances


    def _get_voxel_infos(self, x):
        spatial_shape = x.spatial_shape
        voxel_indices = x.indices
        spatial_indices = []
        num_voxels = []
        batch_size = x.batch_size
        batch_index = voxel_indices[:, 0]

        for bs_idx in range(batch_size):
            batch_inds = batch_index == bs_idx
            spatial_indices.append(voxel_indices[batch_inds][:, [1, 2]])  # y, x
            num_voxels.append(batch_inds.sum())

        return spatial_shape, batch_index, voxel_indices, spatial_indices, num_voxels


    def gaussian_radius(self, height, width, min_overlap=0.5):
        """
        Args:
            height: (N)
            width: (N)
            min_overlap: (float) minimum required IoU overlap.
        Returns:
            radius: (N)
        """
        a1 = 1
        b1 = (height + width)
        c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
        sq1 = (b1 ** 2 - 4 * a1 * c1).sqrt()
        r1 = (b1 + sq1) / 2

        a2 = 4
        b2 = 2 * (height + width)
        c2 = (1 - min_overlap) * width * height
        sq2 = (b2 ** 2 - 4 * a2 * c2).sqrt()
        r2 = (b2 + sq2) / 2

        a3 = 4 * min_overlap
        b3 = -2 * min_overlap * (height + width)
        c3 = (min_overlap - 1) * width * height
        sq3 = (b3 ** 2 - 4 * a3 * c3).sqrt()
        r3 = (b3 + sq3) / 2

        ret = torch.min(torch.min(r1, r2), r3)
        return ret

    def query_embed(self, ref_points, img_metas):
        ref_points = inverse_sigmoid(ref_points.clone()).sigmoid()
        bev_embeds = self._bev_query_embed(ref_points, img_metas)
        return bev_embeds


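    # Matching targets are built per task group: ground-truth boxes are first split
    # according to self.class_names into task-local label ranges, and each task then
    # runs its own assignment via self.assigner on that task's predictions.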
    def _get_targets_single(self, gt_bboxes_3d, gt_labels_3d, pred_bboxes, pred_logits):
        """Compute regression and classification targets for one image.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            gt_bboxes_3d (LiDARInstance3DBoxes): Ground truth boxes (num_gts, 9)
            gt_labels_3d (Tensor): Ground truth class indices (num_gts, )
            pred_bboxes (list[Tensor]): num_tasks x (num_query, 10)
            pred_logits (list[Tensor]): num_tasks x (num_query, task_classes)
        Returns:
            tuple[Tensor]: a tuple containing the following.
                - labels_tasks (list[Tensor]): num_tasks x (num_query, ).
                - label_weights_tasks (list[Tensor]): num_tasks x (num_query, ).
                - bbox_targets_tasks (list[Tensor]): num_tasks x (num_query, 9).
                - bbox_weights_tasks (list[Tensor]): num_tasks x (num_query, 10).
                - pos_inds (list[Tensor]): num_tasks x Sampled positive indices.
                - neg_inds (Tensor): num_tasks x Sampled negative indices.
        """
        device = gt_labels_3d.device
        gt_bboxes_3d = torch.cat(
            (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), dim=1
        ).to(device)

        task_masks = []
        flag = 0
        for class_name in self.class_names:
            task_masks.append([
                torch.where(gt_labels_3d == class_name.index(i) + flag)
                for i in class_name
            ])
            flag += len(class_name)

        task_boxes = []
        task_classes = []
        flag2 = 0
        for idx, mask in enumerate(task_masks):
            task_box = []
            task_class = []
            for m in mask:
                task_box.append(gt_bboxes_3d[m])
                task_class.append(gt_labels_3d[m] - flag2)
            task_boxes.append(torch.cat(task_box, dim=0).to(device))
            task_classes.append(torch.cat(task_class).long().to(device))
            flag2 += len(mask)

        def task_assign(bbox_pred, logits_pred, gt_bboxes, gt_labels, num_classes):
            num_bboxes = bbox_pred.shape[0]
            assign_results = self.assigner.assign(bbox_pred, logits_pred, gt_bboxes, gt_labels)
            sampling_result = self.sampler.sample(assign_results, bbox_pred, gt_bboxes)
            pos_inds, neg_inds = sampling_result.pos_inds, sampling_result.neg_inds
            # label targets
            labels = gt_bboxes.new_full((num_bboxes, ),
                                        num_classes,
                                        dtype=torch.long)
            labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
            label_weights = gt_bboxes.new_ones(num_bboxes)
            # bbox_targets
            code_size = gt_bboxes.shape[1]
            bbox_targets = torch.zeros_like(bbox_pred)[..., :code_size]
            bbox_weights = torch.zeros_like(bbox_pred)
            bbox_weights[pos_inds] = 1.0

            if len(sampling_result.pos_gt_bboxes) > 0:
                bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
            return labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds

        labels_tasks, labels_weights_tasks, bbox_targets_tasks, bbox_weights_tasks, pos_inds_tasks, neg_inds_tasks \
            = multi_apply(task_assign, pred_bboxes, pred_logits, task_boxes, task_classes, self.num_classes)

        return labels_tasks, labels_weights_tasks, bbox_targets_tasks, bbox_weights_tasks, pos_inds_tasks, neg_inds_tasks

    def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_bboxes, preds_logits):
        """Compute regression and classification targets for a batch of images.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
            gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
            preds_bboxes (list[list[Tensor]]): batch_size x num_task x [num_query, 10].
            preds_logits (list[list[Tensor]]): batch_size x num_task x [num_query, task_classes]
        Returns:
            tuple: a tuple containing the following targets.
                - task_labels_list (list[list[Tensor]]): num_tasks x batch_size x (num_query, ).
                - task_labels_weight_list (list[Tensor]): num_tasks x batch_size x (num_query, )
                - task_bbox_targets_list (list[Tensor]): num_tasks x batch_size x (num_query, 9)
                - task_bbox_weights_list (list[Tensor]): num_tasks x batch_size x (num_query, 10)
                - num_total_pos_tasks (list[int]): num_tasks x Number of positive samples
                - num_total_neg_tasks (list[int]): num_tasks x Number of negative samples.
        """
        (labels_list, labels_weight_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
            self._get_targets_single, gt_bboxes_3d, gt_labels_3d, preds_bboxes, preds_logits
        )
        task_num = len(labels_list[0])
        num_total_pos_tasks, num_total_neg_tasks = [], []
        task_labels_list, task_labels_weight_list, task_bbox_targets_list, \
            task_bbox_weights_list = [], [], [], []

        for task_id in range(task_num):
            num_total_pos_task = sum((inds[task_id].numel() for inds in pos_inds_list))
            num_total_neg_task = sum((inds[task_id].numel() for inds in neg_inds_list))
            num_total_pos_tasks.append(num_total_pos_task)
            num_total_neg_tasks.append(num_total_neg_task)
            task_labels_list.append([labels_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
            task_labels_weight_list.append([labels_weight_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
            task_bbox_targets_list.append([bbox_targets_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
            task_bbox_weights_list.append([bbox_weights_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])

        return (task_labels_list, task_labels_weight_list, task_bbox_targets_list,
                task_bbox_weights_list, num_total_pos_tasks, num_total_neg_tasks)

    def _loss_single_task(self,
                          pred_bboxes,
                          pred_logits,
                          labels_list,
                          labels_weights_list,
                          bbox_targets_list,
                          bbox_weights_list,
                          num_total_pos,
                          num_total_neg):
        """Compute loss for a single task.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            pred_bboxes (Tensor): (batch_size, num_query, 10)
            pred_logits (Tensor): (batch_size, num_query, task_classes)
            labels_list (list[Tensor]): batch_size x (num_query, )
            labels_weights_list (list[Tensor]): batch_size x (num_query, )
            bbox_targets_list (list[Tensor]): batch_size x (num_query, 9)
            bbox_weights_list (list[Tensor]): batch_size x (num_query, 10)
            num_total_pos: int
            num_total_neg: int
        Returns:
            loss_cls
            loss_bbox
        """
        labels = torch.cat(labels_list, dim=0)
        labels_weights = torch.cat(labels_weights_list, dim=0)
        bbox_targets = torch.cat(bbox_targets_list, dim=0)
        bbox_weights = torch.cat(bbox_weights_list, dim=0)

        pred_bboxes_flatten = pred_bboxes.flatten(0, 1)
        pred_logits_flatten = pred_logits.flatten(0, 1)

        cls_avg_factor = num_total_pos * 1.0 + num_total_neg * 0.1
        cls_avg_factor = max(cls_avg_factor, 1)
        loss_cls = self.loss_cls(
            pred_logits_flatten, labels, labels_weights, avg_factor=cls_avg_factor
        )

        normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
        bbox_weights = bbox_weights * bbox_weights.new_tensor(self.train_cfg.code_weights)[None, :]

        loss_bbox = self.loss_bbox(
            pred_bboxes_flatten[isnotnan, :10],
            normalized_bbox_targets[isnotnan, :10],
            bbox_weights[isnotnan, :10],
            avg_factor=num_total_pos
        )

        loss_cls = torch.nan_to_num(loss_cls)
        loss_bbox = torch.nan_to_num(loss_bbox)
        return loss_cls, loss_bbox

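    # loss_single() below regroups the task-wise predictions sample by sample, builds
    # targets with get_targets(), and sums the per-task losses from _loss_single_task().
    # The classification avg_factor above is num_total_pos + 0.1 * num_total_neg
    # (e.g. 20 positives and 1000 negatives give an avg_factor of 120), mirroring
    # DETR-style heads.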
    def loss_single(self,
                    pred_bboxes,
                    pred_logits,
                    gt_bboxes_3d,
                    gt_labels_3d):
        """Loss function for outputs from a single decoder layer of a single
        feature level.
        Args:
            pred_bboxes (list[Tensor]): num_tasks x [bs, num_query, 10].
            pred_logits (list[Tensor]): num_tasks x [bs, num_query, task_classes]
            gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
            gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs from
                a single decoder layer.
        """
        batch_size = pred_bboxes[0].shape[0]
        pred_bboxes_list, pred_logits_list = [], []
        for idx in range(batch_size):
            pred_bboxes_list.append([task_pred_bbox[idx] for task_pred_bbox in pred_bboxes])
            pred_logits_list.append([task_pred_logits[idx] for task_pred_logits in pred_logits])
        cls_reg_targets = self.get_targets(
            gt_bboxes_3d, gt_labels_3d, pred_bboxes_list, pred_logits_list
        )
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        loss_cls_tasks, loss_bbox_tasks = multi_apply(
            self._loss_single_task,
            pred_bboxes,
            pred_logits,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            bbox_weights_list,
            num_total_pos,
            num_total_neg
        )

        return sum(loss_cls_tasks), sum(loss_bbox_tasks)

    def _dn_loss_single_task(self,
                             pred_bboxes,
                             pred_logits,
                             mask_dict):
        known_labels, known_bboxs = mask_dict['known_lbs_bboxes']
        map_known_indice = mask_dict['map_known_indice'].long()
        known_indice = mask_dict['known_indice'].long()
        batch_idx = mask_dict['batch_idx'].long()
        bid = batch_idx[known_indice]
        known_labels_raw = mask_dict['known_labels_raw']

        pred_logits = pred_logits[(bid, map_known_indice)]
        pred_bboxes = pred_bboxes[(bid, map_known_indice)]
        num_tgt = known_indice.numel()

        # filter task bbox
        task_mask = known_labels_raw != pred_logits.shape[-1]
        task_mask_sum = task_mask.sum()

        if task_mask_sum > 0:
            # pred_logits = pred_logits[task_mask]
            # known_labels = known_labels[task_mask]
            pred_bboxes = pred_bboxes[task_mask]
            known_bboxs = known_bboxs[task_mask]

        # classification loss
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_tgt * 3.14159 / 6 * self.split * self.split * self.split
        # if self.sync_cls_avg_factor:
        #     cls_avg_factor = reduce_mean(
        #         pred_logits.new_tensor([cls_avg_factor]))

        label_weights = torch.ones_like(known_labels)
        cls_avg_factor = max(cls_avg_factor, 1)
        loss_cls = self.loss_cls(
            pred_logits, known_labels.long(), label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_tgt = loss_cls.new_tensor([num_tgt])
        num_tgt = torch.clamp(reduce_mean(num_tgt), min=1).item()

        # regression L1 loss
        normalized_bbox_targets = normalize_bbox(known_bboxs, self.pc_range)
        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
        bbox_weights = torch.ones_like(pred_bboxes)
        bbox_weights = bbox_weights * bbox_weights.new_tensor(self.train_cfg.code_weights)[None, :]
        # bbox_weights[:, 6:8] = 0
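        # As in _loss_single_task, the regression term is weighted by
        # train_cfg.code_weights and normalized by the GPU-averaged number of
        # denoising targets (num_tgt).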
        loss_bbox = self.loss_bbox(
            pred_bboxes[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=num_tgt)

        loss_cls = torch.nan_to_num(loss_cls)
        loss_bbox = torch.nan_to_num(loss_bbox)

        if task_mask_sum == 0:
            # loss_cls = loss_cls * 0.0
            loss_bbox = loss_bbox * 0.0

        return self.dn_weight * loss_cls, self.dn_weight * loss_bbox

    def dn_loss_single(self,
                       pred_bboxes,
                       pred_logits,
                       dn_mask_dict):
        loss_cls_tasks, loss_bbox_tasks = multi_apply(
            self._dn_loss_single_task, pred_bboxes, pred_logits, dn_mask_dict
        )
        return sum(loss_cls_tasks), sum(loss_bbox_tasks)

    @force_fp32(apply_to=('preds_dicts'))
    def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False):
        preds_dicts = self.bbox_coder.decode(preds_dicts)
        num_samples = len(preds_dicts)

        ret_list = []
        for i in range(num_samples):
            preds = preds_dicts[i]
            bboxes = preds['bboxes']
            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
            bboxes = img_metas[i]['box_type_3d'](bboxes, bboxes.size(-1))
            scores = preds['scores']
            labels = preds['labels']
            ret_list.append([bboxes, scores, labels])
        return ret_list
--------------------------------------------------------------------------------