├── tools ├── data_converter │ ├── __init__.py │ └── create_gt_database.py ├── create_data.sh ├── dist_train.sh ├── dist_test.sh ├── test_speed.py ├── visual_utils │ ├── open3d_vis_utils.py │ └── visualize_utils.py ├── create_data.py ├── test.py └── train.py ├── projects ├── mmdet3d_plugin │ ├── core │ │ ├── __init__.py │ │ └── bbox │ │ │ ├── coders │ │ │ ├── __init__.py │ │ │ └── multi_task_bbox_coder.py │ │ │ ├── assigners │ │ │ ├── __init__.py │ │ │ └── hungarian_assigner_3d.py │ │ │ ├── match_costs │ │ │ ├── __init__.py │ │ │ └── match_cost.py │ │ │ └── util.py │ ├── mmcv_custom │ │ ├── ops │ │ │ ├── __init__.py │ │ │ └── voxel │ │ │ │ ├── __init__.py │ │ │ │ └── spconv_voxelize.py │ │ ├── runner │ │ │ ├── __init__.py │ │ │ └── hooks │ │ │ │ ├── __init__.py │ │ │ │ ├── optimizer.py │ │ │ │ ├── freeze_weight.py │ │ │ │ └── drop_gt_sampling.py │ │ └── __init__.py │ ├── models │ │ ├── necks │ │ │ ├── __init__.py │ │ │ └── cp_fpn.py │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ └── vovnet.py │ │ ├── __init__.py │ │ ├── detectors │ │ │ ├── __init__.py │ │ │ ├── meformer.py │ │ │ └── mome.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── grid_mask.py │ │ │ ├── moad_transformer.py │ │ │ ├── pme_transformer.py │ │ │ └── attention.py │ │ └── dense_heads │ │ │ ├── __init__.py │ │ │ └── separate_task_head.py │ ├── datasets │ │ ├── pipelines │ │ │ ├── __init__.py │ │ │ └── dbsampler.py │ │ ├── __init__.py │ │ └── custom_nuscenes_dataset.py │ └── __init__.py └── configs │ ├── meformer_voxel0075_vov_1600x640_cbgs.py │ └── mome │ └── mome.py ├── assets └── ov.png ├── requirements.txt ├── docs ├── prepare_dataset.md ├── train_eval.md └── install.md ├── LICENSE ├── .gitignore └── README.md /tools/data_converter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/ov.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/konyul/MoME/HEAD/assets/ov.png -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .voxel import * 2 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/__init__.py: -------------------------------------------------------------------------------- 1 | from .hooks import * 2 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops import * 2 | from .runner import * 3 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .cp_fpn import CPFPN 2 | 3 | __all__ = ['CPFPN'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/voxel/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.spconv_voxelize import SPConvVoxelization 2 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .vovnet import VoVNet 2 | 3 | __all__ = ['VoVNet', ] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .dbsampler import UnifiedDataBaseSampler 2 | from .transform_3d import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .custom_nuscenes_dataset import CustomNuScenesDataset 2 | from .pipelines import * 3 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/__init__.py: -------------------------------------------------------------------------------- 1 | from .multi_task_bbox_coder import MultiTaskBBoxCoder 2 | 3 | __all__ = ['MultiTaskBBoxCoder'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/__init__.py: -------------------------------------------------------------------------------- 1 | from .hungarian_assigner_3d import HungarianAssigner3D 2 | 3 | __all__ = ['HungarianAssigner3D'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * 2 | from .dense_heads import * 3 | from .detectors import * 4 | from .necks import * 5 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .meformer import MEFormerDetector 2 | from .mome import MoME 3 | 4 | __all__ = ['MEFormerDetector', 'MoME'] 5 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .moad_transformer import * 2 | from .petr_transformer import * 3 | from .pme_transformer import * 4 | from .multi_expert import * -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mmdet==2.24.0 2 | mmsegmentation==0.29.1 3 | mmdet3d==1.0.0rc5 4 | spconv-cu111 5 | flash-attn==0.2.2 6 | numpy==1.23.5 7 | setuptools==59.5.0 8 | yapf==0.40.1 9 | -------------------------------------------------------------------------------- /docs/prepare_dataset.md: -------------------------------------------------------------------------------- 1 | #### Prepare data 2 | Run [create_data.sh](https://github.com/hanchaa/MEFormer/blob/main/tools/create_data.sh) script. 
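The script exports `PYTHONPATH` and calls the nuScenes converter (see `tools/create_data.sh`); with the default `./data/nuscenes` layout, the equivalent direct call is:
```shell
python tools/create_data.py nuscenes --root-path ./data/nuscenes/ --out-dir ./data/nuscenes --extra-tag nuscenes
```
or simply run the script: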
3 | ```shell 4 | bash tools/create_data.sh 5 | ``` 6 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | from .drop_gt_sampling import DropGTSamplingHook 2 | from .freeze_weight import FreezeWeight 3 | from .optimizer import CustomFp16OptimizerHook 4 | -------------------------------------------------------------------------------- /tools/create_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH=`pwd`:$PYTHONPATH 5 | 6 | python tools/create_data.py nuscenes --root-path ./data/nuscenes/ --out-dir ./data/nuscenes --extra-tag nuscenes 7 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from .core.bbox.assigners import * 2 | from .core.bbox.coders import * 3 | from .core.bbox.match_costs import BBox3DL1Cost 4 | from .datasets import * 5 | from .mmcv_custom import * 6 | from .models import * 7 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .meformer_head import MEFormerHead 2 | from .separate_task_head import SeparateTaskHead 3 | from .med import MultiExpertDecoding 4 | 5 | __all__ = ['SeparateTaskHead', 'MEFormerHead', 'MultiExpertDecoding'] 6 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py: -------------------------------------------------------------------------------- 1 | from mmdet.core.bbox.match_costs import build_match_cost 2 | 3 | from .match_cost import BBox3DL1Cost, BBoxBEVL1Cost, IoU3DCost 4 | 5 | __all__ = ['build_match_cost', 'BBox3DL1Cost', 'BBoxBEVL1Cost', 'IoU3DCost'] 6 | -------------------------------------------------------------------------------- /docs/train_eval.md: -------------------------------------------------------------------------------- 1 | ## Train & Inference 2 | #### Train 3 | 4 | Train 1st Stage 5 | 6 | ```shell 7 | tools/dist_train.sh ./projects/configs/moad_voxel0075_vov_1600x640_cbgs.py 8 8 | ``` 9 | 10 | Train 2nd Stage 11 | ```shell 12 | tools/dist_train.sh ./projects/configs/mome/mome.py 4 13 | ``` 14 | 15 | #### Inference 16 | ```shell 17 | tools/dist_test.sh ./projects/configs/mome/mome.py $path_to_weight$ $num_gpus --eval bbox 18 | ``` 19 | -------------------------------------------------------------------------------- /tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /tools/dist_test.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29500} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/optimizer.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner.hooks import HOOKS 2 | from mmcv.runner.hooks.optimizer import Fp16OptimizerHook 3 | 4 | 5 | @HOOKS.register_module() 6 | class CustomFp16OptimizerHook(Fp16OptimizerHook): 7 | 8 | def __init__(self, 9 | custom_fp16={}, 10 | *args, 11 | **kwargs): 12 | super(CustomFp16OptimizerHook, self).__init__(*args, **kwargs) 13 | self.custom_fp16 = custom_fp16 14 | 15 | def before_run(self, runner) -> None: 16 | super().before_run(runner) 17 | for module_name, v in self.custom_fp16.items(): 18 | runner.model.module._modules[module_name].fp16_enabled = v 19 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Step-by-step installation instructions 2 | 3 | Use MoME with Docker 4 | 5 | **a. We provide Docker Image.** 6 | ```shell 7 | docker pull kyparkk/mome:python3.8_torch1.11.0_cu113 8 | docker run --gpus all --shm-size=512g -it -v {DATA_DIR}:{DATA_DIR} kyparkk/mome:python3.8_torch1.11.0_cu113 /bin/bash 9 | ``` 10 | 11 | **b. Clone MoME.** 12 | ``` 13 | git clone https://github.com/konyul/MoME.git 14 | ``` 15 | 16 | **c. Install requirements** 17 | ```shell 18 | cd /path/to/MoME 19 | pip install -r requirements.txt 20 | 21 | ``` 22 | 23 | **c. 
Download pre-trained weights** 24 | Download the pretrained weight of the image backbone from https://github.com/hanchaa/MEFormer 25 | ```shell 26 | MoME 27 | ├─ ckpts 28 | │ ├─ fcos3d_vovnet_imgbackbone-remapped.pth 29 | │ └─ nuim_r50.pth 30 | ├─ figures 31 | ├─ projects 32 | └─ tools 33 | 34 | ``` -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/freeze_weight.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner.hooks import HOOKS, Hook 2 | 3 | 4 | @HOOKS.register_module() 5 | class FreezeWeight(Hook): 6 | def __init__(self, finetune_weight): 7 | super().__init__() 8 | self.finetune_weight = finetune_weight 9 | 10 | def before_run(self, runner): 11 | if hasattr(runner.model, "module"): 12 | model = runner.model.module 13 | else: 14 | model = runner.model 15 | 16 | freezed = [] 17 | not_freezed = [] 18 | for name, p in model.named_parameters(): 19 | flag = False 20 | for f in self.finetune_weight: 21 | if name.startswith(f) and p.requires_grad: 22 | flag = True 23 | not_freezed.append(name) 24 | 25 | if not flag: 26 | p.requires_grad = False 27 | freezed.append(name) 28 | 29 | runner.logger.info(f"Freezed parameters: {', '.join(freezed)}") 30 | runner.logger.info(f"Learned parameters: {', '.join(not_freezed)}") 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2024 JuHan Cha 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/drop_gt_sampling.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner.hooks import HOOKS, Hook 2 | 3 | 4 | @HOOKS.register_module() 5 | class DropGTSamplingHook(Hook): 6 | 7 | def __init__(self, 8 | drop_epoch, 9 | pipeline_name="UnifiedObjectSample", 10 | *args, 11 | **kwargs): 12 | super(DropGTSamplingHook, self).__init__(*args, **kwargs) 13 | self.drop_epoch = drop_epoch 14 | self.pipeline_name = pipeline_name 15 | self.dropped = False 16 | 17 | def before_train_epoch(self, runner) -> None: 18 | if not self.dropped and runner.epoch >= self.drop_epoch: 19 | dataset = runner.data_loader.dataset.dataset 20 | if hasattr(dataset, 'datasets'): 21 | datasets = dataset.datasets 22 | else: 23 | datasets = [dataset] 24 | 25 | for d in datasets: 26 | pipeline = d.pipeline.transforms 27 | index = 0 28 | dropped = False 29 | 30 | for i, p in enumerate(pipeline): 31 | if p.__class__.__name__ == self.pipeline_name: 32 | index = i 33 | dropped = True 34 | runner.logger.info(f"{self.pipeline_name} is dropped after {self.drop_epoch} epoch training!") 35 | break 36 | 37 | if dropped: 38 | pipeline.pop(index) 39 | self.dropped = dropped 40 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 3 | 4 | 5 | @MATCH_COST.register_module() 6 | class BBox3DL1Cost(object): 7 | """BBox3DL1Cost. 8 | Args: 9 | weight (int | float, optional): loss_weight 10 | """ 11 | 12 | def __init__(self, weight=1.): 13 | self.weight = weight 14 | 15 | def __call__(self, bbox_pred, gt_bboxes): 16 | """ 17 | Args: 18 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 19 | (cx, cy, w, h), which are all in range [0, 1]. Shape 20 | [num_query, 4]. 21 | gt_bboxes (Tensor): Ground truth boxes with normalized 22 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 
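Note: in this plugin the cost is usually computed on 3D box encodings produced by `normalize_bbox` (see core/bbox/util.py), so the trailing dimension of both tensors is typically larger than 4; `torch.cdist` below is agnostic to that dimension.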
23 | Returns: 24 | torch.Tensor: bbox_cost value with weight 25 | """ 26 | bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) 27 | return bbox_cost * self.weight 28 | 29 | 30 | @MATCH_COST.register_module() 31 | class BBoxBEVL1Cost(object): 32 | def __init__(self, weight): 33 | self.weight = weight 34 | 35 | def __call__(self, bboxes, gt_bboxes, pc_range): 36 | pc_start = bboxes.new(pc_range[0:2]) 37 | pc_range = bboxes.new(pc_range[3:5]) - bboxes.new(pc_range[0:2]) 38 | # normalize the box center to [0, 1] 39 | normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range 40 | normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range 41 | reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1) 42 | return reg_cost * self.weight 43 | 44 | 45 | @MATCH_COST.register_module() 46 | class IoU3DCost(object): 47 | def __init__(self, weight): 48 | self.weight = weight 49 | 50 | def __call__(self, iou): 51 | iou_cost = - iou 52 | return iou_cost * self.weight 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.ipynb 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | tmp/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | mmdetection3d/ 31 | mmdetection3d 32 | mmdet3d 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | hostfile.txt 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | 113 | # cython generated cpp 114 | data 115 | ckpts 116 | .vscode 117 | .idea 118 | 119 | # custom 120 | nuscenes_gt_database 121 | nuscenes_unified_gt_database 122 | work_dirs 123 | *.pkl 124 | *.pkl.json 125 | *.log.json 126 | work_dirs/ 127 | exps/ 128 | *~ 129 | mmdet3d/.mim 130 | 131 | # Pytorch 132 | *.pth 133 | 134 | # demo 135 | # *.jpg 136 | # *.png 137 | data/s3dis/Stanford3dDataset_v1.2_Aligned_Version/ 138 | data/scannet/scans/ 139 | data/sunrgbd/OFFICIAL_SUNRGBD/ 140 | *.obj 141 | *.ply 142 | 143 | # Waymo evaluation 144 | mmdet3d/core/evaluation/waymo_utils/compute_detection_metrics_main 145 | 146 | .DS_Store 147 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def normalize_bbox(bboxes, pc_range=None): 5 | cx = bboxes[..., 0:1] 6 | cy = bboxes[..., 1:2] 7 | cz = bboxes[..., 2:3] 8 | w = bboxes[..., 3:4].log() 9 | l = bboxes[..., 4:5].log() 10 | h = bboxes[..., 5:6].log() 11 | 12 | rot = bboxes[..., 6:7] 13 | if bboxes.size(-1) > 7: 14 | vx = bboxes[..., 7:8] 15 | vy = bboxes[..., 8:9] 16 | normalized_bboxes = torch.cat( 17 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 18 | ) 19 | else: 20 | normalized_bboxes = torch.cat( 21 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 22 | ) 23 | return normalized_bboxes 24 | 25 | 26 | def denormalize_bbox(normalized_bboxes, pc_range=None): 27 | # rotation 28 | rot_sine = normalized_bboxes[..., 6:7] 29 | 30 | rot_cosine = normalized_bboxes[..., 7:8] 31 | rot = torch.atan2(rot_sine, rot_cosine) 32 | 33 | # center in the bev 34 | cx = normalized_bboxes[..., 0:1] 35 | cy = normalized_bboxes[..., 1:2] 36 | cz = normalized_bboxes[..., 4:5] 37 | 38 | # size 39 | w = normalized_bboxes[..., 2:3] 40 | l = normalized_bboxes[..., 3:4] 41 | h = normalized_bboxes[..., 5:6] 42 | 43 | w = w.exp() 44 | l = l.exp() 45 | h = h.exp() 46 | 47 | if normalized_bboxes.size(-1) > 8: 48 | # velocity 49 | vx = normalized_bboxes[..., 8:9] 50 | vy = normalized_bboxes[..., 9:10] 51 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) 52 | else: 53 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) 54 | return denormalized_bboxes 55 | 56 | 57 | def bbox3d_mapping_back(bboxes, 
rot_degree, scale_factor, flip_horizontal, flip_vertical): 58 | """Map bboxes from testing scale to original image scale. 59 | 60 | Args: 61 | bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. 62 | scale_factor (float): Scale factor. 63 | flip_horizontal (bool): Whether to flip horizontally. 64 | flip_vertical (bool): Whether to flip vertically. 65 | 66 | Returns: 67 | :obj:`BaseInstance3DBoxes`: Boxes mapped back. 68 | """ 69 | new_bboxes = bboxes.clone() 70 | if flip_horizontal: 71 | new_bboxes.flip('horizontal') 72 | if flip_vertical: 73 | new_bboxes.flip('vertical') 74 | new_bboxes.scale(1 / scale_factor) 75 | new_bboxes.rotate(-rot_degree) 76 | 77 | return new_bboxes 78 | -------------------------------------------------------------------------------- /tools/test_speed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import importlib 4 | import os 5 | import time 6 | 7 | import torch 8 | from mmcv import Config 9 | from mmcv.parallel import MMDataParallel 10 | from mmcv.runner import load_checkpoint, wrap_fp16_model 11 | from mmdet3d.datasets import build_dataloader, build_dataset 12 | from mmdet3d.models import build_detector 13 | 14 | 15 | class Wrapper: 16 | 17 | def __init__(self, 18 | cfg, 19 | checkpoint=None) -> None: 20 | self.cfg = Config.fromfile(cfg) 21 | self.save_dir = './tmp' 22 | self.init() 23 | self.model = self._build_model(checkpoint) 24 | 25 | if self.cfg.get('fp16', None) is not None: 26 | wrap_fp16_model(self.model) 27 | if cfg.get('optimizer_config', None) is not None and cfg.optimizer_config['type'] == 'CustomFp16OptimizerHook': 28 | wrap_fp16_model(self.model) 29 | for module_name, v in cfg.optimizer_config['custom_fp16'].items(): 30 | self.model._modules[module_name].fp16_enabled = v 31 | 32 | self.dataset = self._build_dataset() 33 | 34 | def init(self): 35 | self.cfg.model.pretrained = None 36 | self.cfg.data.test.test_mode = True 37 | plugin_dir = self.cfg.plugin_dir 38 | _module_dir = os.path.dirname(plugin_dir) 39 | _module_dir = _module_dir.split('/') 40 | _module_path = _module_dir[0] 41 | for m in _module_dir[1:]: 42 | _module_path = _module_path + '.' 
+ m 43 | print(_module_path) 44 | plg_lib = importlib.import_module(_module_path) 45 | 46 | def _build_model(self, checkpoint=None): 47 | model = build_detector(self.cfg.model, test_cfg=self.cfg.get('test_cfg')) 48 | if checkpoint: 49 | load_checkpoint(model, checkpoint, map_location='cpu') 50 | model = MMDataParallel(model, device_ids=[0]) 51 | model.eval() 52 | return model 53 | 54 | def _build_dataset(self): 55 | dataset = build_dataset(self.cfg.data.val) 56 | return dataset 57 | 58 | def test_speed(self, num_iters=100): 59 | data_loader = build_dataloader( 60 | self.dataset, 61 | samples_per_gpu=1, 62 | workers_per_gpu=self.cfg.data.workers_per_gpu, 63 | dist=False, 64 | shuffle=False) 65 | loader = iter(data_loader) 66 | total_time = 0 67 | warmup_iter = 10 68 | 69 | with torch.no_grad(): 70 | for _ in range(num_iters): 71 | data = next(loader) 72 | t1 = time.time() 73 | self.model(**data, return_loss=False) 74 | 75 | if _ >= warmup_iter: 76 | total_time += time.time() - t1 77 | 78 | print(f'Average time: {total_time / (num_iters - warmup_iter)}') 79 | 80 | 81 | if __name__ == '__main__': 82 | wrapper = Wrapper( 83 | cfg='your path to config file', 84 | ) 85 | wrapper.test_speed() 86 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/voxel/spconv_voxelize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import numpy as np 4 | import torch 5 | from spconv.pytorch.utils import PointToVoxel # spconv-cu111 2.1.21 6 | from torch import nn 7 | from torch.nn.modules.utils import _pair 8 | 9 | 10 | class SPConvVoxelization(nn.Module): 11 | def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels, num_point_features, 12 | device=torch.device("cuda")): 13 | super().__init__() 14 | assert len(voxel_size) == 3 15 | assert len(point_cloud_range) == 6 16 | self.voxel_size = np.array(voxel_size) 17 | self.point_cloud_range = np.array(point_cloud_range) 18 | self.max_num_points = max_num_points 19 | self.num_point_features = num_point_features 20 | self.device = device 21 | if isinstance(max_voxels, tuple): 22 | self.max_voxels = max_voxels 23 | else: 24 | self.max_voxels = _pair(max_voxels) 25 | self.voxel_generator = PointToVoxel( 26 | vsize_xyz=voxel_size, 27 | coors_range_xyz=point_cloud_range, 28 | max_num_points_per_voxel=max_num_points, 29 | max_num_voxels=self.max_voxels[0], 30 | num_point_features=num_point_features, 31 | device=device, 32 | ) 33 | grid_size = (self.point_cloud_range[3:6] - self.point_cloud_range[0:3]) / np.array(voxel_size) 34 | self.grid_size = np.round(grid_size).astype(np.int64) 35 | 36 | def train(self, mode: bool = True): 37 | if mode: 38 | self.voxel_generator = PointToVoxel( 39 | vsize_xyz=self.voxel_size.tolist(), 40 | coors_range_xyz=self.point_cloud_range.tolist(), 41 | max_num_points_per_voxel=self.max_num_points, 42 | max_num_voxels=self.max_voxels[0], 43 | num_point_features=self.num_point_features, 44 | device=self.device, 45 | ) 46 | else: 47 | self.voxel_generator = PointToVoxel( 48 | vsize_xyz=self.voxel_size.tolist(), 49 | coors_range_xyz=self.point_cloud_range.tolist(), 50 | max_num_points_per_voxel=self.max_num_points, 51 | max_num_voxels=self.max_voxels[1], 52 | num_point_features=self.num_point_features, 53 | device=self.device, 54 | ) 55 | 56 | return super().train(mode) 57 | 58 | def forward(self, points): 59 | voxel_output = self.voxel_generator(points) 60 
| voxels, coordinates, num_points = voxel_output 61 | return torch.clone(voxels), torch.clone(coordinates), torch.clone(num_points) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + '(' 65 | tmpstr += 'voxel_size=' + str(self.voxel_size) 66 | tmpstr += ', point_cloud_range=' + str(self.point_cloud_range) 67 | tmpstr += ', max_num_points=' + str(self.max_num_points) 68 | tmpstr += ', max_voxels=' + str(self.max_voxels) 69 | tmpstr += ', num_point_features=' + str(self.num_point_features) 70 | tmpstr += ')' 71 | return tmpstr 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | # [CVPR2025] MoME
3 | ### Resilient Sensor Fusion under Adverse Sensor Failures via Multi-Modal Expert Fusion
4 | 5 | Konyul Park1 \*, Yecheol Kim2,3 \*, Daehun Kim2, Jun Won Choi1 \** 6 | 7 | 1 Seoul National University, Korea 2 Hanyang University, Korea, 3 LG AI Research, Korea 8 | 9 | (\*) equal contribution, (\**) corresponding author. 10 | 11 | ArXiv Preprint ([arXiv 2407.13517](https://arxiv.org/abs/2503.19776)) 12 |
13 | 14 | ![overall](assets/ov.png "framework") 15 | 16 | ## Introduction 17 | In this study, we introduce an efficient and robust LiDAR-camera 3D object detector, referred to as MoME, which can achieve robust performance through a mixture of experts approach. Our MoME fully decouples modality dependencies using three parallel expert decoders, which use camera features, LiDAR features, or a combination of both to decode object queries, respectively. We propose Multi-Expert Decoding (MED) framework, where each query is decoded selectively using one of three expert decoders. MoME utilizes an Adaptive Query Router (AQR) to select the most appropriate expert decoder for each query based on the quality of camera and LiDAR features. This ensures that each query is processed by the best-suited expert, resulting in robust performance across diverse sensor failure scenarios. We evaluated the performance of MoME on the nuScenes-R benchmark. Our MoME achieved state-of-the-art performance in extreme weather and sensor failure conditions, significantly outperforming the existing models across various sensor failure scenarios. 18 | 19 | ## Qualitative results (NDS) on nuScenes and nuScenes-R dataset 20 | 21 | 22 | |Method|Training Schedule|Clean|Beam Reduction|LiDAR Drop|Limited FOV|Object Failure|View Drop|Occlusion| config | weight | 23 | |:----|:----|:----|:----|:----|:----|:----|:----|:----|:----|:----| 24 | | | | | 4 beams | all | ±60 | 0.5 | all | | | | 25 | | MoME | 2 Epochs | 73.6 | 63.0 | 48.2 | 58.3 | 71.0 | 69.5 | 70.5 | [config](https://github.com/konyul/MoME/blob/main/projects/configs/mome/mome.py) | [weight](https://drive.google.com/file/d/1dFwy-eUrTMVJkoufT58rwvqis5lfOoEH/view?usp=sharing) | 26 | 27 | ## Notes 28 | We Evaluate MoME on [nuScenes-R](https://github.com/ADLab-AutoDrive/lidar-camera-robust-benchmark) and [nuScenes-C](https://github.com/thu-ml/3D_Corruptions_AD) 29 | 30 | ## Getting Started 31 | - [Installation](docs/install.md) 32 | - [Prepare Dataset](docs/prepare_dataset.md) 33 | - [Train and Eval](docs/train_eval.md) 34 | 35 | ## Acknowledgements 36 | 37 | MoME is based on [MEFormer](https://github.com/hanchaa/MEFormer). It is also greatly inspired by the following outstanding contributions to the open-source community: [mmdetection3d](https://github.com/open-mmlab/mmdetection3d), [CMT](https://github.com/junjie18/CMT). 38 | 39 | ## Citation 40 | If you find MoME is useful in your research or applications, please consider giving us a star 🌟 and citing it by the following BibTeX entry. 41 | ```bibtex 42 | @article{MoME, 43 | title={Resilient Sensor Fusion under Adverse Sensor Failures via Multi-Modal Expert Fusion}, 44 | author={Park, Konyul and Kim, Yecheol and Kim, Daehun and Choi, Jun Won}, 45 | journal={arXiv preprint arXiv:2503.19776}, 46 | year={2025} 47 | } 48 | ``` 49 | -------------------------------------------------------------------------------- /tools/visual_utils/open3d_vis_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Open3d visualization tool box 3 | Written by Jihan YANG 4 | All rights preserved from 2021 - present. 5 | """ 6 | import matplotlib 7 | import numpy as np 8 | import open3d 9 | import torch 10 | 11 | box_colormap = [ 12 | [1, 1, 1], 13 | [0, 1, 0], 14 | [0, 1, 1], 15 | [1, 1, 0], 16 | ] 17 | 18 | 19 | def get_coor_colors(obj_labels): 20 | """ 21 | Args: 22 | obj_labels: 1 is ground, labels > 1 indicates different instance cluster 23 | 24 | Returns: 25 | rgb: [N, 3]. color for each point. 
26 | """ 27 | colors = matplotlib.colors.XKCD_COLORS.values() 28 | max_color_num = obj_labels.max() 29 | 30 | color_list = list(colors)[:max_color_num + 1] 31 | colors_rgba = [matplotlib.colors.to_rgba_array(color) for color in color_list] 32 | label_rgba = np.array(colors_rgba)[obj_labels] 33 | label_rgba = label_rgba.squeeze()[:, :3] 34 | 35 | return label_rgba 36 | 37 | 38 | def draw_scenes(points, gt_boxes=None, ref_boxes=None, ref_labels=None, ref_scores=None, point_colors=None, 39 | draw_origin=True): 40 | if isinstance(points, torch.Tensor): 41 | points = points.cpu().numpy() 42 | if isinstance(gt_boxes, torch.Tensor): 43 | gt_boxes = gt_boxes.cpu().numpy() 44 | if isinstance(ref_boxes, torch.Tensor): 45 | ref_boxes = ref_boxes.cpu().numpy() 46 | 47 | vis = open3d.visualization.Visualizer() 48 | vis.create_window() 49 | 50 | vis.get_render_option().point_size = 1.0 51 | vis.get_render_option().background_color = np.zeros(3) 52 | 53 | # draw origin 54 | if draw_origin: 55 | axis_pcd = open3d.geometry.TriangleMesh.create_coordinate_frame(size=1.0, origin=[0, 0, 0]) 56 | vis.add_geometry(axis_pcd) 57 | 58 | pts = open3d.geometry.PointCloud() 59 | pts.points = open3d.utility.Vector3dVector(points[:, :3]) 60 | 61 | vis.add_geometry(pts) 62 | if point_colors is None: 63 | pts.colors = open3d.utility.Vector3dVector(np.ones((points.shape[0], 3))) 64 | else: 65 | pts.colors = open3d.utility.Vector3dVector(point_colors) 66 | 67 | if gt_boxes is not None: 68 | vis = draw_box(vis, gt_boxes, (0, 0, 1)) 69 | 70 | if ref_boxes is not None: 71 | vis = draw_box(vis, ref_boxes, (0, 1, 0), ref_labels, ref_scores) 72 | 73 | vis.run() 74 | vis.destroy_window() 75 | 76 | 77 | def translate_boxes_to_open3d_instance(gt_boxes): 78 | """ 79 | 4-------- 6 80 | /| /| 81 | 5 -------- 3 . 82 | | | | | 83 | . 7 -------- 1 84 | |/ |/ 85 | 2 -------- 0 86 | """ 87 | center = gt_boxes[0:3] 88 | lwh = gt_boxes[3:6] 89 | axis_angles = np.array([0, 0, gt_boxes[6] + 1e-10]) 90 | rot = open3d.geometry.get_rotation_matrix_from_axis_angle(axis_angles) 91 | box3d = open3d.geometry.OrientedBoundingBox(center, rot, lwh) 92 | 93 | line_set = open3d.geometry.LineSet.create_from_oriented_bounding_box(box3d) 94 | 95 | # import ipdb; ipdb.set_trace(context=20) 96 | lines = np.asarray(line_set.lines) 97 | lines = np.concatenate([lines, np.array([[1, 4], [7, 6]])], axis=0) 98 | 99 | line_set.lines = open3d.utility.Vector2iVector(lines) 100 | 101 | return line_set, box3d 102 | 103 | 104 | def draw_box(vis, gt_boxes, color=(0, 1, 0), ref_labels=None, score=None): 105 | for i in range(gt_boxes.shape[0]): 106 | line_set, box3d = translate_boxes_to_open3d_instance(gt_boxes[i]) 107 | if ref_labels is None: 108 | line_set.paint_uniform_color(color) 109 | else: 110 | line_set.paint_uniform_color(box_colormap[ref_labels[i]]) 111 | 112 | vis.add_geometry(line_set) 113 | 114 | # if score is not None: 115 | # corners = box3d.get_box_points() 116 | # vis.add_3d_label(corners[5], '%.2f' % score[i]) 117 | return vis 118 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/custom_nuscenes_dataset.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import numpy as np 12 | from mmdet.datasets import DATASETS 13 | from mmdet3d.datasets import NuScenesDataset 14 | 15 | 16 | @DATASETS.register_module() 17 | class CustomNuScenesDataset(NuScenesDataset): 18 | r"""NuScenes Dataset. 19 | 20 | This datset only add camera intrinsics and extrinsics to the results. 21 | """ 22 | 23 | def __init__(self, *args, return_gt_info=False, **kwargs): 24 | super(CustomNuScenesDataset, self).__init__(*args, **kwargs) 25 | self.return_gt_info = return_gt_info 26 | 27 | def get_data_info(self, index): 28 | """Get data info according to the given index. 29 | 30 | Args: 31 | index (int): Index of the sample data to get. 32 | 33 | Returns: 34 | dict: Data information that will be passed to the data \ 35 | preprocessing pipelines. It includes the following keys: 36 | 37 | - sample_idx (str): Sample index. 38 | - pts_filename (str): Filename of point clouds. 39 | - sweeps (list[dict]): Infos of sweeps. 40 | - timestamp (float): Sample timestamp. 41 | - img_filename (str, optional): Image filename. 42 | - lidar2img (list[np.ndarray], optional): Transformations \ 43 | from lidar to different cameras. 44 | - ann_info (dict): Annotation info. 45 | """ 46 | info = self.data_infos[index] 47 | # standard protocal modified from SECOND.Pytorch 48 | input_dict = dict( 49 | sample_idx=info['token'], 50 | pts_filename=info['lidar_path'], 51 | sweeps=info['sweeps'], 52 | timestamp=info['timestamp'] / 1e6, 53 | img_sweeps=None if 'img_sweeps' not in info else info['img_sweeps'], 54 | radar_info=None if 'radars' not in info else info['radars'] 55 | ) 56 | 57 | if self.return_gt_info: 58 | input_dict['info'] = info 59 | 60 | if self.modality['use_camera']: 61 | image_paths = [] 62 | lidar2img_rts = [] 63 | lidar2cam_rts = [] 64 | cam_intrinsics = [] 65 | img_timestamp = [] 66 | for cam_type, cam_info in info['cams'].items(): 67 | img_timestamp.append(cam_info['timestamp'] / 1e6) 68 | image_paths.append(cam_info['data_path']) 69 | # obtain lidar to image transformation matrix 70 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 71 | lidar2cam_t = cam_info[ 72 | 'sensor2lidar_translation'] @ lidar2cam_r.T 73 | lidar2cam_rt = np.eye(4) 74 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 75 | lidar2cam_rt[3, :3] = -lidar2cam_t 76 | intrinsic = cam_info['cam_intrinsic'] 77 | viewpad = np.eye(4) 78 | viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic 79 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 80 | lidar2img_rts.append(lidar2img_rt) 81 | 82 | cam_intrinsics.append(viewpad) 83 | lidar2cam_rts.append(lidar2cam_rt.T) 84 | 85 | input_dict.update( 86 | dict( 87 | img_timestamp=img_timestamp, 88 | img_filename=image_paths, 89 | lidar2img=lidar2img_rts, 90 | cam_intrinsic=cam_intrinsics, 91 | lidar2cam=lidar2cam_rts, 92 | )) 93 | if not self.test_mode: 94 | annos = self.get_ann_info(index) 95 | input_dict['ann_info'] = annos 96 | 97 | return input_dict 98 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/grid_mask.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from PIL import Image 5 | 6 | 7 | class Grid(object): 8 | def __init__(self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.): 9 | self.use_h = use_h 10 | self.use_w = use_w 11 | self.rotate = rotate 12 | self.offset = offset 13 | self.ratio = ratio 14 | self.mode = mode 15 | self.st_prob = prob 16 | self.prob = prob 17 | 18 | def set_prob(self, epoch, max_epoch): 19 | self.prob = self.st_prob * epoch / max_epoch 20 | 21 | def __call__(self, img, label): 22 | if np.random.rand() > self.prob: 23 | return img, label 24 | h = img.size(1) 25 | w = img.size(2) 26 | self.d1 = 2 27 | self.d2 = min(h, w) 28 | hh = int(1.5 * h) 29 | ww = int(1.5 * w) 30 | d = np.random.randint(self.d1, self.d2) 31 | if self.ratio == 1: 32 | self.l = np.random.randint(1, d) 33 | else: 34 | self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) 35 | mask = np.ones((hh, ww), np.float32) 36 | st_h = np.random.randint(d) 37 | st_w = np.random.randint(d) 38 | if self.use_h: 39 | for i in range(hh // d): 40 | s = d * i + st_h 41 | t = min(s + self.l, hh) 42 | mask[s:t, :] *= 0 43 | if self.use_w: 44 | for i in range(ww // d): 45 | s = d * i + st_w 46 | t = min(s + self.l, ww) 47 | mask[:, s:t] *= 0 48 | 49 | r = np.random.randint(self.rotate) 50 | mask = Image.fromarray(np.uint8(mask)) 51 | mask = mask.rotate(r) 52 | mask = np.asarray(mask) 53 | mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 + w] 54 | 55 | mask = torch.from_numpy(mask).float() 56 | if self.mode == 1: 57 | mask = 1 - mask 58 | 59 | mask = mask.expand_as(img) 60 | if self.offset: 61 | offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float() 62 | offset = (1 - mask) * offset 63 | img = img * mask + offset 64 | else: 65 | img = img * mask 66 | 67 | return img, label 68 | 69 | 70 | class GridMask(nn.Module): 71 | def __init__(self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.): 72 | super(GridMask, self).__init__() 73 | self.use_h = use_h 74 | self.use_w = use_w 75 | self.rotate = rotate 76 | self.offset = offset 77 | self.ratio = ratio 78 | self.mode = mode 79 | self.st_prob = prob 80 | self.prob = prob 81 | 82 | def set_prob(self, epoch, max_epoch): 83 | self.prob = self.st_prob * epoch / max_epoch # + 1.#0.5 84 | 85 | def forward(self, x): 86 | if np.random.rand() > self.prob or not self.training: 87 | return x 88 | n, c, h, w = x.size() 89 | x = x.view(-1, h, w) 90 | hh = int(1.5 * h) 91 | ww = int(1.5 * w) 92 | d = np.random.randint(2, h) 93 | self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) 94 | mask = np.ones((hh, ww), np.float32) 95 | st_h = np.random.randint(d) 96 | st_w = np.random.randint(d) 97 | if self.use_h: 98 | for i in range(hh // d): 99 | s = d * i + st_h 100 | t = min(s + self.l, hh) 101 | mask[s:t, :] *= 0 102 | if self.use_w: 103 | for i in range(ww // d): 104 | s = d * i + st_w 105 | t = min(s + self.l, ww) 106 | mask[:, s:t] *= 0 107 | 108 | r = np.random.randint(self.rotate) 109 | mask = Image.fromarray(np.uint8(mask)) 110 | mask = mask.rotate(r) 111 | mask = np.asarray(mask) 112 | mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 + w] 113 | 114 | mask = torch.from_numpy(mask).float().cuda() 115 | if self.mode == 1: 116 | mask = 1 - mask 117 | mask = mask.expand_as(x) 118 | if self.offset: 119 | offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float().cuda() 120 | x = x * mask 
+ offset * (1 - mask) 121 | else: 122 | x = x * mask 123 | 124 | return x.view(n, c, h, w) 125 | -------------------------------------------------------------------------------- /tools/create_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os.path as osp 4 | 5 | from data_converter import nuscenes_converter 6 | from data_converter.create_gt_database import create_groundtruth_database 7 | 8 | 9 | def nuscenes_data_prep(root_path, 10 | info_prefix, 11 | version, 12 | dataset_name, 13 | out_dir, 14 | max_sweeps=10): 15 | """Prepare data related to nuScenes dataset. 16 | 17 | Related data consists of '.pkl' files recording basic infos, 18 | 2D annotations and groundtruth database. 19 | 20 | Args: 21 | root_path (str): Path of dataset root. 22 | info_prefix (str): The prefix of info filenames. 23 | version (str): Dataset version. 24 | dataset_name (str): The dataset class name. 25 | out_dir (str): Output directory of the groundtruth database info. 26 | max_sweeps (int, optional): Number of input consecutive frames. 27 | Default: 10 28 | """ 29 | nuscenes_converter.create_nuscenes_infos( 30 | root_path, info_prefix, version=version, max_sweeps=max_sweeps) 31 | 32 | if version == 'v1.0-test': 33 | info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') 34 | nuscenes_converter.export_2d_annotation( 35 | root_path, info_test_path, version=version) 36 | return 37 | 38 | info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') 39 | info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') 40 | nuscenes_converter.export_2d_annotation( 41 | root_path, info_train_path, version=version) 42 | nuscenes_converter.export_2d_annotation( 43 | root_path, info_val_path, version=version) 44 | create_groundtruth_database(dataset_name, root_path, info_prefix, 45 | f'{out_dir}/{info_prefix}_infos_train.pkl') 46 | 47 | 48 | parser = argparse.ArgumentParser(description='Data converter arg parser') 49 | parser.add_argument('dataset', metavar='kitti', help='name of the dataset') 50 | parser.add_argument( 51 | '--root-path', 52 | type=str, 53 | default='./data/kitti', 54 | help='specify the root path of dataset') 55 | parser.add_argument( 56 | '--version', 57 | type=str, 58 | default='v1.0', 59 | required=False, 60 | help='specify the dataset version, no need for kitti') 61 | parser.add_argument( 62 | '--max-sweeps', 63 | type=int, 64 | default=10, 65 | required=False, 66 | help='specify sweeps of lidar per example') 67 | parser.add_argument( 68 | '--with-plane', 69 | action='store_true', 70 | help='Whether to use plane information for kitti.') 71 | parser.add_argument( 72 | '--num-points', 73 | type=int, 74 | default=-1, 75 | help='Number of points to sample for indoor datasets.') 76 | parser.add_argument( 77 | '--out-dir', 78 | type=str, 79 | default='./data/kitti', 80 | required=False, 81 | help='name of info pkl') 82 | parser.add_argument('--extra-tag', type=str, default='kitti') 83 | parser.add_argument( 84 | '--workers', type=int, default=4, help='number of threads to be used') 85 | args = parser.parse_args() 86 | 87 | if __name__ == '__main__': 88 | import importlib 89 | 90 | importlib.import_module('projects.mmdet3d_plugin') 91 | 92 | if args.dataset == 'nuscenes' and args.version != 'v1.0-mini': 93 | train_version = f'{args.version}-trainval' 94 | nuscenes_data_prep( 95 | root_path=args.root_path, 96 | info_prefix=args.extra_tag, 97 | 
version=train_version, 98 | dataset_name='CustomNuScenesDataset', 99 | out_dir=args.out_dir, 100 | max_sweeps=args.max_sweeps) 101 | test_version = f'{args.version}-test' 102 | nuscenes_data_prep( 103 | root_path=args.root_path, 104 | info_prefix=args.extra_tag, 105 | version=test_version, 106 | dataset_name='CustomNuScenesDataset', 107 | out_dir=args.out_dir, 108 | max_sweeps=args.max_sweeps) 109 | elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': 110 | train_version = f'{args.version}' 111 | nuscenes_data_prep( 112 | root_path=args.root_path, 113 | info_prefix=args.extra_tag, 114 | version=train_version, 115 | dataset_name='CustomNuScenesDataset', 116 | out_dir=args.out_dir, 117 | max_sweeps=args.max_sweeps) 118 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/moad_transformer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | import numpy as np 11 | import torch 12 | import torch.nn as nn 13 | from einops import rearrange 14 | from mmcv.cnn import xavier_init 15 | from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence 16 | from mmcv.runner.base_module import BaseModule 17 | from mmdet.models.utils.builder import TRANSFORMER 18 | 19 | 20 | @TRANSFORMER.register_module() 21 | class MOADTransformer(BaseModule): 22 | def __init__( 23 | self, 24 | use_type_embed=True, 25 | use_cam_embed=False, 26 | encoder=None, 27 | decoder=None, 28 | init_cfg=None, 29 | cross=False 30 | ): 31 | super(MOADTransformer, self).__init__(init_cfg=init_cfg) 32 | 33 | if encoder is not None: 34 | self.encoder = build_transformer_layer_sequence(encoder) 35 | else: 36 | self.encoder = None 37 | self.decoder = build_transformer_layer_sequence(decoder) 38 | self.embed_dims = self.decoder.embed_dims 39 | self.use_type_embed = use_type_embed 40 | self.use_cam_embed = use_cam_embed 41 | 42 | if self.use_type_embed: 43 | self.bev_type_embed = nn.Parameter(torch.randn(self.embed_dims)) 44 | self.rv_type_embed = nn.Parameter(torch.randn(self.embed_dims)) 45 | else: 46 | self.bev_type_embed = None 47 | self.rv_type_embed = None 48 | 49 | if self.use_cam_embed: 50 | self.cam_embed = nn.Sequential( 51 | nn.Conv1d(16, self.embed_dims, kernel_size=1), 52 | nn.BatchNorm1d(self.embed_dims), 53 | nn.Conv1d(self.embed_dims, self.embed_dims, kernel_size=1), 54 | nn.BatchNorm1d(self.embed_dims), 55 | nn.Conv1d(self.embed_dims, self.embed_dims, kernel_size=1), 56 | nn.BatchNorm1d(self.embed_dims) 57 | ) 58 | else: 59 | self.cam_embed = None 60 | 61 | self.cross = cross 62 | 63 | def init_weights(self): 64 | # follow the official DETR to init parameters 65 | for m in self.modules(): 66 | if hasattr(m, 'weight') and m.weight.dim() > 1: 67 | xavier_init(m, distribution='uniform') 68 | self._is_init = True 69 | 70 | def forward(self, x, x_img, bev_query_embed, rv_query_embed, bev_pos_embed, 
rv_pos_embed, img_metas, 71 | attn_masks=None, modalities=None, reg_branch=None): 72 | bs, c, h, w = x.shape 73 | bev_memory = rearrange(x, "bs c h w -> (h w) bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 74 | rv_memory = rearrange(x_img, "(bs v) c h w -> (v h w) bs c", bs=bs) 75 | 76 | bev_pos_embed = bev_pos_embed.unsqueeze(1).repeat(1, bs, 1) # [bs, n, c, h, w] -> [n*h*w, bs, c] 77 | rv_pos_embed = rearrange(rv_pos_embed, "(bs v) h w c -> (v h w) bs c", bs=bs) 78 | 79 | if self.use_type_embed: 80 | bev_query_embed = bev_query_embed + self.bev_type_embed 81 | rv_query_embed = rv_query_embed + self.rv_type_embed 82 | 83 | if self.use_cam_embed: 84 | imgs2lidars = np.stack([np.linalg.inv(meta['lidar2img']) for meta in img_metas]) 85 | imgs2lidars = torch.from_numpy(imgs2lidars).float().to(x.device) 86 | imgs2lidars = imgs2lidars.flatten(-2).permute(0, 2, 1) 87 | imgs2lidars = self.cam_embed(imgs2lidars) 88 | imgs2lidars = imgs2lidars.permute(0, 2, 1).reshape(-1, self.embed_dims, 1, 1) 89 | imgs2lidars = imgs2lidars.repeat(1, 1, *x_img.shape[-2:]) 90 | imgs2lidars = rearrange(imgs2lidars, '(bs v) c h w -> (v h w) bs c', bs=bs) 91 | 92 | out_decs = [] 93 | for modality in modalities: 94 | if modality == "fused": 95 | memory, pos_embed = (torch.cat([bev_memory, rv_memory], dim=0), 96 | torch.cat([bev_pos_embed, rv_pos_embed], dim=0)) 97 | memory_v = memory 98 | query_embed = bev_query_embed + rv_query_embed 99 | elif modality == "bev": 100 | memory, pos_embed = bev_memory, bev_pos_embed 101 | memory_v = memory 102 | query_embed = bev_query_embed 103 | else: 104 | memory, pos_embed = rv_memory, rv_pos_embed 105 | memory_v = memory 106 | if self.cam_embed is not None: 107 | memory_v = memory_v * imgs2lidars 108 | query_embed = rv_query_embed 109 | 110 | query_embed = query_embed.transpose(0, 1) # [bs, num_query, dim] -> [num_query, bs, dim] 111 | target = torch.zeros_like(query_embed) 112 | 113 | # out_dec: [num_layers, num_query, bs, dim] 114 | out_dec = self.decoder( 115 | query=target, 116 | key=memory, 117 | value=memory_v, 118 | query_pos=query_embed, 119 | key_pos=pos_embed, 120 | attn_masks=[attn_masks, None], 121 | reg_branch=reg_branch, 122 | ) 123 | out_decs.append(out_dec.transpose(1, 2)) 124 | 125 | return out_decs 126 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/pme_transformer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 
9 | # ------------------------------------------------------------------------ 10 | import copy 11 | 12 | import torch 13 | import torch.nn as nn 14 | from mmcv.cnn import xavier_init 15 | from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence 16 | from mmcv.runner.base_module import BaseModule 17 | from mmdet.models.utils.builder import TRANSFORMER 18 | from mmdet3d.models import builder 19 | 20 | from projects.mmdet3d_plugin.models.dense_heads.meformer_head import pos2embed 21 | 22 | 23 | @TRANSFORMER.register_module() 24 | class PMETransformer(BaseModule): 25 | def __init__( 26 | self, 27 | decoder=None, 28 | heads=None, 29 | separate_head=None, 30 | num_classes=None, 31 | init_cfg=None 32 | ): 33 | super(PMETransformer, self).__init__(init_cfg=init_cfg) 34 | self.dist_scaler = nn.Parameter(torch.randn(1), requires_grad=True) 35 | self.dist_bias = nn.Parameter(torch.randn(1), requires_grad=True) 36 | 37 | self.decoder = build_transformer_layer_sequence(decoder) 38 | 39 | self.embed_dims = self.decoder.embed_dims 40 | self.num_layers = decoder["num_layers"] 41 | self.num_heads = decoder["transformerlayers"]["attn_cfgs"][0]["num_heads"] 42 | self.box_pos_embedding = nn.Sequential( 43 | nn.Linear(self.embed_dims * 2, self.embed_dims), 44 | nn.ReLU(inplace=True), 45 | nn.Linear(self.embed_dims, self.embed_dims) 46 | ) 47 | self.modality_proj = nn.ModuleDict({ 48 | "fused": nn.Sequential( 49 | nn.Linear(self.embed_dims, self.embed_dims), 50 | nn.LayerNorm(self.embed_dims) 51 | ), 52 | "bev": nn.Sequential( 53 | nn.Linear(self.embed_dims, self.embed_dims), 54 | nn.LayerNorm(self.embed_dims) 55 | ), 56 | "img": nn.Sequential( 57 | nn.Linear(self.embed_dims, self.embed_dims), 58 | nn.LayerNorm(self.embed_dims) 59 | ) 60 | }) 61 | 62 | self.task_heads = nn.ModuleList() 63 | for num_cls in num_classes: 64 | heads = copy.deepcopy(heads) 65 | heads.update(dict(cls_logits=(num_cls, 2))) 66 | separate_head.update( 67 | in_channels=self.embed_dims, 68 | heads=heads, num_cls=num_cls, 69 | groups=decoder.num_layers 70 | ) 71 | self.task_heads.append(builder.build_head(separate_head)) 72 | 73 | def init_weights(self): 74 | # follow the official DETR to init parameters 75 | for m in self.decoder.modules(): 76 | if hasattr(m, 'weight') and m.weight.dim() > 1: 77 | xavier_init(m, distribution='uniform') 78 | 79 | self._is_init = True 80 | 81 | def forward( 82 | self, 83 | x, 84 | reference, 85 | outs, 86 | modalities, 87 | num_queries_per_modality, 88 | task_id, 89 | pc_range, 90 | attn_masks=None 91 | ): 92 | x = x[-1].transpose(0, 1) 93 | x = list(x.split(num_queries_per_modality, dim=0)) 94 | x_proj = [] 95 | 96 | for i, modality in enumerate(modalities): 97 | x_proj.append(self.modality_proj[modality](x[i])) 98 | 99 | target = x_proj[modalities.index("fused")] 100 | memory = torch.cat(x_proj, dim=0) 101 | 102 | center = outs["center"][-1] 103 | 104 | box_pos_embed = pos2embed(center, self.embed_dims) 105 | box_pos_embed = self.box_pos_embedding(box_pos_embed).transpose(0, 1) 106 | box_pos_embed = list(box_pos_embed.split(num_queries_per_modality, dim=0)) 107 | 108 | query_box_pos_embed = box_pos_embed[modalities.index("fused")] 109 | key_box_pos_embed = torch.cat(box_pos_embed, dim=0) 110 | 111 | center = list(center.split(num_queries_per_modality, dim=1)) 112 | center_q = center[modalities.index("fused")] 113 | center_kv = torch.cat(center, dim=1) 114 | dist = (center_q.unsqueeze(2) - center_kv.unsqueeze(1)).norm(p=2, dim=-1) 115 | dist_mask = dist * self.dist_scaler + self.dist_bias 
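# `dist_mask` turns the BEV distance between each fused query and every modality-specific
# query into a learnable additive attention bias; it is added to `attn_masks` below (after
# the boolean mask is converted to 0 / -inf), so cross-branch attention is modulated by
# how far apart the predicted centers are.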
116 | 117 | if attn_masks is None: 118 | attn_masks = torch.zeros((target.shape[0], target.shape[0]), dtype=torch.bool, device=target.device) 119 | 120 | attn_masks = torch.zeros_like(attn_masks, dtype=torch.float).float().masked_fill(attn_masks, float("-inf")) 121 | attn_masks = attn_masks.repeat(1, len(x_proj)) 122 | attn_masks = attn_masks + dist_mask 123 | attn_masks = attn_masks.unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) 124 | 125 | outs_dec = self.decoder( 126 | query=target, 127 | key=memory, 128 | value=memory, 129 | query_pos=query_box_pos_embed, 130 | key_pos=key_box_pos_embed, 131 | attn_masks=[attn_masks] 132 | ) 133 | 134 | outs_dec = outs_dec.transpose(1, 2) 135 | outs = self.task_heads[task_id](outs_dec) 136 | 137 | reference = reference.split(num_queries_per_modality, dim=1)[modalities.index("fused")] 138 | 139 | center = (outs['center'] + reference[None, :, :, :2]).sigmoid() 140 | height = (outs['height'] + reference[None, :, :, 2:3]).sigmoid() 141 | _center, _height = center.new_zeros(center.shape), height.new_zeros(height.shape) 142 | _center[..., 0:1] = center[..., 0:1] * (pc_range[3] - pc_range[0]) + pc_range[0] 143 | _center[..., 1:2] = center[..., 1:2] * (pc_range[4] - pc_range[1]) + pc_range[1] 144 | _height[..., 0:1] = height[..., 0:1] * (pc_range[5] - pc_range[2]) + pc_range[2] 145 | outs['center'] = _center 146 | outs['height'] = _height 147 | 148 | return outs 149 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import torch 4 | import torch.nn as nn 5 | from einops import rearrange 6 | from flash_attn.bert_padding import unpad_input 7 | from flash_attn.flash_attn_interface import flash_attn_unpadded_kvpacked_func 8 | from mmcv.runner import auto_fp16 9 | from torch.nn.functional import linear 10 | from torch.nn.init import xavier_uniform_, constant_ 11 | 12 | 13 | def _in_projection_packed(q, k, v, w, b=None): 14 | w_q, w_k, w_v = w.chunk(3) 15 | if b is None: 16 | b_q = b_k = b_v = None 17 | else: 18 | b_q, b_k, b_v = b.chunk(3) 19 | return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v) 20 | 21 | 22 | class FlashAttention(nn.Module): 23 | """Implement the scaled dot product attention with softmax. 24 | Arguments 25 | --------- 26 | softmax_scale: The temperature to use for the softmax attention. 27 | (default: 1/sqrt(d_keys) where d_keys is computed at 28 | runtime) 29 | attention_dropout: The dropout rate to apply to the attention 30 | (default: 0.1) 31 | """ 32 | 33 | def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): 34 | super().__init__() 35 | self.softmax_scale = softmax_scale 36 | self.dropout_p = attention_dropout 37 | self.fp16_enabled = True 38 | 39 | @auto_fp16(apply_to=('q', 'kv'), out_fp32=True) 40 | def forward(self, q, kv, 41 | causal=False, 42 | key_padding_mask=None): 43 | """Implements the multihead softmax attention. 44 | Arguments 45 | --------- 46 | q: The tensor containing the query. (B, T, H, D) 47 | kv: The tensor containing the key, and value. 
(B, S, 2, H, D) 48 | key_padding_mask: a bool tensor of shape (B, S) 49 | """ 50 | assert q.dtype in [torch.float16, torch.bfloat16] and kv.dtype in [torch.float16, torch.bfloat16] 51 | assert q.is_cuda and kv.is_cuda 52 | assert q.shape[0] == kv.shape[0] and q.shape[-2] == kv.shape[-2] and q.shape[-1] == kv.shape[-1] 53 | 54 | batch_size = q.shape[0] 55 | seqlen_q, seqlen_k = q.shape[1], kv.shape[1] 56 | if key_padding_mask is None: 57 | q, kv = rearrange(q, 'b s ... -> (b s) ...'), rearrange(kv, 'b s ... -> (b s) ...') 58 | max_sq, max_sk = seqlen_q, seqlen_k 59 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, 60 | device=q.device) 61 | cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, 62 | device=kv.device) 63 | output = flash_attn_unpadded_kvpacked_func( 64 | q, kv, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, 65 | self.dropout_p if self.training else 0.0, 66 | softmax_scale=self.softmax_scale, causal=causal 67 | ) 68 | output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) 69 | else: 70 | nheads = kv.shape[-2] 71 | q = rearrange(q, 'b s ... -> (b s) ...') 72 | max_sq = seqlen_q 73 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, 74 | device=q.device) 75 | x = rearrange(kv, 'b s two h d -> b s (two h d)') 76 | x_unpad, indices, cu_seqlens_k, max_sk = unpad_input(x, key_padding_mask) 77 | x_unpad = rearrange(x_unpad, 'nnz (two h d) -> nnz two h d', two=2, h=nheads) 78 | output_unpad = flash_attn_unpadded_kvpacked_func( 79 | q, x_unpad, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, 80 | self.dropout_p if self.training else 0.0, 81 | softmax_scale=self.softmax_scale, causal=causal 82 | ) 83 | output = rearrange(output_unpad, '(b s) ... -> b s ...', b=batch_size) 84 | 85 | return output, None 86 | 87 | 88 | class FlashMHA(nn.Module): 89 | 90 | def __init__(self, embed_dim, num_heads, bias=True, batch_first=True, attention_dropout=0.0, 91 | causal=False, device=None, dtype=None, **kwargs) -> None: 92 | assert batch_first 93 | factory_kwargs = {'device': device, 'dtype': dtype} 94 | super().__init__() 95 | self.embed_dim = embed_dim 96 | self.causal = causal 97 | self.bias = bias 98 | 99 | self.num_heads = num_heads 100 | assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" 101 | self.head_dim = self.embed_dim // num_heads 102 | assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" 103 | 104 | self.in_proj_weight = nn.Parameter(torch.empty((3 * embed_dim, embed_dim))) 105 | if bias: 106 | self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim)) 107 | else: 108 | self.register_parameter('in_proj_bias', None) 109 | self.inner_attn = FlashAttention(attention_dropout=attention_dropout, **factory_kwargs) 110 | self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) 111 | self._reset_parameters() 112 | 113 | def _reset_parameters(self) -> None: 114 | xavier_uniform_(self.in_proj_weight) 115 | if self.in_proj_bias is not None: 116 | constant_(self.in_proj_bias, 0.) 117 | constant_(self.out_proj.bias, 0.) 
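    # Forward path (see below): q, k, v are projected with the packed in_proj_weight via
    # _in_projection_packed, reshaped to (batch, seqlen, num_heads, head_dim), k and v are
    # stacked into a (batch, seqlen, 2, num_heads, head_dim) tensor, and FlashAttention is
    # applied (its auto_fp16 wrapper handles the cast to half precision); the heads are then
    # merged and passed through out_proj.
    #
    # Illustrative usage sketch (an assumption for clarity, not part of this module; it
    # presumes a CUDA device and the flash-attn package being installed):
    #   mha = FlashMHA(embed_dim=256, num_heads=8).cuda()
    #   q = k = v = torch.randn(2, 900, 256, device='cuda')
    #   out, _ = mha(q, k, v)   # out: (2, 900, 256)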
118 | 119 | def forward(self, q, k, v, key_padding_mask=None): 120 | """x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) 121 | key_padding_mask: bool tensor of shape (batch, seqlen) 122 | """ 123 | # q, k, v = self.Wq(q), self.Wk(k), self.Wv(v) 124 | q, k, v = _in_projection_packed(q, k, v, self.in_proj_weight, self.in_proj_bias) 125 | q = rearrange(q, 'b s (h d) -> b s h d', h=self.num_heads) 126 | k = rearrange(k, 'b s (h d) -> b s h d', h=self.num_heads) 127 | v = rearrange(v, 'b s (h d) -> b s h d', h=self.num_heads) 128 | kv = torch.stack([k, v], dim=2) 129 | 130 | context, attn_weights = self.inner_attn(q, kv, key_padding_mask=key_padding_mask, causal=self.causal) 131 | return self.out_proj(rearrange(context, 'b s h d -> b s (h d)')), attn_weights 132 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/separate_task_head.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 5 | # Copyright (c) OpenMMLab. All rights reserved. 6 | # ------------------------------------------------------------------------ 7 | 8 | import torch 9 | import torch.nn as nn 10 | from einops import rearrange 11 | from mmcv.runner import BaseModule 12 | from mmdet.models import HEADS 13 | 14 | 15 | class LayerNormFunction(torch.autograd.Function): 16 | 17 | @staticmethod 18 | def forward(ctx, x, weight, bias, groups, eps): 19 | ctx.groups = groups 20 | ctx.eps = eps 21 | N, C, L = x.size() 22 | x = x.view(N, groups, C // groups, L) 23 | mu = x.mean(2, keepdim=True) 24 | var = (x - mu).pow(2).mean(2, keepdim=True) 25 | y = (x - mu) / (var + eps).sqrt() 26 | ctx.save_for_backward(y, var, weight) 27 | y = weight.view(1, C, 1) * y.view(N, C, L) + bias.view(1, C, 1) 28 | return y 29 | 30 | @staticmethod 31 | def backward(ctx, grad_output): 32 | groups = ctx.groups 33 | eps = ctx.eps 34 | 35 | N, C, L = grad_output.size() 36 | y, var, weight = ctx.saved_variables 37 | g = grad_output * weight.view(1, C, 1) 38 | g = g.view(N, groups, C // groups, L) 39 | mean_g = g.mean(dim=2, keepdim=True) 40 | mean_gy = (g * y).mean(dim=2, keepdim=True) 41 | gx = 1. / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g) 42 | return gx.view(N, C, L), (grad_output * y.view(N, C, L)).sum(dim=2).sum(dim=0), grad_output.sum(dim=2).sum( 43 | dim=0), None, None 44 | 45 | 46 | class GroupLayerNorm1d(nn.Module): 47 | 48 | def __init__(self, channels, groups=1, eps=1e-6): 49 | super(GroupLayerNorm1d, self).__init__() 50 | self.register_parameter('weight', nn.Parameter(torch.ones(channels))) 51 | self.register_parameter('bias', nn.Parameter(torch.zeros(channels))) 52 | self.groups = groups 53 | self.eps = eps 54 | 55 | def forward(self, x): 56 | return LayerNormFunction.apply(x, self.weight, self.bias, self.groups, self.eps) 57 | 58 | 59 | @HEADS.register_module() 60 | class SeparateTaskHead(BaseModule): 61 | """SeparateHead for CenterHead. 62 | 63 | Args: 64 | in_channels (int): Input channels for conv_layer. 65 | heads (dict): Conv information. 66 | head_conv (int): Output channels. 67 | Default: 64. 68 | final_kernal (int): Kernal size for the last conv layer. 69 | Deafult: 1. 70 | init_bias (float): Initial bias. 
Default: -2.19. 71 | conv_cfg (dict): Config of conv layer. 72 | Default: dict(type='Conv2d') 73 | norm_cfg (dict): Config of norm layer. 74 | Default: dict(type='BN2d'). 75 | bias (str): Type of bias. Default: 'auto'. 76 | """ 77 | 78 | def __init__(self, 79 | in_channels, 80 | heads, 81 | groups=1, 82 | head_conv=64, 83 | final_kernel=1, 84 | init_bias=-2.19, 85 | init_cfg=None, 86 | **kwargs): 87 | assert init_cfg is None, 'To prevent abnormal initialization ' \ 88 | 'behavior, init_cfg is not allowed to be set' 89 | super(SeparateTaskHead, self).__init__(init_cfg=init_cfg) 90 | self.heads = heads 91 | self.groups = groups 92 | self.init_bias = init_bias 93 | for head in self.heads: 94 | classes, num_conv = self.heads[head] 95 | 96 | conv_layers = [] 97 | c_in = in_channels 98 | for i in range(num_conv - 1): 99 | conv_layers.extend([ 100 | nn.Conv1d( 101 | c_in * groups, 102 | head_conv * groups, 103 | kernel_size=final_kernel, 104 | stride=1, 105 | padding=final_kernel // 2, 106 | groups=groups, 107 | bias=False), 108 | GroupLayerNorm1d(head_conv * groups, groups=groups), 109 | nn.ReLU(inplace=True) 110 | ]) 111 | c_in = head_conv 112 | 113 | conv_layers.append( 114 | nn.Conv1d( 115 | head_conv * groups, 116 | classes * groups, 117 | kernel_size=final_kernel, 118 | stride=1, 119 | padding=final_kernel // 2, 120 | groups=groups, 121 | bias=True)) 122 | conv_layers = nn.Sequential(*conv_layers) 123 | 124 | self.__setattr__(head, conv_layers) 125 | 126 | if init_cfg is None: 127 | self.init_cfg = dict(type='Kaiming', layer='Conv1d') 128 | 129 | def init_weights(self): 130 | """Initialize weights.""" 131 | super().init_weights() 132 | for head in self.heads: 133 | if head == 'cls_logits': 134 | self.__getattr__(head)[-1].bias.data.fill_(self.init_bias) 135 | 136 | def forward(self, x): 137 | """Forward function for SepHead. 138 | 139 | Args: 140 | x (torch.Tensor): Input feature map with the shape of 141 | [N, B, query, C]. 142 | 143 | Returns: 144 | dict[str: torch.Tensor]: contains the following keys: 145 | 146 | -reg (torch.Tensor): 2D regression value with the \ 147 | shape of [N, B, query, 2]. 148 | -height (torch.Tensor): Height value with the \ 149 | shape of [N, B, query, 1]. 150 | -dim (torch.Tensor): Size value with the shape \ 151 | of [N, B, query, 3]. 152 | -rot (torch.Tensor): Rotation value with the \ 153 | shape of [N, B, query, 2]. 154 | -vel (torch.Tensor): Velocity value with the \ 155 | shape of [N, B, query, 2]. 156 | """ 157 | N, B, query_num, c1 = x.shape 158 | x = rearrange(x, "n b q c -> b (n c) q") 159 | ret_dict = dict() 160 | 161 | for head in self.heads: 162 | head_output = self.__getattr__(head)(x) 163 | ret_dict[head] = rearrange(head_output, "b (n c) q -> n b q c", n=N) 164 | 165 | return ret_dict 166 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/multi_task_bbox_coder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) 5 | # Copyright (c) OpenMMLab. All rights reserved. 
6 | # ------------------------------------------------------------------------ 7 | 8 | import torch 9 | from mmdet.core.bbox import BaseBBoxCoder 10 | from mmdet.core.bbox.builder import BBOX_CODERS 11 | 12 | from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox 13 | 14 | 15 | @BBOX_CODERS.register_module() 16 | class MultiTaskBBoxCoder(BaseBBoxCoder): 17 | """Bbox coder for NMS-free detector. 18 | Args: 19 | pc_range (list[float]): Range of point cloud. 20 | post_center_range (list[float]): Limit of the center. 21 | Default: None. 22 | max_num (int): Max number to be kept. Default: 100. 23 | score_threshold (float): Threshold to filter boxes based on score. 24 | Default: None. 25 | code_size (int): Code size of bboxes. Default: 9 26 | """ 27 | 28 | def __init__(self, 29 | pc_range, 30 | voxel_size=None, 31 | post_center_range=None, 32 | max_num=100, 33 | score_threshold=None, 34 | num_classes=10): 35 | 36 | self.pc_range = pc_range 37 | self.voxel_size = voxel_size 38 | self.post_center_range = post_center_range 39 | self.max_num = max_num 40 | self.score_threshold = score_threshold 41 | self.num_classes = num_classes 42 | 43 | def encode(self): 44 | pass 45 | 46 | def decode_single(self, cls_scores, bbox_preds, task_ids): 47 | """Decode bboxes. 48 | Args: 49 | cls_scores (Tensor): Outputs from the classification head, \ 50 | shape [num_query, cls_out_channels]. Note \ 51 | cls_out_channels should includes background. 52 | bbox_preds (Tensor): Outputs from the regression \ 53 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 54 | Shape [num_query, 9]. 55 | Returns: 56 | list[dict]: Decoded boxes. 57 | """ 58 | max_num = self.max_num 59 | num_query = cls_scores.shape[0] 60 | 61 | cls_scores = cls_scores.sigmoid() 62 | scores, indexs = cls_scores.view(-1).topk(max_num) 63 | labels = indexs % self.num_classes 64 | bbox_index = indexs // self.num_classes 65 | task_index = torch.gather(task_ids, 1, labels.unsqueeze(1)).squeeze() 66 | 67 | bbox_preds = bbox_preds[task_index * num_query + bbox_index] 68 | boxes3d = denormalize_bbox(bbox_preds, self.pc_range) 69 | 70 | # use score threshold 71 | if self.score_threshold is not None: 72 | thresh_mask = scores > self.score_threshold 73 | if self.post_center_range is not None: 74 | self.post_center_range = torch.tensor(self.post_center_range, device=scores.device) 75 | mask = (boxes3d[..., :3] >= 76 | self.post_center_range[:3]).all(1) 77 | mask &= (boxes3d[..., :3] <= 78 | self.post_center_range[3:]).all(1) 79 | 80 | if self.score_threshold: 81 | mask &= thresh_mask 82 | 83 | boxes3d = boxes3d[mask] 84 | scores = scores[mask] 85 | labels = labels[mask] 86 | 87 | predictions_dict = { 88 | 'bboxes': boxes3d, 89 | 'scores': scores, 90 | 'labels': labels 91 | } 92 | return predictions_dict 93 | 94 | def decode(self, preds_dicts): 95 | """Decode bboxes. 96 | Args: 97 | all_cls_scores (Tensor): Outputs from the classification head, \ 98 | shape [nb_dec, bs, num_query, cls_out_channels]. Note \ 99 | cls_out_channels should includes background. 100 | all_bbox_preds (Tensor): Sigmoid outputs from the regression \ 101 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 102 | Shape [nb_dec, bs, num_query, 9]. 103 | Returns: 104 | list[dict]: Decoded boxes. 
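                Each per-sample dict contains at most ``max_num`` predictions with keys
                ``bboxes`` (denormalized 3D boxes), ``scores`` and ``labels``, optionally
                filtered by ``score_threshold`` and ``post_center_range``.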
105 | """ 106 | task_num = len(preds_dicts) 107 | 108 | pred_bbox_list, pred_logits_list, task_ids_list, rv_box_mask_lists = [], [], [], [] 109 | for task_id in range(task_num): 110 | task_pred_dict = preds_dicts[task_id][0] 111 | task_pred_bbox = [task_pred_dict['center'][-1], task_pred_dict['height'][-1], 112 | task_pred_dict['dim'][-1], task_pred_dict['rot'][-1]] 113 | if 'vel' in task_pred_dict: 114 | task_pred_bbox.append(task_pred_dict['vel'][-1]) 115 | task_pred_bbox = torch.cat(task_pred_bbox, dim=-1) 116 | task_pred_logits = task_pred_dict['cls_logits'][-1] 117 | pred_bbox_list.append(task_pred_bbox) 118 | pred_logits_list.append(task_pred_logits) 119 | 120 | if "rv_box_mask" in task_pred_dict: 121 | rv_box_mask_lists.append(task_pred_dict["rv_box_mask"]) 122 | else: 123 | rv_box_mask_lists.append(task_pred_dict["cls_logits"].new_ones(task_pred_dict["cls_logits"].shape[1], 6, 124 | task_pred_dict["cls_logits"].shape[ 125 | 2]).to(torch.bool)) 126 | 127 | task_ids = task_pred_logits.new_ones(task_pred_logits.shape).int() * task_id 128 | task_ids_list.append(task_ids) 129 | 130 | all_pred_logits = torch.cat(pred_logits_list, dim=-1) # bs * nq * 10 131 | all_pred_bbox = torch.cat(pred_bbox_list, dim=1) # bs * (task nq) * 10 132 | all_task_ids = torch.cat(task_ids_list, dim=-1) # bs * nq * 10 133 | all_rv_box_masks = torch.cat(rv_box_mask_lists, dim=-1) 134 | 135 | batch_size = all_pred_logits.shape[0] 136 | predictions_list = [] 137 | for i in range(batch_size): 138 | rv_box_mask = all_rv_box_masks[i].sum(dim=0) != 0 139 | if rv_box_mask.shape[0] != all_pred_bbox[i].shape[0]: 140 | box_mask = torch.cat([torch.ones_like(rv_box_mask), rv_box_mask]) 141 | else: 142 | box_mask = rv_box_mask 143 | 144 | pred_logits = all_pred_logits[i][box_mask] 145 | pred_bbox = all_pred_bbox[i][box_mask] 146 | task_ids = all_task_ids[i][box_mask] 147 | 148 | predictions_list.append( 149 | self.decode_single(pred_logits, pred_bbox, task_ids)) 150 | return predictions_list 151 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import torch 12 | from mmdet.core.bbox.assigners import AssignResult 13 | from mmdet.core.bbox.assigners import BaseAssigner 14 | from mmdet.core.bbox.builder import BBOX_ASSIGNERS 15 | from mmdet.core.bbox.match_costs import build_match_cost 16 | from scipy.optimize import linear_sum_assignment 17 | 18 | from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox 19 | 20 | 21 | @BBOX_ASSIGNERS.register_module() 22 | class HungarianAssigner3D(BaseAssigner): 23 | """Computes one-to-one matching between predictions and ground truth. 24 | This class computes an assignment between the targets and the predictions 25 | based on the costs. 
The costs are weighted sum of three components: 26 | classification cost, regression L1 cost and regression iou cost. The 27 | targets don't include the no_object, so generally there are more 28 | predictions than targets. After the one-to-one matching, the un-matched 29 | are treated as backgrounds. Thus each query prediction will be assigned 30 | with `0` or a positive integer indicating the ground truth index: 31 | - 0: negative sample, no assigned gt 32 | - positive integer: positive sample, index (1-based) of assigned gt 33 | Args: 34 | cls_weight (int | float, optional): The scale factor for classification 35 | cost. Default 1.0. 36 | bbox_weight (int | float, optional): The scale factor for regression 37 | L1 cost. Default 1.0. 38 | iou_weight (int | float, optional): The scale factor for regression 39 | iou cost. Default 1.0. 40 | iou_calculator (dict | optional): The config for the iou calculation. 41 | Default type `BboxOverlaps2D`. 42 | iou_mode (str | optional): "iou" (intersection over union), "iof" 43 | (intersection over foreground), or "giou" (generalized 44 | intersection over union). Default "giou". 45 | """ 46 | 47 | def __init__(self, 48 | cls_cost=dict(type='ClassificationCost', weight=1.), 49 | reg_cost=dict(type='BBoxL1Cost', weight=1.0), 50 | iou_cost=dict(type='IoUCost', weight=0.0), 51 | pc_range=None, 52 | code_weights=None): 53 | self.cls_cost = build_match_cost(cls_cost) 54 | self.reg_cost = build_match_cost(reg_cost) 55 | self.iou_cost = build_match_cost(iou_cost) 56 | self.pc_range = pc_range 57 | self.code_weights = code_weights 58 | if self.code_weights: 59 | self.code_weights = torch.tensor(self.code_weights)[None, :].cuda() 60 | 61 | def assign(self, 62 | bbox_pred, 63 | cls_pred, 64 | gt_bboxes, 65 | gt_labels, 66 | gt_bboxes_ignore=None, 67 | eps=1e-7, 68 | code_weights=None): 69 | """Computes one-to-one matching based on the weighted costs. 70 | This method assign each query prediction to a ground truth or 71 | background. The `assigned_gt_inds` with -1 means don't care, 72 | 0 means negative sample, and positive number is the index (1-based) 73 | of assigned gt. 74 | The assignment is done in the following steps, the order matters. 75 | 1. assign every prediction to -1 76 | 2. compute the weighted costs 77 | 3. do Hungarian matching on CPU based on the costs 78 | 4. assign all to 0 (background) first, then for each matched pair 79 | between predictions and gts, treat this prediction as foreground 80 | and assign the corresponding gt index (plus 1) to it. 81 | Args: 82 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 83 | (cx, cy, w, h), which are all in range [0, 1]. Shape 84 | [num_query, 4]. 85 | cls_pred (Tensor): Predicted classification logits, shape 86 | [num_query, num_class]. 87 | gt_bboxes (Tensor): Ground truth boxes with unnormalized 88 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 89 | gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 90 | gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are 91 | labelled as `ignored`. Default None. 92 | eps (int | float, optional): A value added to the denominator for 93 | numerical stability. Default 1e-7. 94 | Returns: 95 | :obj:`AssignResult`: The assigned result. 96 | """ 97 | assert gt_bboxes_ignore is None, \ 98 | 'Only case when gt_bboxes_ignore is None is supported.' 99 | num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) 100 | 101 | # 1. 
assign -1 by default 102 | assigned_gt_inds = bbox_pred.new_full((num_bboxes,), 103 | -1, 104 | dtype=torch.long) 105 | assigned_labels = bbox_pred.new_full((num_bboxes,), 106 | -1, 107 | dtype=torch.long) 108 | if num_gts == 0 or num_bboxes == 0: 109 | # No ground truth or boxes, return empty assignment 110 | if num_gts == 0: 111 | # No ground truth, assign all to background 112 | assigned_gt_inds[:] = 0 113 | return AssignResult( 114 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 115 | 116 | # 2. compute the weighted costs 117 | # classification and bboxcost. 118 | cls_cost = self.cls_cost(cls_pred, gt_labels) 119 | # regression L1 cost 120 | normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) 121 | 122 | if self.code_weights is not None: 123 | bbox_pred = bbox_pred * self.code_weights 124 | normalized_gt_bboxes = normalized_gt_bboxes * self.code_weights 125 | 126 | reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) 127 | 128 | # weighted sum of above two costs 129 | cost = cls_cost + reg_cost 130 | 131 | # 3. do Hungarian matching on CPU using linear_sum_assignment 132 | cost = cost.detach().cpu() 133 | if linear_sum_assignment is None: 134 | raise ImportError('Please run "pip install scipy" ' 135 | 'to install scipy first.') 136 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost) 137 | matched_row_inds = torch.from_numpy(matched_row_inds).to( 138 | bbox_pred.device) 139 | matched_col_inds = torch.from_numpy(matched_col_inds).to( 140 | bbox_pred.device) 141 | 142 | # 4. assign backgrounds and foregrounds 143 | # assign all indices to backgrounds first 144 | assigned_gt_inds[:] = 0 145 | # assign foregrounds based on matching results 146 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 147 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 148 | return AssignResult( 149 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 150 | -------------------------------------------------------------------------------- /tools/visual_utils/visualize_utils.py: -------------------------------------------------------------------------------- 1 | import mayavi.mlab as mlab 2 | import numpy as np 3 | import torch 4 | 5 | box_colormap = [ 6 | [1, 1, 1], 7 | [0, 1, 0], 8 | [0, 1, 1], 9 | [1, 1, 0], 10 | ] 11 | 12 | 13 | def check_numpy_to_torch(x): 14 | if isinstance(x, np.ndarray): 15 | return torch.from_numpy(x).float(), True 16 | return x, False 17 | 18 | 19 | def rotate_points_along_z(points, angle): 20 | """ 21 | Args: 22 | points: (B, N, 3 + C) 23 | angle: (B), angle along z-axis, angle increases x ==> y 24 | Returns: 25 | 26 | """ 27 | points, is_numpy = check_numpy_to_torch(points) 28 | angle, _ = check_numpy_to_torch(angle) 29 | 30 | cosa = torch.cos(angle) 31 | sina = torch.sin(angle) 32 | zeros = angle.new_zeros(points.shape[0]) 33 | ones = angle.new_ones(points.shape[0]) 34 | rot_matrix = torch.stack(( 35 | cosa, sina, zeros, 36 | -sina, cosa, zeros, 37 | zeros, zeros, ones 38 | ), dim=1).view(-1, 3, 3).float() 39 | points_rot = torch.matmul(points[:, :, 0:3], rot_matrix) 40 | points_rot = torch.cat((points_rot, points[:, :, 3:]), dim=-1) 41 | return points_rot.numpy() if is_numpy else points_rot 42 | 43 | 44 | def boxes_to_corners_3d(boxes3d): 45 | """ 46 | 7 -------- 4 47 | /| /| 48 | 6 -------- 5 . 49 | | | | | 50 | . 
3 -------- 0 51 | |/ |/ 52 | 2 -------- 1 53 | Args: 54 | boxes3d: (N, 7) [x, y, z, dx, dy, dz, heading], (x, y, z) is the box center 55 | 56 | Returns: 57 | """ 58 | boxes3d, is_numpy = check_numpy_to_torch(boxes3d) 59 | 60 | template = boxes3d.new_tensor(( 61 | [1, 1, -1], [1, -1, -1], [-1, -1, -1], [-1, 1, -1], 62 | [1, 1, 1], [1, -1, 1], [-1, -1, 1], [-1, 1, 1], 63 | )) / 2 64 | 65 | corners3d = boxes3d[:, None, 3:6].repeat(1, 8, 1) * template[None, :, :] 66 | corners3d = rotate_points_along_z(corners3d.view(-1, 8, 3), boxes3d[:, 6]).view(-1, 8, 3) 67 | corners3d += boxes3d[:, None, 0:3] 68 | 69 | return corners3d.numpy() if is_numpy else corners3d 70 | 71 | 72 | def visualize_pts(pts, fig=None, bgcolor=(0, 0, 0), fgcolor=(1.0, 1.0, 1.0), 73 | show_intensity=False, size=(600, 600), draw_origin=True): 74 | if not isinstance(pts, np.ndarray): 75 | pts = pts.cpu().numpy() 76 | if fig is None: 77 | fig = mlab.figure(figure=None, bgcolor=bgcolor, fgcolor=fgcolor, engine=None, size=size) 78 | 79 | if show_intensity: 80 | G = mlab.points3d(pts[:, 0], pts[:, 1], pts[:, 2], pts[:, 3], mode='point', 81 | colormap='gnuplot', scale_factor=1, figure=fig) 82 | else: 83 | G = mlab.points3d(pts[:, 0], pts[:, 1], pts[:, 2], mode='point', 84 | colormap='gnuplot', scale_factor=1, figure=fig) 85 | if draw_origin: 86 | mlab.points3d(0, 0, 0, color=(1, 1, 1), mode='cube', scale_factor=0.2) 87 | mlab.plot3d([0, 3], [0, 0], [0, 0], color=(0, 0, 1), tube_radius=0.1) 88 | mlab.plot3d([0, 0], [0, 3], [0, 0], color=(0, 1, 0), tube_radius=0.1) 89 | mlab.plot3d([0, 0], [0, 0], [0, 3], color=(1, 0, 0), tube_radius=0.1) 90 | 91 | return fig 92 | 93 | 94 | def draw_sphere_pts(pts, color=(0, 1, 0), fig=None, bgcolor=(0, 0, 0), scale_factor=0.2): 95 | if not isinstance(pts, np.ndarray): 96 | pts = pts.cpu().numpy() 97 | 98 | if fig is None: 99 | fig = mlab.figure(figure=None, bgcolor=bgcolor, fgcolor=None, engine=None, size=(600, 600)) 100 | 101 | if isinstance(color, np.ndarray) and color.shape[0] == 1: 102 | color = color[0] 103 | color = (color[0] / 255.0, color[1] / 255.0, color[2] / 255.0) 104 | 105 | if isinstance(color, np.ndarray): 106 | pts_color = np.zeros((pts.__len__(), 4), dtype=np.uint8) 107 | pts_color[:, 0:3] = color 108 | pts_color[:, 3] = 255 109 | G = mlab.points3d(pts[:, 0], pts[:, 1], pts[:, 2], np.arange(0, pts_color.__len__()), mode='sphere', 110 | scale_factor=scale_factor, figure=fig) 111 | G.glyph.color_mode = 'color_by_scalar' 112 | G.glyph.scale_mode = 'scale_by_vector' 113 | G.module_manager.scalar_lut_manager.lut.table = pts_color 114 | else: 115 | mlab.points3d(pts[:, 0], pts[:, 1], pts[:, 2], mode='sphere', color=color, 116 | colormap='gnuplot', scale_factor=scale_factor, figure=fig) 117 | 118 | mlab.points3d(0, 0, 0, color=(1, 1, 1), mode='cube', scale_factor=0.2) 119 | mlab.plot3d([0, 3], [0, 0], [0, 0], color=(0, 0, 1), line_width=3, tube_radius=None, figure=fig) 120 | mlab.plot3d([0, 0], [0, 3], [0, 0], color=(0, 1, 0), line_width=3, tube_radius=None, figure=fig) 121 | mlab.plot3d([0, 0], [0, 0], [0, 3], color=(1, 0, 0), line_width=3, tube_radius=None, figure=fig) 122 | 123 | return fig 124 | 125 | 126 | def draw_grid(x1, y1, x2, y2, fig, tube_radius=None, color=(0.5, 0.5, 0.5)): 127 | mlab.plot3d([x1, x1], [y1, y2], [0, 0], color=color, tube_radius=tube_radius, line_width=1, figure=fig) 128 | mlab.plot3d([x2, x2], [y1, y2], [0, 0], color=color, tube_radius=tube_radius, line_width=1, figure=fig) 129 | mlab.plot3d([x1, x2], [y1, y1], [0, 0], color=color, tube_radius=tube_radius, 
line_width=1, figure=fig) 130 | mlab.plot3d([x1, x2], [y2, y2], [0, 0], color=color, tube_radius=tube_radius, line_width=1, figure=fig) 131 | return fig 132 | 133 | 134 | def draw_multi_grid_range(fig, grid_size=20, bv_range=(-60, -60, 60, 60)): 135 | for x in range(bv_range[0], bv_range[2], grid_size): 136 | for y in range(bv_range[1], bv_range[3], grid_size): 137 | fig = draw_grid(x, y, x + grid_size, y + grid_size, fig) 138 | 139 | return fig 140 | 141 | 142 | def draw_scenes(points, gt_boxes=None, ref_boxes=None, ref_scores=None, ref_labels=None): 143 | if not isinstance(points, np.ndarray): 144 | points = points.cpu().numpy() 145 | if ref_boxes is not None and not isinstance(ref_boxes, np.ndarray): 146 | ref_boxes = ref_boxes.cpu().numpy() 147 | if gt_boxes is not None and not isinstance(gt_boxes, np.ndarray): 148 | gt_boxes = gt_boxes.cpu().numpy() 149 | if ref_scores is not None and not isinstance(ref_scores, np.ndarray): 150 | ref_scores = ref_scores.cpu().numpy() 151 | if ref_labels is not None and not isinstance(ref_labels, np.ndarray): 152 | ref_labels = ref_labels.cpu().numpy() 153 | 154 | fig = visualize_pts(points) 155 | fig = draw_multi_grid_range(fig, bv_range=(0, -40, 80, 40)) 156 | if gt_boxes is not None: 157 | corners3d = boxes_to_corners_3d(gt_boxes) 158 | fig = draw_corners3d(corners3d, fig=fig, color=(0, 0, 1), max_num=100) 159 | 160 | if ref_boxes is not None and len(ref_boxes) > 0: 161 | ref_corners3d = boxes_to_corners_3d(ref_boxes) 162 | if ref_labels is None: 163 | fig = draw_corners3d(ref_corners3d, fig=fig, color=(0, 1, 0), cls=ref_scores, max_num=100) 164 | else: 165 | for k in range(ref_labels.min(), ref_labels.max() + 1): 166 | cur_color = tuple(box_colormap[k % len(box_colormap)]) 167 | mask = (ref_labels == k) 168 | fig = draw_corners3d(ref_corners3d[mask], fig=fig, color=cur_color, cls=ref_scores[mask], max_num=100) 169 | mlab.view(azimuth=-179, elevation=54.0, distance=104.0, roll=90.0) 170 | return fig 171 | 172 | 173 | def draw_corners3d(corners3d, fig, color=(1, 1, 1), line_width=2, cls=None, tag='', max_num=500, tube_radius=None): 174 | """ 175 | :param corners3d: (N, 8, 3) 176 | :param fig: 177 | :param color: 178 | :param line_width: 179 | :param cls: 180 | :param tag: 181 | :param max_num: 182 | :return: 183 | """ 184 | import mayavi.mlab as mlab 185 | num = min(max_num, len(corners3d)) 186 | for n in range(num): 187 | b = corners3d[n] # (8, 3) 188 | 189 | if cls is not None: 190 | if isinstance(cls, np.ndarray): 191 | mlab.text3d(b[6, 0], b[6, 1], b[6, 2], '%.2f' % cls[n], scale=(0.3, 0.3, 0.3), color=color, figure=fig) 192 | else: 193 | mlab.text3d(b[6, 0], b[6, 1], b[6, 2], '%s' % cls[n], scale=(0.3, 0.3, 0.3), color=color, figure=fig) 194 | 195 | for k in range(0, 4): 196 | i, j = k, (k + 1) % 4 197 | mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, 198 | tube_radius=tube_radius, 199 | line_width=line_width, figure=fig) 200 | 201 | i, j = k + 4, (k + 1) % 4 + 4 202 | mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, 203 | tube_radius=tube_radius, 204 | line_width=line_width, figure=fig) 205 | 206 | i, j = k, k + 4 207 | mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, 208 | tube_radius=tube_radius, 209 | line_width=line_width, figure=fig) 210 | 211 | i, j = 0, 5 212 | mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, tube_radius=tube_radius, 213 | line_width=line_width, figure=fig) 214 | i, j = 1, 4 215 
| mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, tube_radius=tube_radius, 216 | line_width=line_width, figure=fig) 217 | 218 | return fig 219 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/necks/cp_fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from mmcv.cnn import ConvModule 5 | from mmcv.runner import BaseModule, auto_fp16 6 | 7 | from mmdet.models import NECKS 8 | 9 | 10 | ####This FPN remove the unused parameters which can used with checkpoint (with_cp = True in Backbone) 11 | @NECKS.register_module() 12 | class CPFPN(BaseModule): 13 | r"""Feature Pyramid Network. 14 | 15 | This is an implementation of paper `Feature Pyramid Networks for Object 16 | Detection `_. 17 | 18 | Args: 19 | in_channels (List[int]): Number of input channels per scale. 20 | out_channels (int): Number of output channels (used at each scale) 21 | num_outs (int): Number of output scales. 22 | start_level (int): Index of the start input backbone level used to 23 | build the feature pyramid. Default: 0. 24 | end_level (int): Index of the end input backbone level (exclusive) to 25 | build the feature pyramid. Default: -1, which means the last level. 26 | add_extra_convs (bool | str): If bool, it decides whether to add conv 27 | layers on top of the original feature maps. Default to False. 28 | If True, it is equivalent to `add_extra_convs='on_input'`. 29 | If str, it specifies the source feature map of the extra convs. 30 | Only the following options are allowed 31 | 32 | - 'on_input': Last feat map of neck inputs (i.e. backbone feature). 33 | - 'on_lateral': Last feature map after lateral convs. 34 | - 'on_output': The last output feature map after fpn convs. 35 | relu_before_extra_convs (bool): Whether to apply relu before the extra 36 | conv. Default: False. 37 | no_norm_on_lateral (bool): Whether to apply norm on lateral. 38 | Default: False. 39 | conv_cfg (dict): Config dict for convolution layer. Default: None. 40 | norm_cfg (dict): Config dict for normalization layer. Default: None. 41 | act_cfg (str): Config dict for activation layer in ConvModule. 42 | Default: None. 43 | upsample_cfg (dict): Config dict for interpolate layer. 44 | Default: `dict(mode='nearest')` 45 | init_cfg (dict or list[dict], optional): Initialization config dict. 46 | 47 | Example: 48 | >>> import torch 49 | >>> in_channels = [2, 3, 5, 7] 50 | >>> scales = [340, 170, 84, 43] 51 | >>> inputs = [torch.rand(1, c, s, s) 52 | ... for c, s in zip(in_channels, scales)] 53 | >>> self = FPN(in_channels, 11, len(in_channels)).eval() 54 | >>> outputs = self.forward(inputs) 55 | >>> for i in range(len(outputs)): 56 | ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') 57 | outputs[0].shape = torch.Size([1, 11, 340, 340]) 58 | outputs[1].shape = torch.Size([1, 11, 170, 170]) 59 | outputs[2].shape = torch.Size([1, 11, 84, 84]) 60 | outputs[3].shape = torch.Size([1, 11, 43, 43]) 61 | """ 62 | 63 | def __init__(self, 64 | in_channels, 65 | out_channels, 66 | num_outs, 67 | start_level=0, 68 | end_level=-1, 69 | add_extra_convs=False, 70 | relu_before_extra_convs=False, 71 | no_norm_on_lateral=False, 72 | conv_cfg=None, 73 | norm_cfg=None, 74 | act_cfg=None, 75 | upsample_cfg=dict(mode='nearest'), 76 | init_cfg=dict( 77 | type='Xavier', layer='Conv2d', distribution='uniform')): 78 | super(CPFPN, self).__init__(init_cfg) 79 | assert isinstance(in_channels, list) 80 | self.in_channels = in_channels 81 | self.out_channels = out_channels 82 | self.num_ins = len(in_channels) 83 | self.num_outs = num_outs 84 | self.relu_before_extra_convs = relu_before_extra_convs 85 | self.no_norm_on_lateral = no_norm_on_lateral 86 | self.fp16_enabled = False 87 | self.upsample_cfg = upsample_cfg.copy() 88 | 89 | if end_level == -1: 90 | self.backbone_end_level = self.num_ins 91 | assert num_outs >= self.num_ins - start_level 92 | else: 93 | # if end_level < inputs, no extra level is allowed 94 | self.backbone_end_level = end_level 95 | assert end_level <= len(in_channels) 96 | assert num_outs == end_level - start_level 97 | self.start_level = start_level 98 | self.end_level = end_level 99 | self.add_extra_convs = add_extra_convs 100 | assert isinstance(add_extra_convs, (str, bool)) 101 | if isinstance(add_extra_convs, str): 102 | # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' 103 | assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') 104 | elif add_extra_convs: # True 105 | self.add_extra_convs = 'on_input' 106 | 107 | self.lateral_convs = nn.ModuleList() 108 | self.fpn_convs = nn.ModuleList() 109 | 110 | for i in range(self.start_level, self.backbone_end_level): 111 | l_conv = ConvModule( 112 | in_channels[i], 113 | out_channels, 114 | 1, 115 | conv_cfg=conv_cfg, 116 | norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, 117 | act_cfg=act_cfg, 118 | inplace=False) 119 | self.lateral_convs.append(l_conv) 120 | if i == 0: 121 | fpn_conv = ConvModule( 122 | out_channels, 123 | out_channels, 124 | 3, 125 | padding=1, 126 | conv_cfg=conv_cfg, 127 | norm_cfg=norm_cfg, 128 | act_cfg=act_cfg, 129 | inplace=False) 130 | self.fpn_convs.append(fpn_conv) 131 | 132 | # add extra conv layers (e.g., RetinaNet) 133 | extra_levels = num_outs - self.backbone_end_level + self.start_level 134 | if self.add_extra_convs and extra_levels >= 1: 135 | for i in range(extra_levels): 136 | if i == 0 and self.add_extra_convs == 'on_input': 137 | in_channels = self.in_channels[self.backbone_end_level - 1] 138 | else: 139 | in_channels = out_channels 140 | extra_fpn_conv = ConvModule( 141 | in_channels, 142 | out_channels, 143 | 3, 144 | stride=2, 145 | padding=1, 146 | conv_cfg=conv_cfg, 147 | norm_cfg=norm_cfg, 148 | act_cfg=act_cfg, 149 | inplace=False) 150 | self.fpn_convs.append(extra_fpn_conv) 151 | 152 | @auto_fp16() 153 | def forward(self, inputs): 154 | """Forward function.""" 155 | assert len(inputs) == len(self.in_channels) 156 | 157 | # build laterals 158 | laterals = [ 159 | lateral_conv(inputs[i + self.start_level]) 160 | for i, lateral_conv in enumerate(self.lateral_convs) 161 | ] 162 | 163 | # build top-down path 164 | used_backbone_levels = len(laterals) 165 | for i in range(used_backbone_levels - 1, 
0, -1): 166 | # In some cases, fixing `scale factor` (e.g. 2) is preferred, but 167 | # it cannot co-exist with `size` in `F.interpolate`. 168 | if 'scale_factor' in self.upsample_cfg: 169 | laterals[i - 1] += F.interpolate(laterals[i], 170 | **self.upsample_cfg) 171 | else: 172 | prev_shape = laterals[i - 1].shape[2:] 173 | laterals[i - 1] += F.interpolate( 174 | laterals[i], size=prev_shape, **self.upsample_cfg) 175 | 176 | # build outputs 177 | # part 1: from original levels 178 | outs = [ 179 | self.fpn_convs[i](laterals[i]) if i == 0 else laterals[i] for i in range(used_backbone_levels) 180 | ] 181 | # part 2: add extra levels 182 | if self.num_outs > len(outs): 183 | # use max pool to get more levels on top of outputs 184 | # (e.g., Faster R-CNN, Mask R-CNN) 185 | if not self.add_extra_convs: 186 | for i in range(self.num_outs - used_backbone_levels): 187 | outs.append(F.max_pool2d(outs[-1], 1, stride=2)) 188 | # add conv layers on top of original feature maps (RetinaNet) 189 | else: 190 | if self.add_extra_convs == 'on_input': 191 | extra_source = inputs[self.backbone_end_level - 1] 192 | elif self.add_extra_convs == 'on_lateral': 193 | extra_source = laterals[-1] 194 | elif self.add_extra_convs == 'on_output': 195 | extra_source = outs[-1] 196 | else: 197 | raise NotImplementedError 198 | outs.append(self.fpn_convs[used_backbone_levels](extra_source)) 199 | for i in range(used_backbone_levels + 1, self.num_outs): 200 | if self.relu_before_extra_convs: 201 | outs.append(self.fpn_convs[i](F.relu(outs[-1]))) 202 | else: 203 | outs.append(self.fpn_convs[i](outs[-1])) 204 | return tuple(outs) 205 | -------------------------------------------------------------------------------- /tools/data_converter/create_gt_database.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from UVTR (https://github.com/dvlab-research/UVTR) 5 | # Copyright (c) 2022 Li, Yanwei 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import pickle 12 | from os import path as osp 13 | 14 | import mmcv 15 | import numpy as np 16 | from mmcv import track_iter_progress 17 | from mmdet3d.core.bbox import box_np_ops as box_np_ops 18 | from mmdet3d.datasets import build_dataset 19 | 20 | 21 | def create_groundtruth_database(dataset_class_name, 22 | data_path, 23 | info_prefix, 24 | info_path=None, 25 | mask_anno_path=None, 26 | used_classes=None, 27 | database_save_path=None, 28 | db_info_save_path=None, 29 | relative_path=True, 30 | add_rgb=False, 31 | lidar_only=False, 32 | bev_only=False, 33 | coors_range=None, 34 | with_mask=False): 35 | """Given the raw data, generate the ground truth database. 36 | 37 | Args: 38 | dataset_class_name (str): Name of the input dataset. 39 | data_path (str): Path of the data. 40 | info_prefix (str): Prefix of the info file. 41 | info_path (str, optional): Path of the info file. 42 | Default: None. 43 | mask_anno_path (str, optional): Path of the mask_anno. 44 | Default: None. 45 | used_classes (list[str], optional): Classes have been used. 46 | Default: None. 
47 | database_save_path (str, optional): Path to save database. 48 | Default: None. 49 | db_info_save_path (str, optional): Path to save db_info. 50 | Default: None. 51 | relative_path (bool, optional): Whether to use relative path. 52 | Default: True. 53 | with_mask (bool, optional): Whether to use mask. 54 | Default: False. 55 | """ 56 | print(f'Create GT Database of {dataset_class_name}') 57 | dataset_cfg = dict( 58 | type=dataset_class_name, data_root=data_path, ann_file=info_path, return_gt_info=True) 59 | 60 | if dataset_class_name == 'CustomNuScenesDataset': 61 | dataset_cfg.update( 62 | use_valid_flag=True, 63 | pipeline=[ 64 | dict( 65 | type='LoadPointsFromFile', 66 | coord_type='LIDAR', 67 | load_dim=5, 68 | use_dim=5), 69 | dict( 70 | type='LoadPointsFromMultiSweeps', 71 | sweeps_num=10, 72 | use_dim=[0, 1, 2, 3, 4], 73 | pad_empty_sweeps=True, 74 | remove_close=True), 75 | dict( 76 | type='LoadAnnotations3D', 77 | with_bbox_3d=True, 78 | with_label_3d=True) 79 | ]) 80 | 81 | dataset = build_dataset(dataset_cfg) 82 | 83 | if database_save_path is None: 84 | database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') 85 | if db_info_save_path is None: 86 | db_info_save_path = osp.join(data_path, 87 | f'{info_prefix}_dbinfos_train.pkl') 88 | 89 | database_pts_path = osp.join(database_save_path, 'pts_dir') 90 | database_img_path = osp.join(database_save_path, 'img_dir') 91 | mmcv.mkdir_or_exist(database_save_path) 92 | mmcv.mkdir_or_exist(database_pts_path) 93 | mmcv.mkdir_or_exist(database_img_path) 94 | all_db_infos = dict() 95 | 96 | group_counter = 0 97 | for j in track_iter_progress(list(range(len(dataset)))): 98 | input_dict = dataset.get_data_info(j) 99 | dataset.pre_pipeline(input_dict) 100 | example = dataset.pipeline(input_dict) 101 | annos = example['ann_info'] 102 | image_idx = example['sample_idx'] 103 | points = example['points'].tensor.numpy() 104 | gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() 105 | names = annos['gt_names'] 106 | group_dict = dict() 107 | if 'group_ids' in annos: 108 | group_ids = annos['group_ids'] 109 | else: 110 | group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) 111 | difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) 112 | if 'difficulty' in annos: 113 | difficulty = annos['difficulty'] 114 | 115 | num_obj = gt_boxes_3d.shape[0] 116 | point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) 117 | 118 | # load multi-view image 119 | input_img = {} 120 | input_info = {} 121 | for _cam in example['info']['cams']: 122 | cam_info = example['info']['cams'][_cam] 123 | _path = cam_info['data_path'] 124 | _img = mmcv.imread(_path, 'unchanged') 125 | input_img[_cam] = _img 126 | 127 | # obtain lidar to image transformation matrix 128 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 129 | lidar2cam_t = cam_info[ 130 | 'sensor2lidar_translation'] @ lidar2cam_r.T 131 | lidar2cam_rt = np.eye(4) 132 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 133 | lidar2cam_rt[3, :3] = -lidar2cam_t 134 | intrinsic = cam_info['cam_intrinsic'] 135 | viewpad = np.eye(4) 136 | viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic 137 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 138 | 139 | input_info[_cam] = { 140 | 'lidar2img': lidar2img_rt, 141 | 'lidar2cam': lidar2cam_rt, 142 | 'cam_intrinsic': viewpad} 143 | 144 | for i in range(num_obj): 145 | pts_filename = f'{image_idx}_{names[i]}_{i}.bin' 146 | img_filename = f'{image_idx}_{names[i]}_{i}.png' 147 | abs_filepath = osp.join(database_pts_path, pts_filename) 148 
| abs_img_filepath = osp.join(database_img_path, img_filename) 149 | rel_filepath = osp.join(f'{info_prefix}_gt_database', 'pts_dir', pts_filename) 150 | rel_img_filepath = osp.join(f'{info_prefix}_gt_database', 'img_dir', img_filename) 151 | 152 | # save point clouds and image patches for each object 153 | gt_points = points[point_indices[:, i]] 154 | gt_points[:, :3] -= gt_boxes_3d[i, :3] 155 | 156 | with open(abs_filepath, 'w') as f: 157 | gt_points.tofile(f) 158 | 159 | img_crop, crop_key, crop_depth = find_img_crop(annos['gt_bboxes_3d'][i].corners.numpy(), input_img, 160 | input_info, points[point_indices[:, i]]) 161 | if img_crop is not None: 162 | mmcv.imwrite(img_crop, abs_img_filepath) 163 | 164 | if (used_classes is None) or names[i] in used_classes: 165 | db_info = { 166 | 'name': names[i], 167 | 'path': rel_filepath, 168 | 'image_idx': image_idx, 169 | 'image_path': rel_img_filepath if img_crop is not None else '', 170 | 'image_crop_key': crop_key if img_crop is not None else '', 171 | 'image_crop_depth': crop_depth, 172 | 'gt_idx': i, 173 | 'box3d_lidar': gt_boxes_3d[i], 174 | 'num_points_in_gt': gt_points.shape[0], 175 | 'difficulty': difficulty[i], 176 | } 177 | local_group_id = group_ids[i] 178 | # if local_group_id >= 0: 179 | if local_group_id not in group_dict: 180 | group_dict[local_group_id] = group_counter 181 | group_counter += 1 182 | db_info['group_id'] = group_dict[local_group_id] 183 | if 'score' in annos: 184 | db_info['score'] = annos['score'][i] 185 | if names[i] in all_db_infos: 186 | all_db_infos[names[i]].append(db_info) 187 | else: 188 | all_db_infos[names[i]] = [db_info] 189 | 190 | for k, v in all_db_infos.items(): 191 | print(f'load {len(v)} {k} database infos') 192 | 193 | with open(db_info_save_path, 'wb') as f: 194 | pickle.dump(all_db_infos, f) 195 | 196 | 197 | def find_img_crop(gt_boxes_3d, input_img, input_info, points): 198 | coord_3d = np.concatenate([gt_boxes_3d, np.ones_like(gt_boxes_3d[..., :1])], -1) 199 | coord_3d = coord_3d.squeeze(0) 200 | max_crop, crop_key = None, None 201 | crop_area, crop_depth = 0, 0 202 | 203 | for _key in input_img: 204 | lidar2img = np.array(input_info[_key]['lidar2img']) 205 | coord_img = coord_3d @ lidar2img.T 206 | coord_img[:, :2] /= coord_img[:, 2, None] 207 | image_shape = input_img[_key].shape 208 | if (coord_img[2] <= 0).any(): 209 | continue 210 | 211 | avg_depth = coord_img[:, 2].mean() 212 | minxy = np.min(coord_img[:, :2], axis=-2) 213 | maxxy = np.max(coord_img[:, :2], axis=-2) 214 | bbox = np.concatenate([minxy, maxxy], axis=-1) 215 | bbox[0::2] = np.clip(bbox[0::2], a_min=0, a_max=image_shape[1] - 1) 216 | bbox[1::2] = np.clip(bbox[1::2], a_min=0, a_max=image_shape[0] - 1) 217 | bbox = bbox.astype(int) 218 | if ((bbox[2:] - bbox[:2]) <= 10).any(): 219 | continue 220 | 221 | img_crop = input_img[_key][bbox[1]:bbox[3], bbox[0]:bbox[2]] 222 | if img_crop.shape[0] * img_crop.shape[1] > crop_area: 223 | max_crop = img_crop 224 | crop_key = _key 225 | crop_depth = avg_depth 226 | 227 | return max_crop, crop_key, crop_depth 228 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/meformer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 5 | # Copyright (c) OpenMMLab. All rights reserved. 6 | # ------------------------------------------------------------------------ 7 | import torch 8 | import torch.nn.functional as F 9 | from mmcv.runner import force_fp32, auto_fp16 10 | from mmdet.models import DETECTORS 11 | from mmdet3d.core import bbox3d2result 12 | from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector 13 | 14 | from projects.mmdet3d_plugin import SPConvVoxelization 15 | from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask 16 | 17 | 18 | @DETECTORS.register_module() 19 | class MEFormerDetector(MVXTwoStageDetector): 20 | def __init__(self, 21 | use_grid_mask=False, 22 | **kwargs): 23 | pts_voxel_cfg = kwargs.get('pts_voxel_layer', None) 24 | kwargs['pts_voxel_layer'] = None 25 | super(MEFormerDetector, self).__init__(**kwargs) 26 | 27 | self.use_grid_mask = use_grid_mask 28 | self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) 29 | if pts_voxel_cfg: 30 | self.pts_voxel_layer = SPConvVoxelization(**pts_voxel_cfg) 31 | 32 | def init_weights(self): 33 | """Initialize model weights.""" 34 | super(MEFormerDetector, self).init_weights() 35 | 36 | @auto_fp16(apply_to=('img'), out_fp32=True) 37 | def extract_img_feat(self, img, img_metas): 38 | """Extract features of images.""" 39 | if self.with_img_backbone and img is not None: 40 | input_shape = img.shape[-2:] 41 | # update real input shape of each single img 42 | for img_meta in img_metas: 43 | img_meta.update(input_shape=input_shape) 44 | 45 | if img.dim() == 5 and img.size(0) == 1: 46 | img.squeeze_(0) 47 | elif img.dim() == 5 and img.size(0) > 1: 48 | B, N, C, H, W = img.size() 49 | img = img.view(B * N, C, H, W) 50 | if self.use_grid_mask: 51 | img = self.grid_mask(img) 52 | img_feats = self.img_backbone(img.float()) 53 | if isinstance(img_feats, dict): 54 | img_feats = list(img_feats.values()) 55 | else: 56 | return None 57 | if self.with_img_neck: 58 | img_feats = self.img_neck(img_feats) 59 | return img_feats 60 | 61 | @force_fp32(apply_to=('pts', 'img_feats')) 62 | def extract_pts_feat(self, pts, img_feats, img_metas): 63 | """Extract features of points.""" 64 | if not self.with_pts_bbox: 65 | return None 66 | if pts is None: 67 | return None 68 | voxels, num_points, coors = self.voxelize(pts) 69 | voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) 70 | batch_size = coors[-1, 0] + 1 71 | x = self.pts_middle_encoder(voxel_features, coors, batch_size) 72 | x = self.pts_backbone(x) 73 | if self.with_pts_neck: 74 | x = self.pts_neck(x) 75 | return x 76 | 77 | @torch.no_grad() 78 | @force_fp32() 79 | def voxelize(self, points): 80 | """Apply dynamic voxelization to points. 81 | 82 | Args: 83 | points (list[torch.Tensor]): Points of each sample. 84 | 85 | Returns: 86 | tuple[torch.Tensor]: Concatenated points, number of points 87 | per voxel, and coordinates. 
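                ``coors`` has the per-sample batch index prepended as its first column
                (via the ``F.pad`` call below); with ``M`` voxels in the batch, the returned
                tensors are typically (M, max_points, point_dim), (M,) and (M, 4).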
88 | """ 89 | voxels, coors, num_points = [], [], [] 90 | for res in points: 91 | res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) 92 | voxels.append(res_voxels) 93 | coors.append(res_coors) 94 | num_points.append(res_num_points) 95 | voxels = torch.cat(voxels, dim=0) 96 | num_points = torch.cat(num_points, dim=0) 97 | coors_batch = [] 98 | for i, coor in enumerate(coors): 99 | coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) 100 | coors_batch.append(coor_pad) 101 | coors_batch = torch.cat(coors_batch, dim=0) 102 | return voxels, num_points, coors_batch 103 | 104 | def forward_train(self, 105 | points=None, 106 | img_metas=None, 107 | gt_bboxes_3d=None, 108 | gt_labels_3d=None, 109 | gt_labels=None, 110 | gt_bboxes=None, 111 | img=None, 112 | proposals=None, 113 | gt_bboxes_ignore=None): 114 | """Forward training function. 115 | 116 | Args: 117 | points (list[torch.Tensor], optional): Points of each sample. 118 | Defaults to None. 119 | img_metas (list[dict], optional): Meta information of each sample. 120 | Defaults to None. 121 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): 122 | Ground truth 3D boxes. Defaults to None. 123 | gt_labels_3d (list[torch.Tensor], optional): Ground truth labels 124 | of 3D boxes. Defaults to None. 125 | gt_labels (list[torch.Tensor], optional): Ground truth labels 126 | of 2D boxes in images. Defaults to None. 127 | gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in 128 | images. Defaults to None. 129 | img (torch.Tensor optional): Images of each sample with shape 130 | (N, C, H, W). Defaults to None. 131 | proposals ([list[torch.Tensor], optional): Predicted proposals 132 | used for training Fast RCNN. Defaults to None. 133 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 134 | 2D boxes in images to be ignored. Defaults to None. 135 | 136 | Returns: 137 | dict: Losses of different branches. 138 | """ 139 | 140 | img_feats, pts_feats = self.extract_feat( 141 | points, img=img, img_metas=img_metas) 142 | losses = dict() 143 | if pts_feats or img_feats: 144 | losses_pts = self.forward_pts_train( 145 | pts_feats, img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore 146 | ) 147 | losses.update(losses_pts) 148 | return losses 149 | 150 | @force_fp32(apply_to=('pts_feats', 'img_feats')) 151 | def forward_pts_train(self, 152 | pts_feats, 153 | img_feats, 154 | gt_bboxes_3d, 155 | gt_labels_3d, 156 | img_metas, 157 | gt_bboxes_ignore=None): 158 | """Forward function for point cloud branch. 159 | 160 | Args: 161 | pts_feats (list[torch.Tensor]): Features of point cloud branch 162 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth 163 | boxes for each sample. 164 | gt_labels_3d (list[torch.Tensor]): Ground truth labels for 165 | boxes of each sampole 166 | img_metas (list[dict]): Meta information of samples. 167 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 168 | boxes to be ignored. Defaults to None. 169 | 170 | Returns: 171 | dict: Losses of each branch. 
172 | """ 173 | if pts_feats is None: 174 | pts_feats = [None] 175 | if img_feats is None: 176 | img_feats = [None] 177 | outs = self.pts_bbox_head(pts_feats, img_feats, img_metas) 178 | loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] 179 | losses = self.pts_bbox_head.loss(*loss_inputs) 180 | return losses 181 | 182 | def forward_test(self, 183 | points=None, 184 | img_metas=None, 185 | img=None, **kwargs): 186 | """ 187 | Args: 188 | points (list[torch.Tensor]): the outer list indicates test-time 189 | augmentations and inner torch.Tensor should have a shape NxC, 190 | which contains all points in the batch. 191 | img_metas (list[list[dict]]): the outer list indicates test-time 192 | augs (multiscale, flip, etc.) and the inner list indicates 193 | images in a batch 194 | img (list[torch.Tensor], optional): the outer 195 | list indicates test-time augmentations and inner 196 | torch.Tensor should have a shape NxCxHxW, which contains 197 | all images in the batch. Defaults to None. 198 | """ 199 | if points is None: 200 | points = [None] 201 | if img is None: 202 | img = [None] 203 | for var, name in [(points, 'points'), (img, 'img'), (img_metas, 'img_metas')]: 204 | if not isinstance(var, list): 205 | raise TypeError('{} must be a list, but got {}'.format( 206 | name, type(var))) 207 | 208 | return self.simple_test(points[0], img_metas[0], img[0], **kwargs) 209 | 210 | @force_fp32(apply_to=('x', 'x_img')) 211 | def simple_test_pts(self, x, x_img, img_metas, rescale=False): 212 | """Test function of point cloud branch.""" 213 | outs = self.pts_bbox_head(x, x_img, img_metas) 214 | bbox_list = self.pts_bbox_head.get_bboxes( 215 | outs, img_metas, rescale=rescale) 216 | bbox_results = [ 217 | bbox3d2result(bboxes, scores, labels) 218 | for bboxes, scores, labels in bbox_list 219 | ] 220 | return bbox_results 221 | 222 | def simple_test(self, points, img_metas, img=None, rescale=False): 223 | img_feats, pts_feats = self.extract_feat( 224 | points, img=img, img_metas=img_metas) 225 | if pts_feats is None: 226 | pts_feats = [None] 227 | if img_feats is None: 228 | img_feats = [None] 229 | 230 | bbox_list = [dict() for i in range(len(img_metas))] 231 | if (pts_feats or img_feats) and self.with_pts_bbox: 232 | bbox_pts = self.simple_test_pts( 233 | pts_feats, img_feats, img_metas, rescale=rescale) 234 | for result_dict, pts_bbox in zip(bbox_list, bbox_pts): 235 | result_dict['pts_bbox'] = pts_bbox 236 | if img_feats and self.with_img_bbox: 237 | bbox_img = self.simple_test_img( 238 | img_feats, img_metas, rescale=rescale) 239 | for result_dict, img_bbox in zip(bbox_list, bbox_img): 240 | result_dict['img_bbox'] = img_bbox 241 | return bbox_list 242 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/mome.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 5 | # Copyright (c) OpenMMLab. All rights reserved. 
6 | # ------------------------------------------------------------------------ 7 | 8 | import mmcv 9 | import copy 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import numpy as np 14 | 15 | from mmcv.runner import force_fp32, auto_fp16 16 | from mmdet.core import multi_apply 17 | from mmdet.models import DETECTORS 18 | from mmdet.models.builder import build_backbone 19 | from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result, 20 | merge_aug_bboxes_3d, show_result) 21 | from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector 22 | 23 | from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask 24 | from projects.mmdet3d_plugin import SPConvVoxelization 25 | 26 | 27 | @DETECTORS.register_module() 28 | class MoME(MVXTwoStageDetector): 29 | 30 | def __init__(self, 31 | use_grid_mask=False, 32 | **kwargs): 33 | pts_voxel_cfg = kwargs.get('pts_voxel_layer', None) 34 | kwargs['pts_voxel_layer'] = None 35 | super(MoME, self).__init__(**kwargs) 36 | 37 | self.use_grid_mask = use_grid_mask 38 | self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) 39 | if pts_voxel_cfg: 40 | self.pts_voxel_layer = SPConvVoxelization(**pts_voxel_cfg) 41 | 42 | def init_weights(self): 43 | """Initialize model weights.""" 44 | super(MoME, self).init_weights() 45 | 46 | @auto_fp16(apply_to=('img'), out_fp32=True) 47 | def extract_img_feat(self, img, img_metas): 48 | """Extract features of images.""" 49 | if self.with_img_backbone and img is not None: 50 | input_shape = img.shape[-2:] 51 | # update real input shape of each single img 52 | for img_meta in img_metas: 53 | img_meta.update(input_shape=input_shape) 54 | 55 | if img.dim() == 5 and img.size(0) == 1: 56 | img.squeeze_(0) 57 | elif img.dim() == 5 and img.size(0) > 1: 58 | B, N, C, H, W = img.size() 59 | img = img.view(B * N, C, H, W) 60 | if self.use_grid_mask: 61 | img = self.grid_mask(img) 62 | img_feats = self.img_backbone(img.float()) 63 | if isinstance(img_feats, dict): 64 | img_feats = list(img_feats.values()) 65 | else: 66 | return None 67 | if self.with_img_neck: 68 | img_feats = self.img_neck(img_feats) 69 | return img_feats 70 | 71 | @force_fp32(apply_to=('pts', 'img_feats')) 72 | def extract_pts_feat(self, pts, img_feats, img_metas): 73 | """Extract features of points.""" 74 | if not self.with_pts_bbox: 75 | return None 76 | if pts is None: 77 | return None 78 | voxels, num_points, coors = self.voxelize(pts) 79 | voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, 80 | ) 81 | batch_size = coors[-1, 0] + 1 82 | x = self.pts_middle_encoder(voxel_features, coors, batch_size) 83 | x = self.pts_backbone(x) 84 | if self.with_pts_neck: 85 | x = self.pts_neck(x) 86 | return x 87 | 88 | @torch.no_grad() 89 | @force_fp32() 90 | def voxelize(self, points): 91 | """Apply dynamic voxelization to points. 92 | 93 | Args: 94 | points (list[torch.Tensor]): Points of each sample. 95 | 96 | Returns: 97 | tuple[torch.Tensor]: Concatenated points, number of points 98 | per voxel, and coordinates. 
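Like MEFormerDetector, MoME leaves extract_feat to the MVXTwoStageDetector base class, which chains the two branch extractors overridden above and returns them as an (img_feats, pts_feats) pair. A stub sketch of that contract follows; the class and tensor shapes are made up for illustration, and the real method lives in mmdet3d:

import torch

class TinyFusionDetector:
    # Stub mirroring the (img_feats, pts_feats) contract of MVXTwoStageDetector.

    def extract_img_feat(self, img, img_metas):
        return None if img is None else [torch.randn(1, 256, 20, 50)]

    def extract_pts_feat(self, pts, img_feats, img_metas):
        return None if pts is None else [torch.randn(1, 512, 180, 180)]

    def extract_feat(self, points, img, img_metas):
        img_feats = self.extract_img_feat(img, img_metas)
        pts_feats = self.extract_pts_feat(points, img_feats, img_metas)
        return img_feats, pts_feats

det = TinyFusionDetector()
img_feats, pts_feats = det.extract_feat(points=None, img=torch.zeros(1), img_metas=[{}])
print(img_feats is not None, pts_feats)  # True None -> camera-only sample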
99 | """ 100 | voxels, coors, num_points = [], [], [] 101 | for res in points: 102 | res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) 103 | voxels.append(res_voxels) 104 | coors.append(res_coors) 105 | num_points.append(res_num_points) 106 | voxels = torch.cat(voxels, dim=0) 107 | num_points = torch.cat(num_points, dim=0) 108 | coors_batch = [] 109 | for i, coor in enumerate(coors): 110 | coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) 111 | coors_batch.append(coor_pad) 112 | coors_batch = torch.cat(coors_batch, dim=0) 113 | return voxels, num_points, coors_batch 114 | 115 | def forward_train(self, 116 | points=None, 117 | img_metas=None, 118 | gt_bboxes_3d=None, 119 | gt_labels_3d=None, 120 | gt_labels=None, 121 | gt_bboxes=None, 122 | img=None, 123 | proposals=None, 124 | gt_bboxes_ignore=None): 125 | """Forward training function. 126 | 127 | Args: 128 | points (list[torch.Tensor], optional): Points of each sample. 129 | Defaults to None. 130 | img_metas (list[dict], optional): Meta information of each sample. 131 | Defaults to None. 132 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): 133 | Ground truth 3D boxes. Defaults to None. 134 | gt_labels_3d (list[torch.Tensor], optional): Ground truth labels 135 | of 3D boxes. Defaults to None. 136 | gt_labels (list[torch.Tensor], optional): Ground truth labels 137 | of 2D boxes in images. Defaults to None. 138 | gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in 139 | images. Defaults to None. 140 | img (torch.Tensor optional): Images of each sample with shape 141 | (N, C, H, W). Defaults to None. 142 | proposals ([list[torch.Tensor], optional): Predicted proposals 143 | used for training Fast RCNN. Defaults to None. 144 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 145 | 2D boxes in images to be ignored. Defaults to None. 146 | 147 | Returns: 148 | dict: Losses of different branches. 149 | """ 150 | 151 | img_feats, pts_feats = self.extract_feat( 152 | points, img=img, img_metas=img_metas) 153 | losses = dict() 154 | if pts_feats or img_feats: 155 | losses_pts = self.forward_pts_train(pts_feats, img_feats, gt_bboxes_3d, 156 | gt_labels_3d, img_metas, 157 | gt_bboxes_ignore) 158 | losses.update(losses_pts) 159 | return losses 160 | 161 | @force_fp32(apply_to=('pts_feats', 'img_feats')) 162 | def forward_pts_train(self, 163 | pts_feats, 164 | img_feats, 165 | gt_bboxes_3d, 166 | gt_labels_3d, 167 | img_metas, 168 | gt_bboxes_ignore=None): 169 | """Forward function for point cloud branch. 170 | 171 | Args: 172 | pts_feats (list[torch.Tensor]): Features of point cloud branch 173 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth 174 | boxes for each sample. 175 | gt_labels_3d (list[torch.Tensor]): Ground truth labels for 176 | boxes of each sampole 177 | img_metas (list[dict]): Meta information of samples. 178 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 179 | boxes to be ignored. Defaults to None. 180 | 181 | Returns: 182 | dict: Losses of each branch. 
183 | """ 184 | if pts_feats is None: 185 | pts_feats = [None] 186 | if img_feats is None: 187 | img_feats = [None] 188 | outs = self.pts_bbox_head(pts_feats, img_feats, img_metas) 189 | loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] 190 | losses = self.pts_bbox_head.loss(*loss_inputs) 191 | return losses 192 | 193 | def forward_test(self, 194 | points=None, 195 | img_metas=None, 196 | img=None, **kwargs): 197 | """ 198 | Args: 199 | points (list[torch.Tensor]): the outer list indicates test-time 200 | augmentations and inner torch.Tensor should have a shape NxC, 201 | which contains all points in the batch. 202 | img_metas (list[list[dict]]): the outer list indicates test-time 203 | augs (multiscale, flip, etc.) and the inner list indicates 204 | images in a batch 205 | img (list[torch.Tensor], optional): the outer 206 | list indicates test-time augmentations and inner 207 | torch.Tensor should have a shape NxCxHxW, which contains 208 | all images in the batch. Defaults to None. 209 | """ 210 | if points is None: 211 | points = [None] 212 | if img is None: 213 | img = [None] 214 | for var, name in [(points, 'points'), (img, 'img'), (img_metas, 'img_metas')]: 215 | if not isinstance(var, list): 216 | raise TypeError('{} must be a list, but got {}'.format( 217 | name, type(var))) 218 | 219 | return self.simple_test(points[0], img_metas[0], img[0], **kwargs) 220 | 221 | @force_fp32(apply_to=('x', 'x_img')) 222 | def simple_test_pts(self, x, x_img, img_metas, rescale=False): 223 | """Test function of point cloud branch.""" 224 | outs = self.pts_bbox_head(x, x_img, img_metas) 225 | bbox_list = self.pts_bbox_head.get_bboxes( 226 | outs, img_metas, rescale=rescale) 227 | bbox_results = [ 228 | bbox3d2result(bboxes, scores, labels) 229 | for bboxes, scores, labels in bbox_list 230 | ] 231 | return bbox_results 232 | 233 | def simple_test(self, points, img_metas, img=None, rescale=False): 234 | img_feats, pts_feats = self.extract_feat( 235 | points, img=img, img_metas=img_metas) 236 | if pts_feats is None: 237 | pts_feats = [None] 238 | if img_feats is None: 239 | img_feats = [None] 240 | 241 | bbox_list = [dict() for i in range(len(img_metas))] 242 | if (pts_feats or img_feats) and self.with_pts_bbox: 243 | bbox_pts = self.simple_test_pts( 244 | pts_feats, img_feats, img_metas, rescale=rescale) 245 | for result_dict, pts_bbox in zip(bbox_list, bbox_pts): 246 | result_dict['pts_bbox'] = pts_bbox 247 | if img_feats and self.with_img_bbox: 248 | bbox_img = self.simple_test_img( 249 | img_feats, img_metas, rescale=rescale) 250 | for result_dict, img_bbox in zip(bbox_list, bbox_img): 251 | result_dict['img_bbox'] = img_bbox 252 | return bbox_list 253 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/pipelines/dbsampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import copy 3 | import os 4 | 5 | import mmcv 6 | import numpy as np 7 | from mmdet3d.core.bbox import box_np_ops 8 | from mmdet3d.datasets import PIPELINES 9 | from mmdet3d.datasets.builder import OBJECTSAMPLERS 10 | from mmdet3d.datasets.pipelines import data_augment_utils 11 | from mmdet3d.datasets.pipelines.dbsampler import BatchSampler 12 | 13 | 14 | @OBJECTSAMPLERS.register_module() 15 | class UnifiedDataBaseSampler(object): 16 | """Class for sampling data from the ground truth database. 17 | 18 | Args: 19 | info_path (str): Path of groundtruth database info. 
20 | data_root (str): Path of groundtruth database. 21 | rate (float): Rate of actual sampled over maximum sampled number. 22 | prepare (dict): Name of preparation functions and the input value. 23 | sample_groups (dict): Sampled classes and numbers. 24 | classes (list[str]): List of classes. Default: None. 25 | points_loader(dict): Config of points loader. Default: dict( 26 | type='LoadPointsFromFile', load_dim=4, use_dim=[0,1,2,3]) 27 | """ 28 | 29 | def __init__(self, 30 | info_path, 31 | data_root, 32 | rate, 33 | prepare, 34 | sample_groups, 35 | classes=None, 36 | points_loader=dict( 37 | type='LoadPointsFromFile', 38 | coord_type='LIDAR', 39 | load_dim=4, 40 | use_dim=[0, 1, 2, 3])): 41 | super().__init__() 42 | self.data_root = data_root 43 | self.info_path = info_path 44 | self.rate = rate 45 | self.prepare = prepare 46 | self.classes = classes 47 | self.cat2label = {name: i for i, name in enumerate(classes)} 48 | self.label2cat = {i: name for i, name in enumerate(classes)} 49 | self.points_loader = mmcv.build_from_cfg(points_loader, PIPELINES) 50 | 51 | db_infos = mmcv.load(info_path) 52 | 53 | # filter database infos 54 | from mmdet3d.utils import get_root_logger 55 | logger = get_root_logger() 56 | for k, v in db_infos.items(): 57 | logger.info(f'load {len(v)} {k} database infos') 58 | for prep_func, val in prepare.items(): 59 | db_infos = getattr(self, prep_func)(db_infos, val) 60 | logger.info('After filter database:') 61 | for k, v in db_infos.items(): 62 | logger.info(f'load {len(v)} {k} database infos') 63 | 64 | self.db_infos = db_infos 65 | 66 | # load sample groups 67 | # TODO: more elegant way to load sample groups 68 | self.sample_groups = [] 69 | for name, num in sample_groups.items(): 70 | self.sample_groups.append({name: int(num)}) 71 | 72 | self.group_db_infos = self.db_infos # just use db_infos 73 | self.sample_classes = [] 74 | self.sample_max_nums = [] 75 | for group_info in self.sample_groups: 76 | self.sample_classes += list(group_info.keys()) 77 | self.sample_max_nums += list(group_info.values()) 78 | 79 | self.sampler_dict = {} 80 | for k, v in self.group_db_infos.items(): 81 | self.sampler_dict[k] = BatchSampler(v, k, shuffle=True) 82 | # TODO: No group_sampling currently 83 | 84 | @staticmethod 85 | def filter_by_difficulty(db_infos, removed_difficulty): 86 | """Filter ground truths by difficulties. 87 | 88 | Args: 89 | db_infos (dict): Info of groundtruth database. 90 | removed_difficulty (list): Difficulties that are not qualified. 91 | 92 | Returns: 93 | dict: Info of database after filtering. 94 | """ 95 | new_db_infos = {} 96 | for key, dinfos in db_infos.items(): 97 | new_db_infos[key] = [ 98 | info for info in dinfos 99 | if info['difficulty'] not in removed_difficulty 100 | ] 101 | return new_db_infos 102 | 103 | @staticmethod 104 | def filter_by_min_points(db_infos, min_gt_points_dict): 105 | """Filter ground truths by number of points in the bbox. 106 | 107 | Args: 108 | db_infos (dict): Info of groundtruth database. 109 | min_gt_points_dict (dict): Different number of minimum points 110 | needed for different categories of ground truths. 111 | 112 | Returns: 113 | dict: Info of database after filtering. 
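A sampler of this kind is normally built from a config dict through the OBJECTSAMPLERS registry. The sketch below shows the shape such a config might take; the paths, class list, minimum point counts and per-class sample numbers are placeholders rather than values taken from this repository's configs:

# Hypothetical GT-sampling config (illustrative only).
db_sampler = dict(
    type='UnifiedDataBaseSampler',
    data_root='data/nuscenes/',
    info_path='data/nuscenes/nuscenes_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(car=5, pedestrian=5)),
    classes=['car', 'pedestrian'],
    sample_groups=dict(car=2, pedestrian=2),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4]))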
114 | """ 115 | for name, min_num in min_gt_points_dict.items(): 116 | min_num = int(min_num) 117 | if min_num > 0: 118 | filtered_infos = [] 119 | for info in db_infos[name]: 120 | if info['num_points_in_gt'] >= min_num: 121 | filtered_infos.append(info) 122 | db_infos[name] = filtered_infos 123 | return db_infos 124 | 125 | def sample_all(self, gt_bboxes, gt_labels, with_img=False): 126 | """Sampling all categories of bboxes. 127 | 128 | Args: 129 | gt_bboxes (np.ndarray): Ground truth bounding boxes. 130 | gt_labels (np.ndarray): Ground truth labels of boxes. 131 | 132 | Returns: 133 | dict: Dict of sampled 'pseudo ground truths'. 134 | 135 | - gt_labels_3d (np.ndarray): ground truths labels \ 136 | of sampled objects. 137 | - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): \ 138 | sampled ground truth 3D bounding boxes 139 | - points (np.ndarray): sampled points 140 | - group_ids (np.ndarray): ids of sampled ground truths 141 | """ 142 | sampled_num_dict = {} 143 | sample_num_per_class = [] 144 | 145 | for class_name, max_sample_num in zip(self.sample_classes, 146 | self.sample_max_nums): 147 | class_label = self.cat2label[class_name] 148 | sampled_num = int(max_sample_num - 149 | np.sum([n == class_label for n in gt_labels])) 150 | sampled_num = np.round(self.rate * sampled_num).astype(np.int64) 151 | sampled_num_dict[class_name] = sampled_num 152 | sample_num_per_class.append(sampled_num) 153 | 154 | sampled = [] 155 | sampled_gt_bboxes = [] 156 | avoid_coll_boxes = gt_bboxes 157 | 158 | for class_name, sampled_num in zip(self.sample_classes, 159 | sample_num_per_class): 160 | if sampled_num > 0: 161 | sampled_cls = self.sample_class_v2(class_name, sampled_num, 162 | avoid_coll_boxes) 163 | 164 | sampled += sampled_cls 165 | if len(sampled_cls) > 0: 166 | if len(sampled_cls) == 1: 167 | sampled_gt_box = sampled_cls[0]['box3d_lidar'][ 168 | np.newaxis, ...] 
169 | else: 170 | sampled_gt_box = np.stack( 171 | [s['box3d_lidar'] for s in sampled_cls], axis=0) 172 | 173 | sampled_gt_bboxes += [sampled_gt_box] 174 | avoid_coll_boxes = np.concatenate( 175 | [avoid_coll_boxes, sampled_gt_box], axis=0) 176 | 177 | ret = None 178 | if len(sampled) > 0: 179 | sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0) 180 | s_points_list = [] 181 | s_idx_list = [] 182 | s_imgs_list = [] 183 | count = 0 184 | for info in sampled: 185 | file_path = os.path.join( 186 | self.data_root, 187 | info['path']) if self.data_root else info['path'] 188 | results = dict(pts_filename=file_path) 189 | if 'nori_id' in info: 190 | results['pts_nori_path'] = info['nori_id'] 191 | s_points = self.points_loader(results)['points'] 192 | s_points.translate(info['box3d_lidar'][:3]) 193 | idx_points = count * np.ones(len(s_points), dtype=np.int) 194 | s_points_list.append(s_points) 195 | s_idx_list.append(idx_points) 196 | count += 1 197 | if with_img: 198 | if len(info['image_path']) > 0: 199 | img_path = os.path.join( 200 | self.data_root, 201 | info['image_path']) if self.data_root else info['image_path'] 202 | s_img = mmcv.imread(img_path, 'unchanged') 203 | else: 204 | s_img = [] 205 | s_imgs_list.append(s_img) 206 | 207 | gt_labels = np.array([self.cat2label[s['name']] for s in sampled], 208 | dtype=np.long) 209 | ret = { 210 | 'gt_labels_3d': 211 | gt_labels, 212 | 'gt_bboxes_3d': 213 | sampled_gt_bboxes, 214 | 'points': 215 | s_points_list[0].cat(s_points_list), 216 | "points_idx": 217 | np.concatenate(s_idx_list, axis=0), 218 | 'images': 219 | s_imgs_list, 220 | 'group_ids': 221 | np.arange(gt_bboxes.shape[0], 222 | gt_bboxes.shape[0] + len(sampled)) 223 | } 224 | 225 | return ret 226 | 227 | def sample_class_v2(self, name, num, gt_bboxes): 228 | """Sampling specific categories of bounding boxes. 229 | 230 | Args: 231 | name (str): Class of objects to be sampled. 232 | num (int): Number of sampled bboxes. 233 | gt_bboxes (np.ndarray): Ground truth boxes. 234 | 235 | Returns: 236 | list[dict]: Valid samples after collision test. 237 | """ 238 | sampled = self.sampler_dict[name].sample(num) 239 | sampled = copy.deepcopy(sampled) 240 | num_gt = gt_bboxes.shape[0] 241 | num_sampled = len(sampled) 242 | gt_bboxes_bv = box_np_ops.center_to_corner_box2d( 243 | gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6]) 244 | 245 | sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) 246 | boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy() 247 | 248 | sp_boxes_new = boxes[gt_bboxes.shape[0]:] 249 | sp_boxes_bv = box_np_ops.center_to_corner_box2d( 250 | sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6]) 251 | 252 | total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) 253 | coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) 254 | diag = np.arange(total_bv.shape[0]) 255 | coll_mat[diag, diag] = False 256 | 257 | valid_samples = [] 258 | for i in range(num_gt, num_gt + num_sampled): 259 | if coll_mat[i].any(): 260 | coll_mat[i] = False 261 | coll_mat[:, i] = False 262 | else: 263 | valid_samples.append(sampled[i - num_gt]) 264 | return valid_samples 265 | -------------------------------------------------------------------------------- /tools/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
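sample_class_v2 above keeps only those pasted boxes whose bird's-eye-view footprints do not collide with the boxes already in the scene, using rotated corners from box_np_ops and data_augment_utils. The rejection logic can be conveyed with a much simpler axis-aligned stand-in; this sketch only illustrates the masking idea, not the actual rotated-box test:

import numpy as np

def aabb_overlap(a, b):
    # Axis-aligned BEV overlap test for boxes given as (cx, cy, w, l).
    return (abs(a[0] - b[0]) * 2 < a[2] + b[2]) and (abs(a[1] - b[1]) * 2 < a[3] + b[3])

gt = np.array([[0.0, 0.0, 2.0, 4.0]])            # one existing box
candidates = np.array([[0.5, 0.5, 2.0, 4.0],     # overlaps the GT -> rejected
                       [10.0, 10.0, 2.0, 4.0]])  # far away -> kept

kept = [c for c in candidates if not any(aabb_overlap(c, g) for g in gt)]
print(len(kept))  # 1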
2 | import argparse 3 | import os 4 | import warnings 5 | 6 | import mmcv 7 | import mmdet 8 | import torch 9 | from mmcv import Config, DictAction 10 | from mmcv.cnn import fuse_conv_bn 11 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 12 | from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, 13 | wrap_fp16_model) 14 | from mmdet.apis import multi_gpu_test, set_random_seed 15 | from mmdet.datasets import replace_ImageToTensor 16 | from mmdet3d.apis import single_gpu_test 17 | from mmdet3d.datasets import build_dataloader, build_dataset 18 | from mmdet3d.models import build_model 19 | 20 | if mmdet.__version__ > '2.23.0': 21 | # If mmdet version > 2.23.0, setup_multi_processes would be imported and 22 | # used from mmdet instead of mmdet3d. 23 | from mmdet.utils import setup_multi_processes 24 | else: 25 | from mmdet3d.utils import setup_multi_processes 26 | 27 | try: 28 | # If mmdet version > 2.23.0, compat_cfg would be imported and 29 | # used from mmdet instead of mmdet3d. 30 | from mmdet.utils import compat_cfg 31 | except ImportError: 32 | from mmdet3d.utils import compat_cfg 33 | 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser( 37 | description='MMDet test (and eval) a model') 38 | parser.add_argument('config', help='test config file path') 39 | parser.add_argument('checkpoint', help='checkpoint file') 40 | parser.add_argument('--out', help='output result file in pickle format') 41 | parser.add_argument( 42 | '--fuse-conv-bn', 43 | action='store_true', 44 | help='Whether to fuse conv and bn, this will slightly increase' 45 | 'the inference speed') 46 | parser.add_argument( 47 | '--gpu-ids', 48 | type=int, 49 | nargs='+', 50 | help='(Deprecated, please use --gpu-id) ids of gpus to use ' 51 | '(only applicable to non-distributed training)') 52 | parser.add_argument( 53 | '--gpu-id', 54 | type=int, 55 | default=0, 56 | help='id of gpu to use ' 57 | '(only applicable to non-distributed testing)') 58 | parser.add_argument( 59 | '--format-only', 60 | action='store_true', 61 | help='Format the output results without perform evaluation. It is' 62 | 'useful when you want to format the result to a specific format and ' 63 | 'submit it to the test server') 64 | parser.add_argument( 65 | '--eval', 66 | type=str, 67 | nargs='+', 68 | help='evaluation metrics, which depends on the dataset, e.g., "bbox",' 69 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 70 | parser.add_argument('--show', action='store_true', help='show results') 71 | parser.add_argument( 72 | '--show-dir', help='directory where results will be saved') 73 | parser.add_argument( 74 | '--gpu-collect', 75 | action='store_true', 76 | help='whether to use gpu to collect results.') 77 | parser.add_argument( 78 | '--tmpdir', 79 | help='tmp directory used for collecting results from multiple ' 80 | 'workers, available when gpu-collect is not specified') 81 | parser.add_argument('--seed', type=int, default=0, help='random seed') 82 | parser.add_argument( 83 | '--deterministic', 84 | action='store_true', 85 | help='whether to set deterministic options for CUDNN backend.') 86 | parser.add_argument( 87 | '--cfg-options', 88 | nargs='+', 89 | action=DictAction, 90 | help='override some settings in the used config, the key-value pair ' 91 | 'in xxx=yyy format will be merged into config file. If the value to ' 92 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 93 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 94 | 'Note that the quotation marks are necessary and that no white space ' 95 | 'is allowed.') 96 | parser.add_argument( 97 | '--options', 98 | nargs='+', 99 | action=DictAction, 100 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 101 | 'format will be kwargs for dataset.evaluate() function (deprecate), ' 102 | 'change to --eval-options instead.') 103 | parser.add_argument( 104 | '--eval-options', 105 | nargs='+', 106 | action=DictAction, 107 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 108 | 'format will be kwargs for dataset.evaluate() function') 109 | parser.add_argument( 110 | '--launcher', 111 | choices=['none', 'pytorch', 'slurm', 'mpi'], 112 | default='none', 113 | help='job launcher') 114 | parser.add_argument('--local_rank', type=int, default=0) 115 | args = parser.parse_args() 116 | if 'LOCAL_RANK' not in os.environ: 117 | os.environ['LOCAL_RANK'] = str(args.local_rank) 118 | 119 | if args.options and args.eval_options: 120 | raise ValueError( 121 | '--options and --eval-options cannot be both specified, ' 122 | '--options is deprecated in favor of --eval-options') 123 | if args.options: 124 | warnings.warn('--options is deprecated in favor of --eval-options') 125 | args.eval_options = args.options 126 | return args 127 | 128 | 129 | def main(): 130 | args = parse_args() 131 | 132 | assert args.out or args.eval or args.format_only or args.show \ 133 | or args.show_dir, \ 134 | ('Please specify at least one operation (save/eval/format/show the ' 135 | 'results / save the results) with the argument "--out", "--eval"' 136 | ', "--format-only", "--show" or "--show-dir"') 137 | 138 | if args.eval and args.format_only: 139 | raise ValueError('--eval and --format_only cannot be both specified') 140 | 141 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 142 | raise ValueError('The output file must be a pkl file.') 143 | 144 | cfg = Config.fromfile(args.config) 145 | if args.cfg_options is not None: 146 | cfg.merge_from_dict(args.cfg_options) 147 | 148 | # import modules from string list. 149 | if cfg.get('custom_imports', None): 150 | from mmcv.utils import import_modules_from_strings 151 | import_modules_from_strings(**cfg['custom_imports']) 152 | 153 | # import modules from plguin/xx, registry will be updated 154 | if hasattr(cfg, 'plugin'): 155 | if cfg.plugin: 156 | import importlib 157 | if hasattr(cfg, 'plugin_dir'): 158 | plugin_dir = cfg.plugin_dir 159 | _module_dir = os.path.dirname(plugin_dir) 160 | _module_dir = _module_dir.split('/') 161 | _module_path = _module_dir[0] 162 | 163 | for m in _module_dir[1:]: 164 | _module_path = _module_path + '.' + m 165 | print(_module_path) 166 | plg_lib = importlib.import_module(_module_path) 167 | else: 168 | # import dir is the dirpath for the config file 169 | _module_dir = os.path.dirname(args.config) 170 | _module_dir = _module_dir.split('/') 171 | _module_path = _module_dir[0] 172 | for m in _module_dir[1:]: 173 | _module_path = _module_path + '.' 
+ m 174 | print(_module_path) 175 | plg_lib = importlib.import_module(_module_path) 176 | 177 | cfg = compat_cfg(cfg) 178 | 179 | # set multi-process settings 180 | setup_multi_processes(cfg) 181 | 182 | # set cudnn_benchmark 183 | if cfg.get('cudnn_benchmark', False): 184 | torch.backends.cudnn.benchmark = True 185 | 186 | cfg.model.pretrained = None 187 | 188 | if args.gpu_ids is not None: 189 | cfg.gpu_ids = args.gpu_ids[0:1] 190 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 191 | 'Because we only support single GPU mode in ' 192 | 'non-distributed testing. Use the first GPU ' 193 | 'in `gpu_ids` now.') 194 | else: 195 | cfg.gpu_ids = [args.gpu_id] 196 | 197 | # init distributed env first, since logger depends on the dist info. 198 | if args.launcher == 'none': 199 | distributed = False 200 | else: 201 | distributed = True 202 | init_dist(args.launcher, **cfg.dist_params) 203 | 204 | test_dataloader_default_args = dict( 205 | samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False) 206 | 207 | # in case the test dataset is concatenated 208 | if isinstance(cfg.data.test, dict): 209 | cfg.data.test.test_mode = True 210 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: 211 | # Replace 'ImageToTensor' to 'DefaultFormatBundle' 212 | cfg.data.test.pipeline = replace_ImageToTensor( 213 | cfg.data.test.pipeline) 214 | elif isinstance(cfg.data.test, list): 215 | for ds_cfg in cfg.data.test: 216 | ds_cfg.test_mode = True 217 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: 218 | for ds_cfg in cfg.data.test: 219 | ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) 220 | 221 | test_loader_cfg = { 222 | **test_dataloader_default_args, 223 | **cfg.data.get('test_dataloader', {}) 224 | } 225 | 226 | # set random seeds 227 | if args.seed is not None: 228 | set_random_seed(args.seed, deterministic=args.deterministic) 229 | 230 | # build the dataloader 231 | dataset = build_dataset(cfg.data.test) 232 | data_loader = build_dataloader(dataset, **test_loader_cfg) 233 | 234 | # build the model and load checkpoint 235 | cfg.model.train_cfg = None 236 | model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) 237 | fp16_cfg = cfg.get('fp16', None) 238 | if fp16_cfg is not None: 239 | wrap_fp16_model(model) 240 | if cfg.get('optimizer_config', None) is not None and cfg.optimizer_config['type'] == 'CustomFp16OptimizerHook': 241 | wrap_fp16_model(model) 242 | for module_name, v in cfg.optimizer_config['custom_fp16'].items(): 243 | model._modules[module_name].fp16_enabled = v 244 | 245 | checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') 246 | if args.fuse_conv_bn: 247 | model = fuse_conv_bn(model) 248 | # old versions did not save class info in checkpoints, this walkaround is 249 | # for backward compatibility 250 | if 'CLASSES' in checkpoint.get('meta', {}): 251 | model.CLASSES = checkpoint['meta']['CLASSES'] 252 | else: 253 | model.CLASSES = dataset.CLASSES 254 | # palette for visualization in segmentation tasks 255 | if 'PALETTE' in checkpoint.get('meta', {}): 256 | model.PALETTE = checkpoint['meta']['PALETTE'] 257 | elif hasattr(dataset, 'PALETTE'): 258 | # segmentation dataset has `PALETTE` attribute 259 | model.PALETTE = dataset.PALETTE 260 | 261 | if not distributed: 262 | model = MMDataParallel(model, device_ids=cfg.gpu_ids) 263 | outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) 264 | else: 265 | model = MMDistributedDataParallel( 266 | model.cuda(), 267 | 
device_ids=[torch.cuda.current_device()], 268 | broadcast_buffers=False) 269 | outputs = multi_gpu_test(model, data_loader, args.tmpdir, 270 | args.gpu_collect) 271 | 272 | rank, _ = get_dist_info() 273 | if rank == 0: 274 | if args.out: 275 | print(f'\nwriting results to {args.out}') 276 | mmcv.dump(outputs, args.out) 277 | kwargs = {} if args.eval_options is None else args.eval_options 278 | if args.format_only: 279 | dataset.format_results(outputs, **kwargs) 280 | if args.eval: 281 | eval_kwargs = cfg.get('evaluation', {}).copy() 282 | # hard-code way to remove EvalHook args 283 | for key in [ 284 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 285 | 'rule' 286 | ]: 287 | eval_kwargs.pop(key, None) 288 | eval_kwargs.update(dict(metric=args.eval, **kwargs)) 289 | print(dataset.evaluate(outputs, **eval_kwargs)) 290 | 291 | 292 | if __name__ == '__main__': 293 | main() 294 | -------------------------------------------------------------------------------- /tools/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from __future__ import division 3 | 4 | import argparse 5 | import copy 6 | import datetime 7 | import os 8 | import time 9 | import warnings 10 | from os import path as osp 11 | 12 | import mmcv 13 | import torch 14 | import torch.distributed as dist 15 | from mmcv import Config, DictAction 16 | from mmcv.runner import get_dist_info, init_dist 17 | from mmdet import __version__ as mmdet_version 18 | from mmdet.apis import set_random_seed 19 | from mmdet3d import __version__ as mmdet3d_version 20 | from mmdet3d.apis import init_random_seed, train_model 21 | from mmdet3d.datasets import build_dataset 22 | from mmdet3d.models import build_model 23 | from mmdet3d.utils import collect_env, get_root_logger 24 | from mmseg import __version__ as mmseg_version 25 | 26 | try: 27 | # If mmdet version > 2.20.0, setup_multi_processes would be imported and 28 | # used from mmdet instead of mmdet3d. 
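Both tools/test.py and tools/train.py turn the configured plugin_dir (for example 'projects/mmdet3d_plugin/') into a dotted module path and import it, which is what registers the custom detectors, heads and datasets. The essential conversion can be sketched on its own; the helper name below is made up for illustration, and the commented call assumes the repository root is the working directory:

import importlib
import os

def import_plugin(plugin_dir):
    # Turn a path such as 'projects/mmdet3d_plugin/' into 'projects.mmdet3d_plugin' and import it.
    module_path = os.path.dirname(plugin_dir).replace('/', '.')
    return importlib.import_module(module_path)

# Example:
# plg_lib = import_plugin('projects/mmdet3d_plugin/')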
29 | from mmdet.utils import setup_multi_processes 30 | except ImportError: 31 | from mmdet3d.utils import setup_multi_processes 32 | 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser(description='Train a detector') 36 | parser.add_argument('config', help='train config file path') 37 | parser.add_argument('--work-dir', help='the dir to save logs and models') 38 | parser.add_argument( 39 | '--resume-from', help='the checkpoint file to resume from') 40 | parser.add_argument( 41 | '--auto-resume', 42 | action='store_true', 43 | help='resume from the latest checkpoint automatically') 44 | parser.add_argument( 45 | '--no-validate', 46 | action='store_true', 47 | help='whether not to evaluate the checkpoint during training') 48 | group_gpus = parser.add_mutually_exclusive_group() 49 | group_gpus.add_argument( 50 | '--gpus', 51 | type=int, 52 | help='(Deprecated, please use --gpu-id) number of gpus to use ' 53 | '(only applicable to non-distributed training)') 54 | group_gpus.add_argument( 55 | '--gpu-ids', 56 | type=int, 57 | nargs='+', 58 | help='(Deprecated, please use --gpu-id) ids of gpus to use ' 59 | '(only applicable to non-distributed training)') 60 | group_gpus.add_argument( 61 | '--gpu-id', 62 | type=int, 63 | default=0, 64 | help='number of gpus to use ' 65 | '(only applicable to non-distributed training)') 66 | parser.add_argument('--seed', type=int, default=None, help='random seed') 67 | parser.add_argument( 68 | '--diff-seed', 69 | action='store_true', 70 | help='Whether or not set different seeds for different ranks') 71 | parser.add_argument( 72 | '--deterministic', 73 | action='store_true', 74 | help='whether to set deterministic options for CUDNN backend.') 75 | parser.add_argument( 76 | '--options', 77 | nargs='+', 78 | action=DictAction, 79 | help='override some settings in the used config, the key-value pair ' 80 | 'in xxx=yyy format will be merged into config file (deprecate), ' 81 | 'change to --cfg-options instead.') 82 | parser.add_argument( 83 | '--cfg-options', 84 | nargs='+', 85 | action=DictAction, 86 | help='override some settings in the used config, the key-value pair ' 87 | 'in xxx=yyy format will be merged into config file. If the value to ' 88 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 89 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 90 | 'Note that the quotation marks are necessary and that no white space ' 91 | 'is allowed.') 92 | parser.add_argument( 93 | '--launcher', 94 | choices=['none', 'pytorch', 'slurm', 'mpi'], 95 | default='none', 96 | help='job launcher') 97 | parser.add_argument('--local_rank', type=int, default=0) 98 | parser.add_argument( 99 | '--autoscale-lr', 100 | action='store_true', 101 | help='automatically scale lr with the number of gpus') 102 | parser.add_argument( 103 | '--debug', 104 | action='store_true', 105 | default=False, 106 | help='flag for debugging') 107 | parser.add_argument( 108 | '--batch-size', 109 | type=int, 110 | default=None, 111 | required=False, 112 | help='batch size for training') 113 | args = parser.parse_args() 114 | if 'LOCAL_RANK' not in os.environ: 115 | os.environ['LOCAL_RANK'] = str(args.local_rank) 116 | 117 | if args.options and args.cfg_options: 118 | raise ValueError( 119 | '--options and --cfg-options cannot be both specified, ' 120 | '--options is deprecated in favor of --cfg-options') 121 | if args.options: 122 | warnings.warn('--options is deprecated in favor of --cfg-options') 123 | args.cfg_options = args.options 124 | 125 | return args 126 | 127 | 128 | def main(): 129 | args = parse_args() 130 | 131 | cfg = Config.fromfile(args.config) 132 | if args.cfg_options is not None: 133 | cfg.merge_from_dict(args.cfg_options) 134 | 135 | # set multi-process settings 136 | setup_multi_processes(cfg) 137 | 138 | if cfg.get('custom_imports', None): 139 | from mmcv.utils import import_modules_from_strings 140 | import_modules_from_strings(**cfg['custom_imports']) 141 | 142 | # import modules from plguin/xx, registry will be updated 143 | if hasattr(cfg, 'plugin'): 144 | if cfg.plugin: 145 | import importlib 146 | if hasattr(cfg, 'plugin_dir'): 147 | plugin_dir = cfg.plugin_dir 148 | _module_dir = os.path.dirname(plugin_dir) 149 | _module_dir = _module_dir.split('/') 150 | _module_path = _module_dir[0] 151 | 152 | for m in _module_dir[1:]: 153 | _module_path = _module_path + '.' + m 154 | print(_module_path) 155 | plg_lib = importlib.import_module(_module_path) 156 | else: 157 | # import dir is the dirpath for the config file 158 | _module_dir = os.path.dirname(args.config) 159 | _module_dir = _module_dir.split('/') 160 | _module_path = _module_dir[0] 161 | for m in _module_dir[1:]: 162 | _module_path = _module_path + '.' 
+ m 163 | print(_module_path) 164 | plg_lib = importlib.import_module(_module_path) 165 | 166 | plg_lib = importlib.import_module('mmdet3d') 167 | 168 | # set cudnn_benchmark 169 | if cfg.get('cudnn_benchmark', False): 170 | torch.backends.cudnn.benchmark = True 171 | 172 | # work_dir is determined in this priority: CLI > segment in file > filename 173 | if args.work_dir is not None: 174 | # update configs according to CLI args if args.work_dir is not None 175 | cfg.work_dir = args.work_dir 176 | elif cfg.get('work_dir', None) is None: 177 | # use config filename as default work_dir if cfg.work_dir is None 178 | cfg.work_dir = osp.join('./work_dirs', 179 | "debug" if args.debug else osp.splitext(osp.basename(args.config))[0]) 180 | if args.resume_from is not None: 181 | cfg.resume_from = args.resume_from 182 | 183 | if args.auto_resume: 184 | cfg.auto_resume = args.auto_resume 185 | warnings.warn('`--auto-resume` is only supported when mmdet' 186 | 'version >= 2.20.0 for 3D detection model or' 187 | 'mmsegmentation verision >= 0.21.0 for 3D' 188 | 'segmentation model') 189 | 190 | if args.gpus is not None: 191 | cfg.gpu_ids = range(1) 192 | warnings.warn('`--gpus` is deprecated because we only support ' 193 | 'single GPU mode in non-distributed training. ' 194 | 'Use `gpus=1` now.') 195 | if args.gpu_ids is not None: 196 | cfg.gpu_ids = args.gpu_ids[0:1] 197 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 198 | 'Because we only support single GPU mode in ' 199 | 'non-distributed training. Use the first GPU ' 200 | 'in `gpu_ids` now.') 201 | if args.gpus is None and args.gpu_ids is None: 202 | cfg.gpu_ids = [args.gpu_id] 203 | 204 | if args.autoscale_lr: 205 | # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) 206 | cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 207 | 208 | # init distributed env first, since logger depends on the dist info. 
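The --autoscale-lr branch above applies the linear scaling rule (https://arxiv.org/abs/1706.02677): the learning rate in the config is treated as tuned for 8 GPUs and is rescaled in proportion to the number of GPUs actually used. A worked example with a hypothetical base learning rate:

base_lr = 2e-4                       # hypothetical config value tuned for 8 GPUs
for num_gpus in (1, 4, 8, 16):
    print(num_gpus, base_lr * num_gpus / 8)
# 1 -> 2.5e-05, 4 -> 0.0001, 8 -> 0.0002, 16 -> 0.0004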
209 | if args.launcher == 'none': 210 | distributed = False 211 | cfg.work_dir = osp.join(cfg.work_dir, datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) 212 | world_size = 1 213 | else: 214 | distributed = True 215 | init_dist(args.launcher, **cfg.dist_params) 216 | # re-set gpu_ids with distributed training mode 217 | _, world_size = get_dist_info() 218 | cfg.gpu_ids = range(world_size) 219 | 220 | date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") 221 | date_gather = [None for _ in range(world_size)] 222 | dist.all_gather_object(date_gather, date) 223 | cfg.work_dir = osp.join(cfg.work_dir, str(date_gather[0])) 224 | 225 | if args.batch_size is not None: 226 | cfg.data["samples_per_gpu"] = args.batch_size // world_size 227 | 228 | # create work_dir 229 | mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) 230 | # init the logger before other steps 231 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 232 | log_file = osp.join(cfg.work_dir, f'{timestamp}.log') 233 | # specify logger name, if we still use 'mmdet', the output info will be 234 | # filtered and won't be saved in the log_file 235 | # TODO: ugly workaround to judge whether we are training det or seg model 236 | if cfg.model.type in ['EncoderDecoder3D']: 237 | logger_name = 'mmseg' 238 | else: 239 | logger_name = 'mmdet' 240 | logger = get_root_logger( 241 | log_file=log_file, log_level=cfg.log_level, name=logger_name) 242 | 243 | # init the meta dict to record some important information such as 244 | # environment info and seed, which will be logged 245 | meta = dict() 246 | # log env info 247 | env_info_dict = collect_env() 248 | env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) 249 | dash_line = '-' * 60 + '\n' 250 | logger.info('Environment info:\n' + dash_line + env_info + '\n' + 251 | dash_line) 252 | meta['env_info'] = env_info 253 | meta['config'] = cfg.pretty_text 254 | 255 | # log some basic info 256 | logger.info(f'Distributed training: {distributed}') 257 | logger.info(f'Config:\n{cfg.pretty_text}') 258 | 259 | # set random seeds 260 | seed = init_random_seed(args.seed) 261 | seed = seed + dist.get_rank() if args.diff_seed else seed 262 | logger.info(f'Set random seed to {seed}, ' 263 | f'deterministic: {args.deterministic}') 264 | set_random_seed(seed, deterministic=args.deterministic) 265 | cfg.seed = seed 266 | meta['seed'] = seed 267 | meta['exp_name'] = osp.basename(args.config) 268 | 269 | # dump config 270 | cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) 271 | 272 | model = build_model( 273 | cfg.model, 274 | train_cfg=cfg.get('train_cfg'), 275 | test_cfg=cfg.get('test_cfg')) 276 | model.init_weights() 277 | 278 | logger.info(f'Model:\n{model}') 279 | datasets = [build_dataset(cfg.data.train)] 280 | if len(cfg.workflow) == 2: 281 | val_dataset = copy.deepcopy(cfg.data.val) 282 | # in case we use a dataset wrapper 283 | if 'dataset' in cfg.data.train: 284 | val_dataset.pipeline = cfg.data.train.dataset.pipeline 285 | else: 286 | val_dataset.pipeline = cfg.data.train.pipeline 287 | # set test_mode=False here in deep copied config 288 | # which do not affect AP/AR calc ulation later 289 | # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa 290 | val_dataset.test_mode = False 291 | datasets.append(build_dataset(val_dataset)) 292 | if cfg.checkpoint_config is not None: 293 | # save mmdet version, config file content and class names in 294 | # checkpoints as meta data 295 | cfg.checkpoint_config.meta = 
dict( 296 | mmdet_version=mmdet_version, 297 | mmseg_version=mmseg_version, 298 | mmdet3d_version=mmdet3d_version, 299 | config=cfg.pretty_text, 300 | CLASSES=datasets[0].CLASSES, 301 | PALETTE=datasets[0].PALETTE # for segmentors 302 | if hasattr(datasets[0], 'PALETTE') else None) 303 | # add an attribute for visualization convenience 304 | model.CLASSES = datasets[0].CLASSES 305 | train_model( 306 | model, 307 | datasets, 308 | cfg, 309 | distributed=distributed, 310 | validate=(not args.no_validate), 311 | timestamp=timestamp, 312 | meta=meta) 313 | 314 | 315 | if __name__ == '__main__': 316 | main() 317 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/vovnet.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Copyright (c) Youngwan Lee (ETRI) All Rights Reserved. 8 | # Copyright 2021 Toyota Research Institute. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | import warnings 11 | from collections import OrderedDict 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | from mmcv.runner import BaseModule 17 | from mmdet.models.builder import BACKBONES 18 | from torch.nn.modules.batchnorm import _BatchNorm 19 | 20 | VoVNet19_slim_dw_eSE = { 21 | 'stem': [64, 64, 64], 22 | 'stage_conv_ch': [64, 80, 96, 112], 23 | 'stage_out_ch': [112, 256, 384, 512], 24 | "layer_per_block": 3, 25 | "block_per_stage": [1, 1, 1, 1], 26 | "eSE": True, 27 | "dw": True 28 | } 29 | 30 | VoVNet19_dw_eSE = { 31 | 'stem': [64, 64, 64], 32 | "stage_conv_ch": [128, 160, 192, 224], 33 | "stage_out_ch": [256, 512, 768, 1024], 34 | "layer_per_block": 3, 35 | "block_per_stage": [1, 1, 1, 1], 36 | "eSE": True, 37 | "dw": True 38 | } 39 | 40 | VoVNet19_slim_eSE = { 41 | 'stem': [64, 64, 128], 42 | 'stage_conv_ch': [64, 80, 96, 112], 43 | 'stage_out_ch': [112, 256, 384, 512], 44 | 'layer_per_block': 3, 45 | 'block_per_stage': [1, 1, 1, 1], 46 | 'eSE': True, 47 | "dw": False 48 | } 49 | 50 | VoVNet19_eSE = { 51 | 'stem': [64, 64, 128], 52 | "stage_conv_ch": [128, 160, 192, 224], 53 | "stage_out_ch": [256, 512, 768, 1024], 54 | "layer_per_block": 3, 55 | "block_per_stage": [1, 1, 1, 1], 56 | "eSE": True, 57 | "dw": False 58 | } 59 | 60 | VoVNet39_eSE = { 61 | 'stem': [64, 64, 128], 62 | "stage_conv_ch": [128, 160, 192, 224], 63 | "stage_out_ch": [256, 512, 768, 1024], 64 | "layer_per_block": 5, 65 | "block_per_stage": [1, 1, 2, 2], 66 | "eSE": True, 67 | "dw": False 68 | } 69 | 70 | VoVNet57_eSE = { 71 | 'stem': [64, 64, 128], 72 | "stage_conv_ch": [128, 160, 192, 224], 73 | "stage_out_ch": [256, 512, 768, 1024], 74 | "layer_per_block": 5, 75 | "block_per_stage": [1, 1, 4, 3], 76 | "eSE": True, 77 | "dw": False 78 | } 79 | 80 | VoVNet99_eSE = { 81 | 'stem': [64, 64, 128], 82 | "stage_conv_ch": [128, 160, 192, 224], 83 | "stage_out_ch": [256, 512, 768, 1024], 84 | "layer_per_block": 5, 85 | "block_per_stage": [1, 3, 9, 3], 86 | "eSE": True, 87 | "dw": False 88 | } 89 | 90 | _STAGE_SPECS = { 91 | "V-19-slim-dw-eSE": 
VoVNet19_slim_dw_eSE, 92 | "V-19-dw-eSE": VoVNet19_dw_eSE, 93 | "V-19-slim-eSE": VoVNet19_slim_eSE, 94 | "V-19-eSE": VoVNet19_eSE, 95 | "V-39-eSE": VoVNet39_eSE, 96 | "V-57-eSE": VoVNet57_eSE, 97 | "V-99-eSE": VoVNet99_eSE, 98 | } 99 | 100 | 101 | def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): 102 | """3x3 convolution with padding""" 103 | return [ 104 | ( 105 | '{}_{}/dw_conv3x3'.format(module_name, postfix), 106 | nn.Conv2d( 107 | in_channels, 108 | out_channels, 109 | kernel_size=kernel_size, 110 | stride=stride, 111 | padding=padding, 112 | groups=out_channels, 113 | bias=False 114 | ) 115 | ), 116 | ( 117 | '{}_{}/pw_conv1x1'.format(module_name, postfix), 118 | nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) 119 | ), 120 | ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), 121 | ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), 122 | ] 123 | 124 | 125 | def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): 126 | """3x3 convolution with padding""" 127 | return [ 128 | ( 129 | f"{module_name}_{postfix}/conv", 130 | nn.Conv2d( 131 | in_channels, 132 | out_channels, 133 | kernel_size=kernel_size, 134 | stride=stride, 135 | padding=padding, 136 | groups=groups, 137 | bias=False, 138 | ), 139 | ), 140 | (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), 141 | (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), 142 | ] 143 | 144 | 145 | def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): 146 | """1x1 convolution with padding""" 147 | return [ 148 | ( 149 | f"{module_name}_{postfix}/conv", 150 | nn.Conv2d( 151 | in_channels, 152 | out_channels, 153 | kernel_size=kernel_size, 154 | stride=stride, 155 | padding=padding, 156 | groups=groups, 157 | bias=False, 158 | ), 159 | ), 160 | (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), 161 | (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), 162 | ] 163 | 164 | 165 | class Hsigmoid(nn.Module): 166 | def __init__(self, inplace=True): 167 | super(Hsigmoid, self).__init__() 168 | self.inplace = inplace 169 | 170 | def forward(self, x): 171 | return F.relu6(x + 3.0, inplace=self.inplace) / 6.0 172 | 173 | 174 | class eSEModule(nn.Module): 175 | def __init__(self, channel, reduction=4): 176 | super(eSEModule, self).__init__() 177 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 178 | self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0) 179 | self.hsigmoid = Hsigmoid() 180 | 181 | def forward(self, x): 182 | input = x 183 | x = self.avg_pool(x) 184 | x = self.fc(x) 185 | x = self.hsigmoid(x) 186 | return input * x 187 | 188 | 189 | class _OSA_module(nn.Module): 190 | def __init__( 191 | self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False 192 | ): 193 | 194 | super(_OSA_module, self).__init__() 195 | 196 | self.identity = identity 197 | self.depthwise = depthwise 198 | self.isReduced = False 199 | self.layers = nn.ModuleList() 200 | in_channel = in_ch 201 | if self.depthwise and in_channel != stage_ch: 202 | self.isReduced = True 203 | self.conv_reduction = nn.Sequential( 204 | OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0")) 205 | ) 206 | for i in range(layer_per_block): 207 | if self.depthwise: 208 | self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, 
stage_ch, module_name, i)))) 209 | else: 210 | self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) 211 | in_channel = stage_ch 212 | 213 | # feature aggregation 214 | in_channel = in_ch + layer_per_block * stage_ch 215 | self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) 216 | 217 | self.ese = eSEModule(concat_ch) 218 | 219 | def forward(self, x): 220 | 221 | identity_feat = x 222 | 223 | output = [] 224 | output.append(x) 225 | if self.depthwise and self.isReduced: 226 | x = self.conv_reduction(x) 227 | for layer in self.layers: 228 | x = layer(x) 229 | output.append(x) 230 | 231 | x = torch.cat(output, dim=1) 232 | xt = self.concat(x) 233 | 234 | xt = self.ese(xt) 235 | 236 | if self.identity: 237 | xt = xt + identity_feat 238 | 239 | return xt 240 | 241 | 242 | class _OSA_stage(nn.Sequential): 243 | def __init__( 244 | self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False 245 | ): 246 | 247 | super(_OSA_stage, self).__init__() 248 | 249 | if not stage_num == 2: 250 | self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) 251 | 252 | if block_per_stage != 1: 253 | SE = False 254 | module_name = f"OSA{stage_num}_1" 255 | self.add_module( 256 | module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) 257 | ) 258 | for i in range(block_per_stage - 1): 259 | if i != block_per_stage - 2: # last block 260 | SE = False 261 | module_name = f"OSA{stage_num}_{i + 2}" 262 | self.add_module( 263 | module_name, 264 | _OSA_module( 265 | concat_ch, 266 | stage_ch, 267 | concat_ch, 268 | layer_per_block, 269 | module_name, 270 | SE, 271 | identity=True, 272 | depthwise=depthwise 273 | ), 274 | ) 275 | 276 | 277 | @BACKBONES.register_module() 278 | class VoVNet(BaseModule): 279 | def __init__(self, spec_name, input_ch=3, out_features=None, 280 | frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): 281 | """ 282 | Args: 283 | input_ch(int) : the number of input channel 284 | out_features (list[str]): name of the layers whose outputs should 285 | be returned in forward. Can be anything in "stem", "stage2" ... 
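The config at the end of this listing instantiates this backbone with spec_name='V-99-eSE' and out_features=('stage4', 'stage5'), whose channel widths (768 and 1024) match the CPFPN neck's in_channels. A short usage sketch, assuming the repository root is on PYTHONPATH and the pinned mmcv/mmdet/mmdet3d stack from requirements.txt is installed; the input resolution is an arbitrary small example, not the training resolution:

import torch
from projects.mmdet3d_plugin.models.backbones import VoVNet

backbone = VoVNet(spec_name='V-99-eSE', input_ch=3, out_features=('stage4', 'stage5'))
backbone.eval()

with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 256, 704))

for name, feat in feats.items():
    print(name, tuple(feat.shape))
# stage4 (1, 768, 16, 44)  -> stride 16
# stage5 (1, 1024, 8, 22)  -> stride 32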
286 | """ 287 | super(VoVNet, self).__init__(init_cfg) 288 | self.fp16_enabled = False 289 | self.frozen_stages = frozen_stages 290 | self.norm_eval = norm_eval 291 | 292 | if isinstance(pretrained, str): 293 | warnings.warn('DeprecationWarning: pretrained is deprecated, ' 294 | 'please use "init_cfg" instead') 295 | self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) 296 | stage_specs = _STAGE_SPECS[spec_name] 297 | 298 | stem_ch = stage_specs["stem"] 299 | config_stage_ch = stage_specs["stage_conv_ch"] 300 | config_concat_ch = stage_specs["stage_out_ch"] 301 | block_per_stage = stage_specs["block_per_stage"] 302 | layer_per_block = stage_specs["layer_per_block"] 303 | SE = stage_specs["eSE"] 304 | depthwise = stage_specs["dw"] 305 | 306 | self._out_features = out_features 307 | 308 | # Stem module 309 | conv_type = dw_conv3x3 if depthwise else conv3x3 310 | stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) 311 | stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) 312 | stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) 313 | self.add_module("stem", nn.Sequential((OrderedDict(stem)))) 314 | current_stirde = 4 315 | self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} 316 | self._out_feature_channels = {"stem": stem_ch[2]} 317 | 318 | stem_out_ch = [stem_ch[2]] 319 | in_ch_list = stem_out_ch + config_concat_ch[:-1] 320 | # OSA stages 321 | self.stage_names = [] 322 | for i in range(4): # num_stages 323 | name = "stage%d" % (i + 2) # stage 2 ... stage 5 324 | self.stage_names.append(name) 325 | self.add_module( 326 | name, 327 | _OSA_stage( 328 | in_ch_list[i], 329 | config_stage_ch[i], 330 | config_concat_ch[i], 331 | block_per_stage[i], 332 | layer_per_block, 333 | i + 2, 334 | SE, 335 | depthwise, 336 | ), 337 | ) 338 | 339 | self._out_feature_channels[name] = config_concat_ch[i] 340 | if not i == 0: 341 | self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) 342 | 343 | # initialize weights 344 | # self._initialize_weights() 345 | 346 | def _initialize_weights(self): 347 | for m in self.modules(): 348 | if isinstance(m, nn.Conv2d): 349 | nn.init.kaiming_normal_(m.weight) 350 | 351 | def init_weights(self): 352 | super().init_weights() 353 | self._freeze_stages() 354 | 355 | def forward(self, x): 356 | outputs = {} 357 | x = self.stem(x) 358 | if "stem" in self._out_features: 359 | outputs["stem"] = x 360 | for name in self.stage_names: 361 | x = getattr(self, name)(x) 362 | if name in self._out_features: 363 | outputs[name] = x 364 | 365 | return outputs 366 | 367 | def _freeze_stages(self): 368 | if self.frozen_stages >= 0: 369 | m = getattr(self, 'stem') 370 | m.eval() 371 | for param in m.parameters(): 372 | param.requires_grad = False 373 | 374 | for i in range(1, self.frozen_stages + 1): 375 | m = getattr(self, f'stage{i + 1}') 376 | m.eval() 377 | for param in m.parameters(): 378 | param.requires_grad = False 379 | 380 | def train(self, mode=True): 381 | """Convert the model into training mode while keep normalization layer 382 | freezed.""" 383 | super(VoVNet, self).train(mode) 384 | # self._freeze_stages() 385 | if mode and self.norm_eval: 386 | for m in self.modules(): 387 | # trick: eval have effect on BatchNorm only 388 | if isinstance(m, _BatchNorm): 389 | m.eval() 390 | -------------------------------------------------------------------------------- /projects/configs/meformer_voxel0075_vov_1600x640_cbgs.py: -------------------------------------------------------------------------------- 1 | plugin = 
True 2 | plugin_dir = 'projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.075, 0.075, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=1) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=True, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | 21 | img_norm_cfg = dict( 22 | mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) 23 | 24 | ida_aug_conf = { 25 | "resize_lim": (0.94, 1.25), 26 | "final_dim": (640, 1600), 27 | "bot_pct_lim": (0.0, 0.0), 28 | "rot_lim": (0.0, 0.0), 29 | "H": 900, 30 | "W": 1600, 31 | "rand_flip": True, 32 | } 33 | 34 | train_pipeline = [ 35 | dict( 36 | type='LoadPointsFromFile', 37 | coord_type='LIDAR', 38 | load_dim=5, 39 | use_dim=[0, 1, 2, 3, 4], 40 | ), 41 | dict( 42 | type='LoadPointsFromMultiSweeps', 43 | sweeps_num=10, 44 | use_dim=[0, 1, 2, 3, 4], 45 | ), 46 | dict(type='LoadMultiViewImageFromFiles'), 47 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 48 | dict( 49 | type='GlobalRotScaleTransAll', 50 | rot_range=[-0.3925 * 2, 0.3925 * 2], 51 | scale_ratio_range=[0.9, 1.1], 52 | translation_std=[0.5, 0.5, 0.5]), 53 | dict( 54 | type='CustomRandomFlip3D', 55 | sync_2d=False, 56 | flip_ratio_bev_horizontal=0.5, 57 | flip_ratio_bev_vertical=0.5), 58 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 59 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 60 | dict(type='ObjectNameFilter', classes=class_names), 61 | dict(type='PointShuffle'), 62 | dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=True), 63 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 64 | dict(type='PadMultiViewImage', size_divisor=32), 65 | dict(type='DefaultFormatBundle3D', class_names=class_names), 66 | dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'], 67 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 68 | 'depth2img', 'cam2img', 'pad_shape', 69 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 70 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 71 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 72 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 73 | 'transformation_3d_flow', 'rot_degree', 74 | 'gt_bboxes_3d', 'gt_labels_3d')) 75 | ] 76 | test_pipeline = [ 77 | dict( 78 | type='LoadPointsFromFile', 79 | coord_type='LIDAR', 80 | load_dim=5, 81 | use_dim=[0, 1, 2, 3, 4], 82 | ), 83 | dict( 84 | type='LoadPointsFromMultiSweeps', 85 | sweeps_num=10, 86 | use_dim=[0, 1, 2, 3, 4], 87 | ), 88 | dict(type='LoadMultiViewImageFromFiles'), 89 | dict( 90 | type='MultiScaleFlipAug3D', 91 | img_scale=(1333, 800), 92 | pts_scale_ratio=1, 93 | flip=False, 94 | transforms=[ 95 | dict( 96 | type='GlobalRotScaleTrans', 97 | rot_range=[0, 0], 98 | scale_ratio_range=[1.0, 1.0], 99 | translation_std=[0, 0, 0]), 100 | dict(type='RandomFlip3D'), 101 | dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=False), 102 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 103 | dict(type='PadMultiViewImage', size_divisor=32), 104 | dict( 105 | type='DefaultFormatBundle3D', 106 | class_names=class_names, 107 | with_label=False), 108 | dict(type='Collect3D', keys=['points', 'img']) 109 | ]) 110 
| ] 111 | data = dict( 112 | samples_per_gpu=4, 113 | workers_per_gpu=6, 114 | train=dict( 115 | type='CBGSDataset', 116 | dataset=dict( 117 | type=dataset_type, 118 | data_root=data_root, 119 | ann_file=data_root + '/nuscenes_infos_train.pkl', 120 | load_interval=1, 121 | pipeline=train_pipeline, 122 | classes=class_names, 123 | modality=input_modality, 124 | test_mode=False, 125 | box_type_3d='LiDAR')), 126 | val=dict( 127 | type=dataset_type, 128 | data_root=data_root, 129 | ann_file=data_root + '/nuscenes_infos_val.pkl', 130 | load_interval=1, 131 | pipeline=test_pipeline, 132 | classes=class_names, 133 | modality=input_modality, 134 | test_mode=True, 135 | box_type_3d='LiDAR'), 136 | test=dict( 137 | type=dataset_type, 138 | data_root=data_root, 139 | ann_file=data_root + '/nuscenes_infos_val.pkl', 140 | load_interval=1, 141 | pipeline=test_pipeline, 142 | classes=class_names, 143 | modality=input_modality, 144 | test_mode=True, 145 | box_type_3d='LiDAR')) 146 | model = dict( 147 | type='MEFormerDetector', 148 | use_grid_mask=True, 149 | img_backbone=dict( 150 | type='VoVNet', 151 | spec_name='V-99-eSE', 152 | norm_eval=True, 153 | frozen_stages=-1, 154 | input_ch=3, 155 | out_features=('stage4', 'stage5',)), 156 | img_neck=dict( 157 | type='CPFPN', 158 | in_channels=[768, 1024], 159 | out_channels=256, 160 | num_outs=2), 161 | pts_voxel_layer=dict( 162 | num_point_features=5, 163 | max_num_points=10, 164 | voxel_size=voxel_size, 165 | max_voxels=(120000, 160000), 166 | point_cloud_range=point_cloud_range), 167 | pts_voxel_encoder=dict( 168 | type='HardSimpleVFE', 169 | num_features=5, 170 | ), 171 | pts_middle_encoder=dict( 172 | type='SparseEncoder', 173 | in_channels=5, 174 | sparse_shape=[41, 1440, 1440], 175 | output_channels=128, 176 | order=('conv', 'norm', 'act'), 177 | encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), 178 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 179 | block_type='basicblock'), 180 | pts_backbone=dict( 181 | type='SECOND', 182 | in_channels=256, 183 | out_channels=[128, 256], 184 | layer_nums=[5, 5], 185 | layer_strides=[1, 2], 186 | norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), 187 | conv_cfg=dict(type='Conv2d', bias=False)), 188 | pts_neck=dict( 189 | type='SECONDFPN', 190 | in_channels=[128, 256], 191 | out_channels=[256, 256], 192 | upsample_strides=[1, 2], 193 | norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), 194 | upsample_cfg=dict(type='deconv', bias=False), 195 | use_conv_for_no_stride=True), 196 | pts_bbox_head=dict( 197 | type='MEFormerHead', 198 | in_channels=512, 199 | hidden_dim=256, 200 | downsample_scale=8, 201 | pc_range=point_cloud_range, 202 | use_ensemble=True, 203 | modalities=dict( 204 | train=["fused", "bev", "img"], 205 | test=["fused", "bev", "img"] 206 | ), 207 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 208 | tasks=[ 209 | dict(num_class=10, class_names=[ 210 | 'car', 'truck', 'construction_vehicle', 211 | 'bus', 'trailer', 'barrier', 212 | 'motorcycle', 'bicycle', 213 | 'pedestrian', 'traffic_cone' 214 | ]), 215 | ], 216 | bbox_coder=dict( 217 | type='MultiTaskBBoxCoder', 218 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 219 | pc_range=point_cloud_range, 220 | max_num=300, 221 | voxel_size=voxel_size, 222 | num_classes=10), 223 | separate_head=dict( 224 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=1), 225 | transformer=dict( 226 | type='MOADTransformer', 227 | use_cam_embed=True, 228 | decoder=dict( 229 
| type='PETRTransformerDecoder', 230 | return_intermediate=True, 231 | num_layers=6, 232 | transformerlayers=dict( 233 | type='PETRTransformerDecoderLayer', 234 | with_cp=False, 235 | attn_cfgs=[ 236 | dict( 237 | type='MultiheadAttention', 238 | embed_dims=256, 239 | num_heads=8, 240 | dropout=0.1), 241 | dict( 242 | type='PETRMultiheadFlashAttention', 243 | embed_dims=256, 244 | num_heads=8, 245 | dropout=0.1), 246 | ], 247 | ffn_cfgs=dict( 248 | type='FFN', 249 | embed_dims=256, 250 | feedforward_channels=1024, 251 | num_fcs=2, 252 | ffn_drop=0., 253 | act_cfg=dict(type='ReLU', inplace=True), 254 | ), 255 | 256 | feedforward_channels=1024, 257 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 258 | 'ffn', 'norm')), 259 | ), 260 | ), 261 | ensemble=dict( 262 | type='PMETransformer', 263 | decoder=dict( 264 | type="PETRTransformerDecoder", 265 | return_intermediate=True, 266 | num_layers=1, 267 | transformerlayers=dict( 268 | type='PETRTransformerDecoderLayer', 269 | with_cp=False, 270 | attn_cfgs=[ 271 | dict( 272 | type='MultiheadAttention', 273 | embed_dims=256, 274 | num_heads=8, 275 | dropout=0.1), 276 | ], 277 | ffn_cfgs=dict( 278 | type='FFN', 279 | embed_dims=256, 280 | feedforward_channels=1024, 281 | num_fcs=2, 282 | ffn_drop=0., 283 | act_cfg=dict(type='ReLU', inplace=True), 284 | ), 285 | 286 | feedforward_channels=1024, 287 | operation_order=('cross_attn', 'norm', 'ffn', 'norm') 288 | ), 289 | ), 290 | ), 291 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 292 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25) 293 | ), 294 | train_cfg=dict( 295 | pts=dict( 296 | dataset='nuScenes', 297 | assigner=dict( 298 | type='HungarianAssigner3D', 299 | cls_cost=dict(type='FocalLossCost', weight=2.0), 300 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 301 | iou_cost=dict(type='IoUCost', weight=0.0), 302 | pc_range=point_cloud_range, 303 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 304 | ), 305 | pos_weight=-1, 306 | gaussian_overlap=0.1, 307 | min_radius=2, 308 | grid_size=[1440, 1440, 40], 309 | voxel_size=voxel_size, 310 | out_size_factor=out_size_factor, 311 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 312 | point_cloud_range=point_cloud_range)), 313 | test_cfg=dict( 314 | pts=dict( 315 | dataset='nuScenes', 316 | grid_size=[1440, 1440, 40], 317 | out_size_factor=out_size_factor, 318 | pc_range=point_cloud_range, 319 | voxel_size=voxel_size, 320 | nms_type=None, 321 | nms_thr=0.2, 322 | use_rotate_nms=True, 323 | max_num=200 324 | ))) 325 | optimizer = dict( 326 | type='AdamW', 327 | lr=0.0001, 328 | paramwise_cfg=dict( 329 | custom_keys={ 330 | 'img_backbone': dict(lr_mult=0.01, decay_mult=5), 331 | 'img_neck': dict(lr_mult=0.1), 332 | }), 333 | weight_decay=0.01) 334 | optimizer_config = dict( 335 | type='CustomFp16OptimizerHook', 336 | loss_scale='dynamic', 337 | grad_clip=dict(max_norm=35, norm_type=2), 338 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 339 | lr_config = dict( 340 | policy='CosineAnnealing', 341 | by_epoch=False, 342 | min_lr_ratio=0.001, 343 | warmup="linear", 344 | warmup_iters=1000) 345 | momentum_config = dict( 346 | policy='cyclic', 347 | target_ratio=(0.8947368421052632, 1), 348 | cyclic_times=1, 349 | step_ratio_up=0.4) 350 | total_epochs = 6 351 | checkpoint_config = dict(interval=1) 352 | log_config = dict( 353 | interval=50, 354 | hooks=[dict(type='TextLoggerHook'), 355 | 
dict(type='TensorboardLoggerHook')]) 356 | dist_params = dict(backend='nccl') 357 | log_level = 'INFO' 358 | work_dir = None 359 | load_from = 'ckpts/moad_voxel0075_vov_1600x640_cbgs.pth' 360 | resume_from = None 361 | workflow = [('train', 1)] 362 | gpu_ids = range(0, 8) 363 | 364 | custom_hooks = [ 365 | dict( 366 | type="FreezeWeight", 367 | finetune_weight=["pts_bbox_head.ensemble"] 368 | ) 369 | ] 370 | 371 | find_unused_parameters = True 372 | -------------------------------------------------------------------------------- /projects/configs/mome/mome.py: -------------------------------------------------------------------------------- 1 | plugin = True 2 | plugin_dir = 'projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.075, 0.075, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=1) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=True, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | 21 | img_norm_cfg = dict( 22 | mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) 23 | 24 | ida_aug_conf = { 25 | "resize_lim": (0.94, 1.25), 26 | "final_dim": (640, 1600), 27 | "bot_pct_lim": (0.0, 0.0), 28 | "rot_lim": (0.0, 0.0), 29 | "H": 900, 30 | "W": 1600, 31 | "rand_flip": True, 32 | } 33 | 34 | train_pipeline = [ 35 | dict( 36 | type='LoadPointsFromFile', 37 | coord_type='LIDAR', 38 | load_dim=5, 39 | use_dim=[0, 1, 2, 3, 4], 40 | ), 41 | dict( 42 | type='LoadPointsFromMultiSweeps', 43 | sweeps_num=10, 44 | use_dim=[0, 1, 2, 3, 4], 45 | ), 46 | dict(type='LoadMultiViewImageFromFiles'), 47 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 48 | dict(type='ModalMask3D', 49 | mode='train'), 50 | dict( 51 | type='GlobalRotScaleTransAll', 52 | rot_range=[-0.3925 * 2, 0.3925 * 2], 53 | scale_ratio_range=[0.9, 1.1], 54 | translation_std=[0.5, 0.5, 0.5]), 55 | dict( 56 | type='CustomRandomFlip3D', 57 | sync_2d=False, 58 | flip_ratio_bev_horizontal=0.5, 59 | flip_ratio_bev_vertical=0.5), 60 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 61 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 62 | dict(type='ObjectNameFilter', classes=class_names), 63 | dict(type='PointShuffle'), 64 | dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=True), 65 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 66 | dict(type='PadMultiViewImage', size_divisor=32), 67 | dict(type='DefaultFormatBundle3D', class_names=class_names), 68 | dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'], 69 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 70 | 'depth2img', 'cam2img', 'pad_shape', 71 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 72 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 73 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 74 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 75 | 'transformation_3d_flow', 'rot_degree', 76 | 'gt_bboxes_3d', 'gt_labels_3d', 'modalmask')) 77 | ] 78 | test_pipeline = [ 79 | dict( 80 | type='LoadPointsFromFile', 81 | coord_type='LIDAR', 82 | load_dim=5, 83 | use_dim=[0, 1, 2, 3, 4], 84 | ), 85 | dict( 86 | type='LoadPointsFromMultiSweeps', 87 | sweeps_num=10, 88 | use_dim=[0, 1, 2, 3, 
4], 89 | ), 90 | dict(type='LoadMultiViewImageFromFiles'), 91 | dict( 92 | type='MultiScaleFlipAug3D', 93 | img_scale=(1333, 800), 94 | pts_scale_ratio=1, 95 | flip=False, 96 | transforms=[ 97 | dict( 98 | type='GlobalRotScaleTrans', 99 | rot_range=[0, 0], 100 | scale_ratio_range=[1.0, 1.0], 101 | translation_std=[0, 0, 0]), 102 | dict(type='RandomFlip3D'), 103 | dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=False), 104 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 105 | dict(type='PadMultiViewImage', size_divisor=32), 106 | dict( 107 | type='DefaultFormatBundle3D', 108 | class_names=class_names, 109 | with_label=False), 110 | dict(type='Collect3D', keys=['points', 'img']) 111 | ]) 112 | ] 113 | data = dict( 114 | samples_per_gpu=2, 115 | workers_per_gpu=6, 116 | train=dict( 117 | type='CBGSDataset', 118 | dataset=dict( 119 | type=dataset_type, 120 | data_root=data_root, 121 | ann_file=data_root + '/nuscenes_infos_train.pkl', 122 | load_interval=1, 123 | pipeline=train_pipeline, 124 | classes=class_names, 125 | modality=input_modality, 126 | test_mode=False, 127 | box_type_3d='LiDAR')), 128 | val=dict( 129 | type=dataset_type, 130 | data_root=data_root, 131 | ann_file=data_root + '/nuscenes_infos_val.pkl', 132 | load_interval=1, 133 | pipeline=test_pipeline, 134 | classes=class_names, 135 | modality=input_modality, 136 | test_mode=True, 137 | box_type_3d='LiDAR'), 138 | test=dict( 139 | type=dataset_type, 140 | data_root=data_root, 141 | ann_file=data_root + '/nuscenes_infos_val.pkl', 142 | load_interval=1, 143 | pipeline=test_pipeline, 144 | classes=class_names, 145 | modality=input_modality, 146 | test_mode=True, 147 | box_type_3d='LiDAR')) 148 | model = dict( 149 | type='MoME', 150 | use_grid_mask=True, 151 | img_backbone=dict( 152 | type='VoVNet', 153 | spec_name='V-99-eSE', 154 | norm_eval=True, 155 | frozen_stages=-1, 156 | input_ch=3, 157 | out_features=('stage4', 'stage5',)), 158 | img_neck=dict( 159 | type='CPFPN', 160 | in_channels=[768, 1024], 161 | out_channels=256, 162 | num_outs=2), 163 | pts_voxel_layer=dict( 164 | num_point_features=5, 165 | max_num_points=10, 166 | voxel_size=voxel_size, 167 | max_voxels=(120000, 160000), 168 | point_cloud_range=point_cloud_range), 169 | pts_voxel_encoder=dict( 170 | type='HardSimpleVFE', 171 | num_features=5, 172 | ), 173 | pts_middle_encoder=dict( 174 | type='SparseEncoder', 175 | in_channels=5, 176 | sparse_shape=[41, 1440, 1440], 177 | output_channels=128, 178 | order=('conv', 'norm', 'act'), 179 | encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), 180 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 181 | block_type='basicblock'), 182 | pts_backbone=dict( 183 | type='SECOND', 184 | in_channels=256, 185 | out_channels=[128, 256], 186 | layer_nums=[5, 5], 187 | layer_strides=[1, 2], 188 | norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), 189 | conv_cfg=dict(type='Conv2d', bias=False)), 190 | pts_neck=dict( 191 | type='SECONDFPN', 192 | in_channels=[128, 256], 193 | out_channels=[256, 256], 194 | upsample_strides=[1, 2], 195 | norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), 196 | upsample_cfg=dict(type='deconv', bias=False), 197 | use_conv_for_no_stride=True), 198 | pts_bbox_head=dict( 199 | type='MultiExpertDecoding', 200 | in_channels=512, 201 | hidden_dim=256, 202 | downsample_scale=8, 203 | pc_range=point_cloud_range, 204 | modalities=dict( 205 | train=["fused", "bev", "img"], 206 | test=["fused", "bev", "img"] 207 | ), 208 | 
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 209 | tasks=[ 210 | dict(num_class=10, class_names=[ 211 | 'car', 'truck', 'construction_vehicle', 212 | 'bus', 'trailer', 'barrier', 213 | 'motorcycle', 'bicycle', 214 | 'pedestrian', 'traffic_cone' 215 | ]), 216 | ], 217 | bbox_coder=dict( 218 | type='MultiTaskBBoxCoder', 219 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 220 | pc_range=point_cloud_range, 221 | max_num=300, 222 | voxel_size=voxel_size, 223 | num_classes=10), 224 | separate_head=dict( 225 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=1), 226 | transformer=dict( 227 | type='MultiExpert', 228 | use_cam_embed=True, 229 | window_sizes=[15,5], 230 | encoder=dict( 231 | type="PETRTransformerDecoder", 232 | return_intermediate=True, 233 | num_layers=1, # same with len(ensemble.modal_seq) 234 | transformerlayers=dict( 235 | type='PETRTransformerDecoderLayer', 236 | with_cp=False, 237 | attn_cfgs=[ 238 | dict( 239 | type='MultiheadAttention', 240 | embed_dims=256, 241 | num_heads=4, 242 | dropout=0.1), 243 | ], 244 | ffn_cfgs=dict( 245 | type='FFN', 246 | embed_dims=256, 247 | feedforward_channels=1024, 248 | num_fcs=2, 249 | ffn_drop=0., 250 | act_cfg=dict(type='ReLU', inplace=True), 251 | ), 252 | 253 | feedforward_channels=1024, 254 | operation_order=('cross_attn', 'norm', 'ffn', 'norm') 255 | ), 256 | ), 257 | decoder=dict( 258 | type='PETRTransformerDecoder', 259 | return_intermediate=True, 260 | num_layers=6, 261 | transformerlayers=dict( 262 | type='PETRTransformerDecoderLayer', 263 | with_cp=False, 264 | attn_cfgs=[ 265 | dict( 266 | type='MultiheadAttention', 267 | embed_dims=256, 268 | num_heads=8, 269 | dropout=0.1), 270 | dict( 271 | type='PETRMultiheadFlashAttention', 272 | embed_dims=256, 273 | num_heads=8, 274 | dropout=0.1), 275 | ], 276 | ffn_cfgs=dict( 277 | type='FFN', 278 | embed_dims=256, 279 | feedforward_channels=1024, 280 | num_fcs=2, 281 | ffn_drop=0., 282 | act_cfg=dict(type='ReLU', inplace=True), 283 | ), 284 | 285 | feedforward_channels=1024, 286 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 287 | 'ffn', 'norm')), 288 | ), 289 | ), 290 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 291 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25) 292 | ), 293 | train_cfg=dict( 294 | pts=dict( 295 | dataset='nuScenes', 296 | assigner=dict( 297 | type='HungarianAssigner3D', 298 | cls_cost=dict(type='FocalLossCost', weight=2.0), 299 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 300 | iou_cost=dict(type='IoUCost', weight=0.0), 301 | pc_range=point_cloud_range, 302 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 303 | ), 304 | pos_weight=-1, 305 | gaussian_overlap=0.1, 306 | min_radius=2, 307 | grid_size=[1440, 1440, 40], 308 | voxel_size=voxel_size, 309 | out_size_factor=out_size_factor, 310 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 311 | point_cloud_range=point_cloud_range)), 312 | test_cfg=dict( 313 | pts=dict( 314 | dataset='nuScenes', 315 | grid_size=[1440, 1440, 40], 316 | out_size_factor=out_size_factor, 317 | pc_range=point_cloud_range, 318 | voxel_size=voxel_size, 319 | nms_type=None, 320 | nms_thr=0.2, 321 | use_rotate_nms=True, 322 | max_num=200 323 | ))) 324 | optimizer = dict( 325 | type='AdamW', 326 | lr=0.0001, 327 | paramwise_cfg=dict( 328 | custom_keys={ 329 | 'img_backbone': dict(lr_mult=0.01, decay_mult=5), 330 | 'img_neck': dict(lr_mult=0.1), 331 | 
}), 332 | weight_decay=0.01) 333 | optimizer_config = dict( 334 | type='CustomFp16OptimizerHook', 335 | loss_scale='dynamic', 336 | grad_clip=dict(max_norm=35, norm_type=2), 337 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 338 | lr_config = dict( 339 | policy='CosineAnnealing', 340 | by_epoch=False, 341 | min_lr_ratio=0.001, 342 | warmup="linear", 343 | warmup_iters=1000) 344 | momentum_config = dict( 345 | policy='cyclic', 346 | target_ratio=(0.8947368421052632, 1), 347 | cyclic_times=1, 348 | step_ratio_up=0.4) 349 | total_epochs = 6 350 | checkpoint_config = dict(interval=1) 351 | log_config = dict( 352 | interval=50, 353 | hooks=[dict(type='TextLoggerHook'), 354 | dict(type='TensorboardLoggerHook')]) 355 | dist_params = dict(backend='nccl') 356 | log_level = 'INFO' 357 | work_dir = None 358 | load_from = 'ckpts/moad_voxel0075_vov_1600x640_cbgs.pth' 359 | resume_from = None 360 | workflow = [('train', 1)] 361 | gpu_ids = range(0, 8) 362 | 363 | custom_hooks = [ 364 | dict( 365 | type="FreezeWeight", 366 | finetune_weight=["pts_bbox_head.transformer.encoder", "pts_bbox_head.transformer.selected_cls"] 367 | ) 368 | ] 369 | 370 | find_unused_parameters = True --------------------------------------------------------------------------------
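Both configs above rely on the `plugin` / `plugin_dir` flags to pull in the custom classes they reference (`MoME`, `MEFormerDetector`, `MultiExpertDecoding`, `CustomNuScenesDataset`, the `FreezeWeight` hook, and so on). As a rough illustration only — a minimal sketch assuming the entry scripts follow the usual mmdet3d plugin-loading pattern (the real logic lives in `tools/train.py`, which is not shown here) — loading one of these configs and building the model looks roughly like this:

```python
# Hypothetical usage sketch, NOT part of the repository: how a plugin-style
# config such as projects/configs/mome/mome.py is typically consumed by an
# mmdet3d-based entry script.
import importlib

from mmcv import Config

cfg = Config.fromfile('projects/configs/mome/mome.py')

# `plugin = True` / `plugin_dir` ask the entry script to import the project
# package so that the custom detectors, heads, datasets, and hooks register
# themselves with the mmdet/mmdet3d registries before anything is built.
if cfg.get('plugin', False):
    plugin_dir = cfg.get('plugin_dir', 'projects/mmdet3d_plugin/')
    module_path = plugin_dir.rstrip('/').replace('/', '.')
    importlib.import_module(module_path)

# With the registries populated, the detector described by cfg.model can be
# assembled through the standard mmdet3d 1.0.x builder.
from mmdet3d.models import build_model

model = build_model(
    cfg.model,
    train_cfg=cfg.get('train_cfg'),
    test_cfg=cfg.get('test_cfg'),
)
```

Note that both configs start from the same checkpoint (`load_from = 'ckpts/moad_voxel0075_vov_1600x640_cbgs.pth'`) and differ mainly in what the `FreezeWeight` hook leaves trainable: the MEFormer config fine-tunes only `pts_bbox_head.ensemble`, while `mome.py` fine-tunes `pts_bbox_head.transformer.encoder` and `pts_bbox_head.transformer.selected_cls`.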
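The actual hook implementation lives in `projects/mmdet3d_plugin/mmcv_custom/runner/hooks/freeze_weight.py` and is not reproduced here; the sketch below only illustrates, under that assumption, how a name-prefix freezing hook of this kind is typically written against the mmcv runner API (the class name is deliberately different from the real one):

```python
from mmcv.runner import HOOKS, Hook


@HOOKS.register_module()
class FreezeWeightSketch(Hook):
    """Illustrative stand-in for the repository's FreezeWeight hook.

    Freezes every parameter whose name does not start with one of the
    `finetune_weight` prefixes listed in `custom_hooks`.
    """

    def __init__(self, finetune_weight=()):
        self.finetune_weight = tuple(finetune_weight)

    def before_run(self, runner):
        # Unwrap (Distributed)DataParallel so parameter names match the
        # prefixes used in the config (e.g. 'pts_bbox_head.ensemble').
        model = runner.model.module if hasattr(runner.model, 'module') else runner.model
        for name, param in model.named_parameters():
            param.requires_grad = name.startswith(self.finetune_weight)
```

Freezing most of the network in this way is also why both configs set `find_unused_parameters = True`: distributed data parallel must tolerate parameters that never receive a gradient during the fine-tuning stage.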