├── .gitignore ├── LICENSE ├── README.md ├── configs ├── base │ ├── dataloader.yml │ ├── deim.yml │ ├── deimv2.yml │ ├── dfine_hgnetv2.yml │ ├── optimizer.yml │ ├── rt_deim.yml │ ├── rt_optimizer.yml │ └── rtdetrv2_r50vd.yml ├── dataset │ ├── coco_detection.yml │ ├── crowdhuman_detection.yml │ ├── custom_detection.yml │ ├── obj365_detection.yml │ └── voc_detection.yml ├── deim_dfine │ ├── deim_hgnetv2_l_coco.yml │ ├── deim_hgnetv2_m_coco.yml │ ├── deim_hgnetv2_n_coco.yml │ ├── deim_hgnetv2_s_coco.yml │ ├── deim_hgnetv2_x_coco.yml │ ├── dfine_hgnetv2_l_coco.yml │ ├── dfine_hgnetv2_m_coco.yml │ ├── dfine_hgnetv2_n_coco.yml │ ├── dfine_hgnetv2_s_coco.yml │ ├── dfine_hgnetv2_x_coco.yml │ └── object365 │ │ ├── deim_hgnetv2_x_obj2coco_24e.yml │ │ └── dfine_hgnetv2_x_obj2coco.yml ├── deim_rtdetrv2 │ ├── deim_r101vd_60e_coco.yml │ ├── deim_r18vd_120e_coco.yml │ ├── deim_r34vd_120e_coco.yml │ ├── deim_r50vd_60e_coco.yml │ ├── deim_r50vd_m_60e_coco.yml │ ├── rtdetrv2_r101vd_6x_coco.yml │ ├── rtdetrv2_r18vd_120e_coco.yml │ ├── rtdetrv2_r34vd_120e_coco.yml │ ├── rtdetrv2_r50vd_6x_coco.yml │ └── rtdetrv2_r50vd_m_7x_coco.yml ├── deimv2 │ ├── deimv2_dinov3_l_coco.yml │ ├── deimv2_dinov3_m_coco.yml │ ├── deimv2_dinov3_s_coco.yml │ ├── deimv2_dinov3_x_coco.yml │ ├── deimv2_hgnetv2_atto_coco.yml │ ├── deimv2_hgnetv2_femto_coco.yml │ ├── deimv2_hgnetv2_l_coco.yml │ ├── deimv2_hgnetv2_m_coco.yml │ ├── deimv2_hgnetv2_n_coco.yml │ ├── deimv2_hgnetv2_pico_coco.yml │ ├── deimv2_hgnetv2_s_coco.yml │ └── deimv2_hgnetv2_x_coco.yml └── runtime.yml ├── engine ├── __init__.py ├── backbone │ ├── __init__.py │ ├── common.py │ ├── csp_darknet.py │ ├── csp_resnet.py │ ├── dinov3 │ │ ├── __init__.py │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── dino_head.py │ │ │ ├── ffn_layers.py │ │ │ ├── fp8_linear.py │ │ │ ├── layer_scale.py │ │ │ ├── patch_embed.py │ │ │ ├── rms_norm.py │ │ │ ├── rope_position_encoding.py │ │ │ └── sparse_linear.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── cluster.py │ │ │ ├── custom_callable.py │ │ │ ├── dtype.py │ │ │ └── utils.py │ │ └── vision_transformer.py │ ├── dinov3_adapter.py │ ├── hgnetv2.py │ ├── ms_deform_attn.py │ ├── presnet.py │ ├── test_resnet.py │ ├── timm_model.py │ ├── torchvision_model.py │ ├── utils.py │ └── vit_tiny.py ├── core │ ├── __init__.py │ ├── _config.py │ ├── workspace.py │ ├── yaml_config.py │ └── yaml_utils.py ├── data │ ├── __init__.py │ ├── _misc.py │ ├── dataloader.py │ ├── dataset │ │ ├── __init__.py │ │ ├── _dataset.py │ │ ├── coco_dataset.py │ │ ├── coco_eval.py │ │ ├── coco_utils.py │ │ ├── voc_detection.py │ │ └── voc_eval.py │ └── transforms │ │ ├── __init__.py │ │ ├── _transforms.py │ │ ├── container.py │ │ ├── functional.py │ │ └── mosaic.py ├── deim │ ├── __init__.py │ ├── box_ops.py │ ├── deim.py │ ├── deim_criterion.py │ ├── deim_decoder.py │ ├── deim_utils.py │ ├── denoising.py │ ├── dfine_decoder.py │ ├── dfine_utils.py │ ├── hybrid_encoder.py │ ├── lite_encoder.py │ ├── matcher.py │ ├── postprocessor.py │ ├── rtdetrv2_decoder.py │ └── utils.py ├── misc │ ├── __init__.py │ ├── box_ops.py │ ├── dist_utils.py │ ├── lazy_loader.py │ ├── logger.py │ ├── profiler_utils.py │ └── visualizer.py ├── optim │ ├── __init__.py │ ├── amp.py │ ├── ema.py │ ├── lr_scheduler.py │ ├── optim.py │ └── warmup.py └── solver │ ├── __init__.py │ ├── _solver.py │ ├── clas_engine.py │ ├── clas_solver.py │ ├── det_engine.py │ └── det_solver.py ├── figures ├── deimv2_coco_AP_vs_GFLOPs.png └── deimv2_coco_AP_vs_Params.png ├── 
requirements.txt ├── tools ├── benchmark │ ├── dataset.py │ ├── get_info.py │ ├── requirements.txt │ ├── trt_benchmark.py │ └── utils.py ├── dataset │ ├── remap_obj365.py │ └── resize_obj365.py ├── deployment │ ├── export_onnx.py │ └── export_yolo_w_nms.py ├── inference │ ├── onnx_inf.py │ ├── openvino_inf.py │ ├── requirements.txt │ ├── torch_inf.py │ ├── torch_inf_vis.py │ └── trt_inf.py ├── reference │ ├── convert_weight.py │ └── safe_training.sh └── visualization │ └── fiftyone_vis.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignored Files 2 | outputs/ 3 | ckpts/ 4 | testenv/ 5 | backup 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # PyCharm 137 | .idea 138 | .vscode/ 139 | *.pt 140 | *.pth 141 | *.onnx 142 | *.zip 143 | *.html 144 | .DS_Store 145 | -------------------------------------------------------------------------------- /configs/base/dataloader.yml: -------------------------------------------------------------------------------- 1 | 2 | train_dataloader: 3 | dataset: 4 | transforms: 5 | ops: 6 | - {type: RandomPhotometricDistort, p: 0.5} 7 | - {type: RandomZoomOut, fill: 0} 8 | - {type: RandomIoUCrop, p: 0.8} 9 | - {type: SanitizeBoundingBoxes, min_size: 1} 10 | - {type: RandomHorizontalFlip} 11 | - {type: Resize, size: [640, 640], } 12 | - {type: SanitizeBoundingBoxes, min_size: 1} 13 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 14 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 15 | policy: 16 | name: stop_epoch 17 | epoch: 72 # epoch in [72, ~) stop `ops` 18 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 19 | 20 | collate_fn: 21 | type: BatchImageCollateFunction 22 | base_size: 640 23 | base_size_repeat: 3 24 | stop_epoch: 72 # epoch in [72, ~) stop `multiscales` 25 | 26 | shuffle: True 27 | total_batch_size: 32 # total batch size equals 32 (4 GPUs * 8 per GPU) 28 | num_workers: 4 29 | 30 | 31 | val_dataloader: 32 | dataset: 33 | transforms: 34 | ops: 35 | - {type: Resize, size: [640, 640], } 36 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 37 | shuffle: False 38 | total_batch_size: 64 39 | num_workers: 4 40 | -------------------------------------------------------------------------------- /configs/base/deim.yml: -------------------------------------------------------------------------------- 1 | # Dense O2O 2 | train_dataloader: 3 | dataset: 4 | transforms: 5 | ops: 6 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 7 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 8 | - {type: RandomPhotometricDistort, p: 0.5} 9 | - {type: RandomZoomOut, fill: 0} 10 | - {type: RandomIoUCrop, p: 0.8} 11 | - {type: SanitizeBoundingBoxes, min_size: 1} 12 | - {type: RandomHorizontalFlip} 13 | - {type: Resize, size: [640, 640], } 14 | - {type: SanitizeBoundingBoxes, min_size: 1} 15 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 16 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 17 | policy: 18 | epoch: [4, 29, 50] # list 19 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 20 | mosaic_prob: 0.5 21 | 22 | collate_fn: 23 | mixup_prob: 0.5 24 | mixup_epochs: [4, 29] 25 | stop_epoch: 50 # epoch in [50, ~) stop `multiscales` 26 | 27 | # Unfreezing BN 28 | HGNetv2: 29 | freeze_at: -1 # 0 default 30 | freeze_norm: False # True default 31 | 32 | # Activation 33 | DFINETransformer: 34 | activation: silu 35 | mlp_act: silu 36 | 37 | ## Our LR-Scheduler 38 | lrsheduler: flatcosine 39 | lr_gamma: 0.5 40 | warmup_iter: 2000 41 |
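# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config; semantics
# inferred from the keys and from engine/optim/lr_scheduler.py): the
# `flatcosine` scheduler appears to run in three phases:
#   iter  < warmup_iter  -> linear warmup to the base LR
#   epoch < flat_epoch   -> hold the base LR flat
#   afterwards           -> cosine decay toward base LR * lr_gamma
# with the final `no_aug_epoch` epochs training without the strong
# augmentations listed under `policy.ops`.
# ---------------------------------------------------------------------------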
flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 42 | no_aug_epoch: 8 43 | 44 | ## Our Loss 45 | DEIMCriterion: 46 | weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} 47 | losses: ['mal', 'boxes', 'local'] 48 | gamma: 1.5 -------------------------------------------------------------------------------- /configs/base/deimv2.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | model: DEIM 4 | criterion: DEIMCriterion 5 | postprocessor: PostProcessor 6 | 7 | use_focal_loss: True 8 | eval_spatial_size: [640, 640] # h w 9 | checkpoint_freq: 5 # save freq 10 | 11 | DEIM: 12 | backbone: HGNetv2 13 | encoder: HybridEncoder 14 | decoder: DEIMTransformer 15 | 16 | HGNetv2: 17 | name: 'B4' 18 | return_idx: [1, 2, 3] 19 | freeze_at: -1 # 0 default 20 | freeze_stem_only: True 21 | freeze_norm: False # True default 22 | pretrained: True 23 | local_model_dir: ./weight/hgnetv2/ 24 | 25 | HybridEncoder: 26 | in_channels: [512, 1024, 2048] 27 | feat_strides: [8, 16, 32] 28 | 29 | # intra 30 | hidden_dim: 256 31 | use_encoder_idx: [2] 32 | num_encoder_layers: 1 33 | nhead: 8 34 | dim_feedforward: 1024 35 | dropout: 0. 36 | enc_act: 'gelu' 37 | 38 | # cross 39 | expansion: 1.0 40 | depth_mult: 1 41 | act: 'silu' 42 | 43 | # New 44 | version: deim 45 | csp_type: csp2 46 | fuse_op: sum 47 | 48 | DEIMTransformer: 49 | feat_channels: [256, 256, 256] 50 | feat_strides: [8, 16, 32] 51 | hidden_dim: 256 52 | num_levels: 3 53 | 54 | num_layers: 6 55 | eval_idx: -1 56 | num_queries: 300 57 | 58 | num_denoising: 100 59 | label_noise_ratio: 0.5 60 | box_noise_scale: 1.0 61 | 62 | reg_max: 32 63 | reg_scale: 4 64 | layer_scale: 1 # 2 65 | 66 | num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3] 67 | cross_attn_method: default # default, discrete 68 | query_select_method: default # default, agnostic 69 | 70 | # Act 71 | activation: silu 72 | mlp_act: silu 73 | 74 | # FFN 75 | dim_feedforward: 2048 76 | 77 | PostProcessor: 78 | num_top_queries: 300 79 | 80 | 81 | ## DEIM LR-Scheduler 82 | epoches: 58 # 72 + 2n # Increase to search for the optimal ema 83 | 84 | lrsheduler: flatcosine 85 | lr_gamma: 0.5 86 | warmup_iter: 2000 87 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 88 | no_aug_epoch: 8 89 | 90 | ## Dense O2O: Mosaic + Mixup + CopyBlend 91 | train_dataloader: 92 | dataset: 93 | transforms: 94 | ops: 95 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 96 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 97 | - {type: RandomPhotometricDistort, p: 0.5} 98 | - {type: RandomZoomOut, fill: 0} 99 | - {type: RandomIoUCrop, p: 0.8} 100 | - {type: SanitizeBoundingBoxes, min_size: 1} 101 | - {type: RandomHorizontalFlip} 102 | - {type: Resize, size: [640, 640], } 103 | - {type: SanitizeBoundingBoxes, min_size: 1} 104 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 105 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 106 | # Mosaic options 107 | policy: 108 | epoch: [4, 29, 50] # list 109 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 110 | mosaic_prob: 0.5 111 | 112 | collate_fn: 113 | # Mixup options 114 | mixup_prob: 0.5 115 | mixup_epochs: [4, 29] 116 | stop_epoch: 50 # epoch in [50, ~) stop `multiscales` 117 | # CopyBlend options 118 | copyblend_prob: 0.5 119 | copyblend_epochs: [4, 50] 120 | area_threshold: 100 121 | num_objects: 3 122 | with_expand: True 123 |
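# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): the epoch
# lists above appear to be on/off windows for Dense O2O augmentation:
# Mosaic follows the `policy.epoch` milestones [4, 29, 50], Mixup is applied
# with `mixup_prob` only between `mixup_epochs` [4, 29], and CopyBlend only
# between `copyblend_epochs` [4, 50]; after epoch 50 (and during the final
# `no_aug_epoch` epochs) training runs without these augmentations.
# ---------------------------------------------------------------------------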
expand_ratios: [0.1, 0.25] 124 | 125 | ema_restart_decay: 0.9999 126 | base_size_repeat: 4 127 | 128 | ## DEIM Loss 129 | DEIMCriterion: 130 | weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} 131 | losses: ['mal', 'boxes', 'local'] 132 | gamma: 1.5 133 | alpha: 0.75 134 | reg_max: 32 135 | 136 | matcher: 137 | type: HungarianMatcher 138 | weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} 139 | alpha: 0.25 140 | gamma: 2.0 141 | # change matcher 142 | change_matcher: True 143 | iou_order_alpha: 4.0 144 | matcher_change_epoch: 45 -------------------------------------------------------------------------------- /configs/base/dfine_hgnetv2.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | model: DEIM 4 | criterion: DEIMCriterion 5 | postprocessor: PostProcessor 6 | 7 | use_focal_loss: True 8 | eval_spatial_size: [640, 640] # h w 9 | checkpoint_freq: 4 # save freq 10 | 11 | DEIM: 12 | backbone: HGNetv2 13 | encoder: HybridEncoder 14 | decoder: DFINETransformer 15 | 16 | # Add, default for step lr scheduler 17 | lrsheduler: flatcosine 18 | lr_gamma: 1 19 | warmup_iter: 500 20 | flat_epoch: 4000000 21 | no_aug_epoch: 0 22 | 23 | HGNetv2: 24 | pretrained: True 25 | local_model_dir: ../RT-DETR-main/D-FINE/weight/hgnetv2/ 26 | 27 | HybridEncoder: 28 | in_channels: [512, 1024, 2048] 29 | feat_strides: [8, 16, 32] 30 | 31 | # intra 32 | hidden_dim: 256 33 | use_encoder_idx: [2] 34 | num_encoder_layers: 1 35 | nhead: 8 36 | dim_feedforward: 1024 37 | dropout: 0. 38 | enc_act: 'gelu' 39 | 40 | # cross 41 | expansion: 1.0 42 | depth_mult: 1 43 | act: 'silu' 44 | 45 | 46 | DFINETransformer: 47 | feat_channels: [256, 256, 256] 48 | feat_strides: [8, 16, 32] 49 | hidden_dim: 256 50 | num_levels: 3 51 | 52 | num_layers: 6 53 | eval_idx: -1 54 | num_queries: 300 55 | 56 | num_denoising: 100 57 | label_noise_ratio: 0.5 58 | box_noise_scale: 1.0 59 | 60 | # NEW 61 | reg_max: 32 62 | reg_scale: 4 63 | 64 | # Auxiliary decoder layers dimension scaling 65 | # "eg. If num_layers: 6 eval_idx: -4, 66 | # then layer 3, 4, 5 are auxiliary decoder layers." 67 | layer_scale: 1 # 2 68 | 69 | 70 | num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3] 71 | cross_attn_method: default # default, discrete 72 | query_select_method: default # default, agnostic 73 | 74 | 75 | PostProcessor: 76 | num_top_queries: 300 77 | 78 | 79 | DEIMCriterion: 80 | weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} 81 | losses: ['vfl', 'boxes', 'local'] 82 | alpha: 0.75 83 | gamma: 2.0 84 | reg_max: 32 85 | 86 | matcher: 87 | type: HungarianMatcher 88 | weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} 89 | alpha: 0.25 90 | gamma: 2.0 -------------------------------------------------------------------------------- /configs/base/optimizer.yml: -------------------------------------------------------------------------------- 1 | use_amp: True 2 | use_ema: True 3 | ema: 4 | type: ModelEMA 5 | decay: 0.9999 6 | warmups: 1000 7 | start: 0 8 | 9 | epoches: 72 10 | clip_max_norm: 0.1 11 | 12 | 13 | optimizer: 14 | type: AdamW 15 | params: 16 | - 17 | params: '^(?=.*backbone)(?!.*norm).*$' 18 | lr: 0.0000125 19 | - 20 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 21 | weight_decay: 0. 
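# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): each `params`
# entry above is a regular expression matched against model parameter names
# (resolved in engine/optim). '^(?=.*backbone)(?!.*norm).*$' combines a
# positive lookahead (name contains "backbone") with a negative one (name
# does not contain "norm"), giving the pretrained backbone a 20x smaller LR
# (0.0000125 vs the 0.00025 base LR below).
# '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' selects encoder/decoder
# norm/BN weights and disables weight decay for them. Parameters matching no
# group appear to fall through to the top-level lr/weight_decay. With
# illustrative (hypothetical) parameter names:
#   backbone.stages.0.conv.weight    -> group 1 (lr 0.0000125)
#   decoder.layers.0.norm1.weight    -> group 2 (weight_decay 0)
#   decoder.layers.0.linear1.weight  -> defaults below
# ---------------------------------------------------------------------------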
22 | 23 | lr: 0.00025 24 | betas: [0.9, 0.999] 25 | weight_decay: 0.000125 26 | 27 | 28 | lr_scheduler: 29 | type: MultiStepLR 30 | milestones: [500] 31 | gamma: 0.1 32 | 33 | lr_warmup_scheduler: 34 | type: LinearWarmup 35 | warmup_duration: 500 36 | -------------------------------------------------------------------------------- /configs/base/rt_deim.yml: -------------------------------------------------------------------------------- 1 | # Dense O2O 2 | train_dataloader: 3 | dataset: 4 | transforms: 5 | ops: 6 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 7 | probability: 1.0, fill_value: 0, use_cache: False, max_cached_images: 50, random_pop: True} 8 | - {type: RandomPhotometricDistort, p: 0.5} 9 | - {type: RandomZoomOut, fill: 0} 10 | - {type: RandomIoUCrop, p: 0.8} 11 | - {type: SanitizeBoundingBoxes, min_size: 1} 12 | - {type: RandomHorizontalFlip} 13 | - {type: Resize, size: [640, 640], } 14 | - {type: SanitizeBoundingBoxes, min_size: 1} 15 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 16 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 17 | policy: 18 | epoch: [4, 29, 50] # list 19 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 20 | mosaic_prob: 0.5 21 | 22 | collate_fn: 23 | mixup_prob: 0.5 24 | mixup_epochs: [4, 29] 25 | stop_epoch: 50 # epoch in [50, ~) stop `multiscales` 26 | 27 | # Unfreezing BN 28 | PResNet: 29 | freeze_at: -1 # default 0 30 | freeze_norm: False # default True 31 | 32 | # Activation 33 | RTDETRTransformerv2: 34 | query_pos_method: as_reg 35 | activation: silu 36 | mlp_act: silu 37 | 38 | ## Our LR-Scheduler 39 | lrsheduler: flatcosine 40 | lr_gamma: 0.5 41 | warmup_iter: 2000 42 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 43 | no_aug_epoch: 8 44 | 45 | ## Our Loss 46 | DEIMCriterion: 47 | weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2} 48 | losses: ['mal', 'boxes', ] 49 | gamma: 1.5 -------------------------------------------------------------------------------- /configs/base/rt_optimizer.yml: -------------------------------------------------------------------------------- 1 | use_amp: True 2 | use_ema: True 3 | ema: 4 | type: ModelEMA 5 | decay: 0.9999 6 | warmups: 2000 7 | start: 0 8 | 9 | epoches: 72 10 | clip_max_norm: 0.1 11 | 12 | train_dataloader: 13 | total_batch_size: 16 14 | 15 | optimizer: 16 | type: AdamW 17 | params: 18 | - 19 | params: '^(?=.*backbone)(?!.*norm).*$' 20 | lr: 0.00001 21 | - 22 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 23 | weight_decay: 0.
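# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): the `ema`
# block at the top of this file maintains an exponential moving average of
# the weights (`type: ModelEMA`), typically the copy used for evaluation and
# released checkpoints. With `decay: 0.9999` and `warmups: 2000`, the
# effective decay appears to ramp up over the first ~2000 updates (a common
# schedule is decay * (1 - exp(-updates / warmups))), so the average tracks
# the fast-moving early weights more closely before settling.
# ---------------------------------------------------------------------------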
24 | 25 | lr: 0.0001 26 | betas: [0.9, 0.999] 27 | weight_decay: 0.0001 28 | 29 | lr_scheduler: 30 | type: MultiStepLR 31 | milestones: [1000] 32 | gamma: 0.1 33 | 34 | 35 | lr_warmup_scheduler: 36 | type: LinearWarmup 37 | warmup_duration: 2000 38 | -------------------------------------------------------------------------------- /configs/base/rtdetrv2_r50vd.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | model: DEIM 4 | criterion: DEIMCriterion 5 | postprocessor: PostProcessor 6 | 7 | use_focal_loss: True 8 | eval_spatial_size: [640, 640] # h w 9 | checkpoint_freq: 4 # save freq 10 | 11 | DEIM: 12 | backbone: PResNet 13 | encoder: HybridEncoder 14 | decoder: RTDETRTransformerv2 15 | 16 | 17 | # Add, default for step lr scheduler 18 | lrsheduler: flatcosine 19 | lr_gamma: 1 20 | warmup_iter: 2000 21 | flat_epoch: 4000000 22 | no_aug_epoch: 0 23 | 24 | PResNet: 25 | depth: 50 26 | variant: d 27 | freeze_at: 0 28 | return_idx: [1, 2, 3] 29 | num_stages: 4 30 | freeze_norm: True 31 | pretrained: True 32 | local_model_dir: ../RT-DETR-main/rtdetrv2_pytorch/INK1k/ 33 | 34 | 35 | HybridEncoder: 36 | in_channels: [512, 1024, 2048] 37 | feat_strides: [8, 16, 32] 38 | 39 | # intra 40 | hidden_dim: 256 41 | use_encoder_idx: [2] 42 | num_encoder_layers: 1 43 | nhead: 8 44 | dim_feedforward: 1024 45 | dropout: 0. 46 | enc_act: 'gelu' 47 | 48 | # cross 49 | expansion: 1.0 50 | depth_mult: 1 51 | act: 'silu' 52 | version: rt_detrv2 # pay attention to this 53 | 54 | 55 | RTDETRTransformerv2: 56 | feat_channels: [256, 256, 256] 57 | feat_strides: [8, 16, 32] 58 | hidden_dim: 256 59 | num_levels: 3 60 | 61 | num_layers: 6 62 | num_queries: 300 63 | 64 | num_denoising: 100 65 | label_noise_ratio: 0.5 66 | box_noise_scale: 1.0 # 1.0 0.4 67 | 68 | eval_idx: -1 69 | 70 | # NEW, can be chosen 71 | num_points: [4, 4, 4] # [3,3,3] [2,2,2] 72 | cross_attn_method: default # default, discrete 73 | query_select_method: default # default, agnostic 74 | 75 | 76 | PostProcessor: 77 | num_top_queries: 300 78 | 79 | DEIMCriterion: 80 | weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,} 81 | losses: ['vfl', 'boxes', ] 82 | alpha: 0.75 83 | gamma: 2.0 84 | use_uni_set: False 85 | 86 | matcher: 87 | type: HungarianMatcher 88 | weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} 89 | alpha: 0.25 90 | gamma: 2.0 -------------------------------------------------------------------------------- /configs/dataset/coco_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 80 8 | remap_mscoco_category: True 9 | 10 | train_dataloader: 11 | type: DataLoader 12 | dataset: 13 | type: CocoDetection 14 | img_folder: /datassd/COCO/train2017/ 15 | ann_file: /datassd/COCO/annotations/instances_train2017.json 16 | return_masks: False 17 | transforms: 18 | type: Compose 19 | ops: ~ 20 | shuffle: True 21 | num_workers: 4 22 | drop_last: True 23 | collate_fn: 24 | type: BatchImageCollateFunction 25 | 26 | 27 | val_dataloader: 28 | type: DataLoader 29 | dataset: 30 | type: CocoDetection 31 | img_folder: /datassd/COCO/val2017/ 32 | ann_file: /datassd/COCO/annotations/instances_val2017.json 33 | return_masks: False 34 | transforms: 35 | type: Compose 36 | ops: ~ 37 | shuffle: False 38 | num_workers: 4 39 | drop_last: False 40 | collate_fn: 41 | type: BatchImageCollateFunction 
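NOTE (editor's annotation, not part of the original config): `remap_mscoco_category: True` is needed because COCO's instances_*.json files use 91 non-contiguous category ids (1-90 with gaps; id 12, for instance, is unused), while the head predicts `num_classes: 80` contiguous labels. A minimal sketch of the kind of mapping involved, with a toy id set; the repo's actual tables live under engine/data/dataset/ (coco_dataset.py, coco_utils.py):

```python
# Toy subset of COCO category ids; the real list has 80 ids scattered
# over 1..90, with id 12 among the missing ones.
raw_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13]

# Contiguous training labels 0..N-1, plus the inverse map needed at
# evaluation time, when predictions are reported in original COCO ids.
category2label = {cid: i for i, cid in enumerate(sorted(raw_ids))}
label2category = {lab: cid for cid, lab in category2label.items()}

assert category2label[13] == 11   # raw id 13 -> contiguous label 11
assert label2category[11] == 13   # and back again for COCO evaluation
```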
-------------------------------------------------------------------------------- /configs/dataset/crowdhuman_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 2 # your dataset classes 8 | remap_mscoco_category: False 9 | 10 | train_dataloader: 11 | type: DataLoader 12 | dataset: 13 | type: CocoDetection 14 | img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_train 15 | ann_file: /datassd/coco/crowd_human_coco/Chuman-train.json 16 | return_masks: False 17 | transforms: 18 | type: Compose 19 | ops: ~ 20 | shuffle: True 21 | num_workers: 4 22 | drop_last: True 23 | collate_fn: 24 | type: BatchImageCollateFunction 25 | 26 | 27 | val_dataloader: 28 | type: DataLoader 29 | dataset: 30 | type: CocoDetection 31 | img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_val 32 | ann_file: /datassd/coco/crowd_human_coco/Chuman-val.json 33 | return_masks: False 34 | transforms: 35 | type: Compose 36 | ops: ~ 37 | shuffle: False 38 | num_workers: 4 39 | drop_last: False 40 | collate_fn: 41 | type: BatchImageCollateFunction 42 | -------------------------------------------------------------------------------- /configs/dataset/custom_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 777 # your dataset classes 8 | remap_mscoco_category: False 9 | 10 | train_dataloader: 11 | type: DataLoader 12 | dataset: 13 | type: CocoDetection 14 | img_folder: /data/yourdataset/train 15 | ann_file: /data/yourdataset/train/train.json 16 | return_masks: False 17 | transforms: 18 | type: Compose 19 | ops: ~ 20 | shuffle: True 21 | num_workers: 4 22 | drop_last: True 23 | collate_fn: 24 | type: BatchImageCollateFunction 25 | 26 | 27 | val_dataloader: 28 | type: DataLoader 29 | dataset: 30 | type: CocoDetection 31 | img_folder: /data/yourdataset/val 32 | ann_file: /data/yourdataset/val/val.json 33 | return_masks: False 34 | transforms: 35 | type: Compose 36 | ops: ~ 37 | shuffle: False 38 | num_workers: 4 39 | drop_last: False 40 | collate_fn: 41 | type: BatchImageCollateFunction 42 | -------------------------------------------------------------------------------- /configs/dataset/obj365_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 366 8 | remap_mscoco_category: False 9 | 10 | train_dataloader: 11 | type: DataLoader 12 | dataset: 13 | type: CocoDetection 14 | img_folder: /home/Dataset/objects365/train 15 | ann_file: /home/Dataset/objects365/train/new_zhiyuan_objv2_train_resized640.json 16 | return_masks: False 17 | transforms: 18 | type: Compose 19 | ops: ~ 20 | shuffle: True 21 | num_workers: 4 22 | drop_last: True 23 | collate_fn: 24 | type: BatchImageCollateFunction 25 | 26 | 27 | val_dataloader: 28 | type: DataLoader 29 | dataset: 30 | type: CocoDetection 31 | img_folder: /home/Dataset/objects365/val 32 | ann_file: /home/Dataset/objects365/val/new_zhiyuan_objv2_val_resized640.json 33 | return_masks: False 34 | transforms: 35 | type: Compose 36 | ops: ~ 37 | shuffle: False 38 | num_workers: 4 39 | drop_last: False 40 | collate_fn: 41 | type: BatchImageCollateFunction 42 | -------------------------------------------------------------------------------- 
/configs/dataset/voc_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 20 8 | 9 | train_dataloader: 10 | type: DataLoader 11 | dataset: 12 | type: VOCDetection 13 | root: ./dataset/voc/ 14 | ann_file: trainval.txt 15 | label_file: label_list.txt 16 | transforms: 17 | type: Compose 18 | ops: ~ 19 | shuffle: True 20 | num_workers: 4 21 | drop_last: True 22 | collate_fn: 23 | type: BatchImageCollateFunction 24 | 25 | 26 | val_dataloader: 27 | type: DataLoader 28 | dataset: 29 | type: VOCDetection 30 | root: ./dataset/voc/ 31 | ann_file: test.txt 32 | label_file: label_list.txt 33 | transforms: 34 | type: Compose 35 | ops: ~ 36 | shuffle: False 37 | num_workers: 4 38 | drop_last: False 39 | collate_fn: 40 | type: BatchImageCollateFunction 41 | -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_l_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_l_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./outputs/deim_hgnetv2_l_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 13 | lr: 0.000025 14 | - 15 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 17 | 18 | lr: 0.0005 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.000125 21 | 22 | # Increase to search for the optimal ema 23 | epoches: 58 # 72 + 2n 24 | 25 | ## Our LR-Scheduler 26 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 27 | no_aug_epoch: 8 28 | 29 | train_dataloader: 30 | dataset: 31 | transforms: 32 | policy: 33 | epoch: [4, 29, 50] # list 34 | 35 | collate_fn: 36 | mixup_epochs: [4, 29] 37 | stop_epoch: 50 -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_m_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_m_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./outputs/deim_hgnetv2_m_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*bn).*$' 13 | lr: 0.00004 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 17 | 18 | lr: 0.0004 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | 23 | # Increase to search for the optimal ema 24 | epoches: 102 # 120 + 4n 25 | 26 | ## Our LR-Scheduler 27 | flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 28 | no_aug_epoch: 12 29 | 30 | ## Our DataAug 31 | train_dataloader: 32 | dataset: 33 | transforms: 34 | policy: 35 | epoch: [4, 49, 90] # list 36 | 37 | collate_fn: 38 | mixup_epochs: [4, 49] 39 | stop_epoch: 90 -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_n_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_n_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./deim_outputs/deim_hgnetv2_n_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 13 | lr: 0.0004 14 | - 15 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 16 | lr: 0.0004 17 | weight_decay: 0. 18 | - 19 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 20 | weight_decay: 0. 
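# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): the backbone
# LR scales sharply with model size across these DEIM recipes: this N config
# trains the backbone at 0.0004 (half of its 0.0008 base LR below), the M
# config uses 0.00004 against a 0.0004 base LR, and the L and X configs drop
# to 0.000025 and 0.000005 against 0.0005, preserving the large pretrained
# backbones while letting the tiny ones adapt freely.
# ---------------------------------------------------------------------------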
21 | 22 | lr: 0.0008 23 | betas: [0.9, 0.999] 24 | weight_decay: 0.0001 25 | 26 | # Increase to search for the optimal ema 27 | epoches: 160 # 148 + 12 28 | 29 | ## Our LR-Scheduler 30 | flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 31 | no_aug_epoch: 12 32 | lr_gamma: 1.0 33 | 34 | ## Our DataAug 35 | train_dataloader: 36 | dataset: 37 | transforms: 38 | policy: 39 | epoch: [4, 78, 148] # list 40 | 41 | collate_fn: 42 | mixup_epochs: [4, 78] 43 | stop_epoch: 148 44 | base_size_repeat: ~ -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_s_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_s_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./outputs/deim_hgnetv2_s_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*bn).*$' 13 | lr: 0.0002 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' # except bias 16 | weight_decay: 0. 17 | 18 | lr: 0.0004 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | 23 | # Increase to search for the optimal ema 24 | epoches: 132 # 120 + 4n 25 | 26 | ## Our LR-Scheduler 27 | flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 28 | no_aug_epoch: 12 29 | 30 | ## Our DataAug 31 | train_dataloader: 32 | dataset: 33 | transforms: 34 | policy: 35 | epoch: [4, 64, 120] # list 36 | 37 | collate_fn: 38 | mixup_epochs: [4, 64] 39 | stop_epoch: 120 -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_x_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./outputs/deim_hgnetv2_x_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 13 | lr: 0.000005 14 | - 15 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 17 | 18 | lr: 0.0005 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.000125 21 | 22 | # Increase to search for the optimal ema 23 | epoches: 58 # 72 + 2n 24 | 25 | ## Our LR-Scheduler 26 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 27 | no_aug_epoch: 8 28 | 29 | train_dataloader: 30 | dataset: 31 | transforms: 32 | policy: 33 | epoch: [4, 29, 50] # list 34 | 35 | collate_fn: 36 | mixup_epochs: [4, 29] 37 | stop_epoch: 50 -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_l_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/dfine_hgnetv2_l_coco 10 | 11 | 12 | HGNetv2: 13 | name: 'B4' 14 | return_idx: [1, 2, 3] 15 | freeze_stem_only: True 16 | freeze_at: 0 17 | freeze_norm: True 18 | 19 | optimizer: 20 | type: AdamW 21 | params: 22 | - 23 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 24 | lr: 0.0000125 25 | - 26 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 27 | weight_decay: 0. 
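# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): every file in
# configs/deim_dfine/ is composed via `__include__`: the listed files appear
# to be loaded in order and deep-merged, after which the including file's
# own keys override the result. For example:
#   __include__: [
#     './dfine_hgnetv2_l_coco.yml',   # base D-FINE-L recipe, loaded first
#     '../base/deim.yml',             # DEIM training recipe merged on top
#   ]
# leaves deim_hgnetv2_l_coco.yml overriding only the optimizer, the epoch
# counts, and the augmentation schedule. The config loader lives in
# engine/core/ (yaml_config.py, yaml_utils.py).
# ---------------------------------------------------------------------------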
28 | 29 | lr: 0.00025 30 | betas: [0.9, 0.999] 31 | weight_decay: 0.000125 32 | 33 | 34 | # Increase to search for the optimal ema 35 | epoches: 80 # 72 + 2n 36 | train_dataloader: 37 | dataset: 38 | transforms: 39 | policy: 40 | epoch: 72 41 | collate_fn: 42 | stop_epoch: 72 43 | ema_restart_decay: 0.9999 44 | base_size_repeat: 4 45 | -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_m_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./output/dfine_hgnetv2_m_coco 10 | 11 | 12 | DEIM: 13 | backbone: HGNetv2 14 | 15 | HGNetv2: 16 | name: 'B2' 17 | return_idx: [1, 2, 3] 18 | freeze_at: -1 19 | freeze_norm: False 20 | use_lab: True 21 | 22 | DFINETransformer: 23 | num_layers: 4 # 5 6 24 | eval_idx: -1 # -2 -3 25 | 26 | HybridEncoder: 27 | in_channels: [384, 768, 1536] 28 | hidden_dim: 256 29 | depth_mult: 0.67 30 | 31 | optimizer: 32 | type: AdamW 33 | params: 34 | - 35 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 36 | lr: 0.00002 37 | - 38 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 39 | lr: 0.00002 40 | weight_decay: 0. 41 | - 42 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 43 | weight_decay: 0. 44 | 45 | lr: 0.0002 46 | betas: [0.9, 0.999] 47 | weight_decay: 0.0001 48 | 49 | 50 | # Increase to search for the optimal ema 51 | epoches: 132 # 120 + 4n 52 | train_dataloader: 53 | dataset: 54 | transforms: 55 | policy: 56 | epoch: 120 57 | collate_fn: 58 | stop_epoch: 120 59 | ema_restart_decay: 0.9999 60 | base_size_repeat: 6 61 | -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_n_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./output/dfine_hgnetv2_n_coco 10 | 11 | 12 | DEIM: 13 | backbone: HGNetv2 14 | 15 | HGNetv2: 16 | name: 'B0' 17 | return_idx: [2, 3] 18 | freeze_at: -1 19 | freeze_norm: False 20 | use_lab: True 21 | 22 | 23 | HybridEncoder: 24 | in_channels: [512, 1024] 25 | feat_strides: [16, 32] 26 | 27 | # intra 28 | hidden_dim: 128 29 | use_encoder_idx: [1] 30 | dim_feedforward: 512 31 | 32 | # cross 33 | expansion: 0.34 34 | depth_mult: 0.5 35 | 36 | 37 | DFINETransformer: 38 | feat_channels: [128, 128] 39 | feat_strides: [16, 32] 40 | hidden_dim: 128 41 | dim_feedforward: 512 42 | num_levels: 2 43 | 44 | num_layers: 3 45 | eval_idx: -1 46 | 47 | num_points: [6, 6] 48 | 49 | optimizer: 50 | type: AdamW 51 | params: 52 | - 53 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 54 | lr: 0.0004 55 | - 56 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 57 | lr: 0.0004 58 | weight_decay: 0. 59 | - 60 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 61 | weight_decay: 0. 
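# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): this N
# variant trims the feature pyramid to two levels: the backbone returns only
# strides 16/32 (`return_idx: [2, 3]`), encoder and decoder run with
# `num_levels: 2` and `hidden_dim: 128`, and multi-scale batching is turned
# off (`base_size_repeat: ~`, i.e. YAML null, further below). Batch sizes
# are raised to 128 (train) and 256 (val), which the tiny model's memory
# footprint permits.
# ---------------------------------------------------------------------------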
62 | 63 | lr: 0.0008 64 | betas: [0.9, 0.999] 65 | weight_decay: 0.0001 66 | 67 | 68 | # Increase to search for the optimal ema 69 | epoches: 160 # 148 + 4n 70 | train_dataloader: 71 | total_batch_size: 128 72 | dataset: 73 | transforms: 74 | policy: 75 | epoch: 148 76 | collate_fn: 77 | stop_epoch: 148 78 | ema_restart_decay: 0.9999 79 | base_size_repeat: ~ 80 | 81 | val_dataloader: 82 | total_batch_size: 256 83 | -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_s_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./output/dfine_hgnetv2_s_coco 10 | 11 | 12 | DEIM: 13 | backbone: HGNetv2 14 | 15 | HGNetv2: 16 | name: 'B0' 17 | return_idx: [1, 2, 3] 18 | freeze_at: -1 19 | freeze_norm: False 20 | use_lab: True 21 | 22 | DFINETransformer: 23 | num_layers: 3 # 4 5 6 24 | eval_idx: -1 # -2 -3 -4 25 | 26 | HybridEncoder: 27 | in_channels: [256, 512, 1024] 28 | hidden_dim: 256 29 | depth_mult: 0.34 30 | expansion: 0.5 31 | 32 | optimizer: 33 | type: AdamW 34 | params: 35 | - 36 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 37 | lr: 0.0001 38 | - 39 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 40 | lr: 0.0001 41 | weight_decay: 0. 42 | - 43 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 44 | weight_decay: 0. 45 | 46 | lr: 0.0002 47 | betas: [0.9, 0.999] 48 | weight_decay: 0.0001 49 | 50 | 51 | # Increase to search for the optimal ema 52 | epoches: 132 # 120 + 4n 53 | train_dataloader: 54 | dataset: 55 | transforms: 56 | policy: 57 | epoch: 120 58 | collate_fn: 59 | stop_epoch: 120 60 | ema_restart_decay: 0.9999 61 | base_size_repeat: 20 62 | -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./output/dfine_hgnetv2_x_coco 10 | 11 | 12 | DEIM: 13 | backbone: HGNetv2 14 | 15 | HGNetv2: 16 | name: 'B5' 17 | return_idx: [1, 2, 3] 18 | freeze_stem_only: True 19 | freeze_at: 0 20 | freeze_norm: True 21 | 22 | HybridEncoder: 23 | # intra 24 | hidden_dim: 384 25 | dim_feedforward: 2048 26 | 27 | DFINETransformer: 28 | feat_channels: [384, 384, 384] 29 | reg_scale: 8 30 | 31 | optimizer: 32 | type: AdamW 33 | params: 34 | - 35 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 36 | lr: 0.0000025 37 | - 38 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 39 | weight_decay: 0. 
40 | 41 | lr: 0.00025 42 | betas: [0.9, 0.999] 43 | weight_decay: 0.000125 44 | 45 | 46 | # Increase to search for the optimal ema 47 | epoches: 80 # 72 + 2n 48 | train_dataloader: 49 | dataset: 50 | transforms: 51 | policy: 52 | epoch: 72 53 | collate_fn: 54 | stop_epoch: 72 55 | ema_restart_decay: 0.9998 56 | base_size_repeat: 3 57 | -------------------------------------------------------------------------------- /configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_x_obj2coco.yml', 3 | '../../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./deim_outputs/deim_hgnetv2_x_obj2coco_24e 7 | 8 | HGNetv2: 9 | freeze_at: 0 # 0 default 10 | freeze_norm: True # True default 11 | 12 | # Activation 13 | DFINETransformer: 14 | activation: relu 15 | mlp_act: relu 16 | 17 | optimizer: 18 | type: AdamW 19 | params: 20 | - 21 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 22 | lr: 0.0000025 23 | - 24 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 25 | weight_decay: 0. 26 | 27 | lr: 0.00025 28 | betas: [0.9, 0.999] 29 | weight_decay: 0.000125 30 | 31 | # Increase to search for the optimal ema 32 | epoches: 24 # 72 + 2n 33 | 34 | ## Our LR-Scheduler 35 | lrsheduler: flatcosine 36 | lr_gamma: 1 37 | warmup_iter: 0 # 0 38 | flat_epoch: 12000 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 39 | no_aug_epoch: 4 40 | 41 | ## Our DataAug 42 | train_dataloader: 43 | dataset: 44 | transforms: 45 | policy: 46 | epoch: [2, 12, 20] # list 47 | 48 | collate_fn: 49 | mixup_epochs: [2, 12] 50 | stop_epoch: 20 -------------------------------------------------------------------------------- /configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../../dataset/coco_detection.yml', 3 | '../../runtime.yml', 4 | '../../base/dataloader.yml', 5 | '../../base/optimizer.yml', 6 | '../../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/dfine_hgnetv2_x_obj2coco 10 | 11 | HGNetv2: 12 | name: 'B5' 13 | return_idx: [1, 2, 3] 14 | freeze_stem_only: True 15 | freeze_at: 0 16 | freeze_norm: True 17 | 18 | HybridEncoder: 19 | # intra 20 | hidden_dim: 384 21 | dim_feedforward: 2048 22 | 23 | DFINETransformer: 24 | feat_channels: [384, 384, 384] 25 | reg_scale: 8 26 | 27 | optimizer: 28 | type: AdamW 29 | params: 30 | - 31 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 32 | lr: 0.0000025 33 | - 34 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 35 | weight_decay: 0. 36 | 37 | lr: 0.00025 38 | betas: [0.9, 0.999] 39 | weight_decay: 0.000125 40 | 41 | 42 | epoches: 36 # Early stop 43 | train_dataloader: 44 | dataset: 45 | transforms: 46 | policy: 47 | epoch: 30 48 | collate_fn: 49 | stop_epoch: 30 50 | ema_restart_decay: 0.9999 51 | base_size_repeat: 3 52 | 53 | ema: 54 | warmups: 0 55 | 56 | lr_warmup_scheduler: 57 | warmup_duration: 0 58 | -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r101vd_6x_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./outputs/deim_rtdetrv2_r101vd_60e_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm).*$' 13 | lr: 0.000002 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 
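# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): the
# object365/ recipes above are fine-tuning configs; `obj2coco` appears to
# mean starting from an Objects365-pretrained checkpoint and adapting it to
# COCO for 24-36 epochs. That is why they zero out every warmup
# (`warmup_iter: 0`, ema `warmups: 0`, `warmup_duration: 0`) and keep the
# backbone LR at a very small 0.0000025.
# ---------------------------------------------------------------------------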
17 | 18 | lr: 0.0002 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | 23 | # change part 24 | epoches: 60 25 | flat_epoch: 34 # 4 + 60 / 2 26 | no_aug_epoch: 2 27 | 28 | train_dataloader: 29 | dataset: 30 | transforms: 31 | policy: 32 | epoch: [4, 34, 58] # list 33 | 34 | collate_fn: 35 | mixup_epochs: [4, 34] 36 | stop_epoch: 58 37 | -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r18vd_120e_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./output/deim_rtdetrv2_r18vd_120e_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*(?:norm|bn)).*$' 13 | weight_decay: 0. 14 | 15 | lr: 0.0002 16 | betas: [0.9, 0.999] 17 | weight_decay: 0.0001 18 | 19 | # change part 20 | epoches: 120 21 | flat_epoch: 64 # 4 + 120 / 2 22 | no_aug_epoch: 3 23 | 24 | train_dataloader: 25 | dataset: 26 | transforms: 27 | policy: 28 | epoch: [4, 64, 117] # list 29 | 30 | collate_fn: 31 | mixup_epochs: [4, 64] 32 | stop_epoch: 117 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r34vd_120e_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./outputs/deim_rtdetrv2_r34vd_120e_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm).*$' 13 | lr: 0.0001 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 17 | 18 | lr: 0.0002 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | 23 | # change part 24 | epoches: 120 25 | flat_epoch: 64 26 | no_aug_epoch: 3 27 | 28 | train_dataloader: 29 | dataset: 30 | transforms: 31 | policy: 32 | epoch: [4, 64, 117] # list 33 | 34 | collate_fn: 35 | mixup_epochs: [4, 64] 36 | stop_epoch: 117 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r50vd_6x_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./outputs/deim_rtdetrv2_r50vd_60e_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm).*$' 13 | lr: 0.00002 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 
17 | 18 | lr: 0.0002 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | # change part 23 | epoches: 60 24 | flat_epoch: 34 # 4 + 60 / 2 25 | no_aug_epoch: 2 26 | 27 | train_dataloader: 28 | dataset: 29 | transforms: 30 | policy: 31 | epoch: [4, 34, 58] # list 32 | 33 | collate_fn: 34 | mixup_epochs: [4, 34] 35 | stop_epoch: 58 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r50vd_m_7x_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./outputs/deim_rtdetrv2_r50vd_m_60e_coco 7 | 8 | RTDETRTransformerv2: 9 | eval_idx: 2 # use the 3rd decoder layer for eval 10 | num_layers: 3 11 | 12 | optimizer: 13 | type: AdamW 14 | params: 15 | - 16 | params: '^(?=.*backbone)(?!.*norm).*$' 17 | lr: 0.00002 18 | - 19 | params: '^(?=.*(?:norm|bn)).*$' 20 | weight_decay: 0. 21 | 22 | lr: 0.0002 23 | betas: [0.9, 0.999] 24 | weight_decay: 0.0001 25 | 26 | # change part 27 | epoches: 60 28 | flat_epoch: 34 # 4 + 60 / 2 29 | no_aug_epoch: 2 30 | 31 | train_dataloader: 32 | dataset: 33 | transforms: 34 | policy: 35 | epoch: [4, 34, 58] # list 36 | 37 | collate_fn: 38 | mixup_epochs: [4, 34] 39 | stop_epoch: 58 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/rtdetrv2_r101vd_6x_coco 11 | 12 | 13 | PResNet: 14 | depth: 101 15 | 16 | 17 | HybridEncoder: 18 | # intra 19 | hidden_dim: 384 20 | dim_feedforward: 2048 21 | 22 | 23 | RTDETRTransformerv2: 24 | feat_channels: [384, 384, 384] 25 | 26 | 27 | optimizer: 28 | type: AdamW 29 | params: 30 | - 31 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 32 | lr: 0.000001 33 | - 34 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # only encoder + decoder norm 35 | weight_decay: 0. 36 | 37 | lr: 0.0001 38 | betas: [0.9, 0.999] 39 | weight_decay: 0.0001 40 | 41 | -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | 10 | output_dir: ./output/rtdetrv2_r18vd_120e_coco 11 | 12 | 13 | PResNet: 14 | depth: 18 15 | freeze_at: -1 16 | freeze_norm: False 17 | pretrained: True 18 | 19 | HybridEncoder: 20 | in_channels: [128, 256, 512] 21 | hidden_dim: 256 22 | expansion: 0.5 23 | 24 | RTDETRTransformerv2: 25 | num_layers: 3 26 | 27 | 28 | epoches: 120 29 | 30 | optimizer: 31 | type: AdamW 32 | params: 33 | - 34 | params: '^(?=.*(?:norm|bn)).*$' 35 | weight_decay: 0.
36 | 37 | 38 | train_dataloader: 39 | dataset: 40 | transforms: 41 | policy: 42 | epoch: 117 43 | collate_fn: 44 | scales: ~ -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/rtdetrv2_r34vd_120e_coco 11 | 12 | 13 | PResNet: 14 | depth: 34 15 | freeze_at: -1 16 | freeze_norm: False 17 | pretrained: True 18 | 19 | 20 | HybridEncoder: 21 | in_channels: [128, 256, 512] 22 | hidden_dim: 256 23 | expansion: 0.5 24 | 25 | 26 | RTDETRTransformerv2: 27 | num_layers: 4 28 | 29 | 30 | epoches: 120 31 | 32 | optimizer: 33 | type: AdamW 34 | params: 35 | - 36 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 37 | lr: 0.00005 38 | - 39 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 40 | lr: 0.00005 41 | weight_decay: 0. 42 | - 43 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 44 | weight_decay: 0. 45 | 46 | lr: 0.0001 47 | betas: [0.9, 0.999] 48 | weight_decay: 0.0001 49 | 50 | 51 | train_dataloader: 52 | dataset: 53 | transforms: 54 | policy: 55 | epoch: 117 56 | collate_fn: 57 | stop_epoch: 117 58 | -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/rtdetrv2_r50vd_6x_coco 11 | 12 | 13 | optimizer: 14 | type: AdamW 15 | params: 16 | - 17 | params: '^(?=.*backbone)(?!.*norm).*$' 18 | lr: 0.00001 19 | - 20 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 21 | weight_decay: 0. 22 | 23 | lr: 0.0001 24 | betas: [0.9, 0.999] 25 | weight_decay: 0.0001 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | output_dir: ./outputs/rtdetrv2_r50vd_m_6x_coco 10 | 11 | 12 | HybridEncoder: 13 | expansion: 0.5 14 | 15 | 16 | RTDETRTransformerv2: 17 | eval_idx: 2 # use the 3rd decoder layer for eval 18 | 19 | 20 | epoches: 84 21 | 22 | optimizer: 23 | type: AdamW 24 | params: 25 | - 26 | params: '^(?=.*backbone)(?!.*norm).*$' 27 | lr: 0.00001 28 | - 29 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 30 | weight_decay: 0.
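# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): `eval_idx`
# selects which decoder layer's predictions are used at inference.
# `eval_idx: 2` in this file reads the 3rd of the 6 decoder layers, so the
# deeper layers serve only as auxiliary supervision during training and can
# be dropped at deploy time for the faster "m" model; `eval_idx: -1`
# (used elsewhere) takes the last layer.
# ---------------------------------------------------------------------------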
31 | 32 | lr: 0.0001 33 | betas: [0.9, 0.999] 34 | weight_decay: 0.0001 35 | 36 | 37 | train_dataloader: 38 | dataset: 39 | transforms: 40 | policy: 41 | epoch: 81 42 | collate_fn: 43 | stop_epoch: 81 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_dinov3_l_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/deimv2_dinov3_l_coco 11 | 12 | DEIM: 13 | backbone: DINOv3STAs 14 | 15 | DINOv3STAs: 16 | name: dinov3_vits16 17 | weights_path: ./ckpts/dinov3_vits16_pretrain_lvd1689m-08c60483.pth 18 | interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32] 19 | finetune: True 20 | conv_inplane: 32 21 | hidden_dim: 224 22 | 23 | HybridEncoder: 24 | in_channels: [224, 224, 224] 25 | hidden_dim: 224 26 | dim_feedforward: 896 27 | 28 | DEIMTransformer: 29 | feat_channels: [224, 224, 224] 30 | hidden_dim: 224 31 | num_layers: 4 32 | eval_idx: -1 33 | dim_feedforward: 1792 34 | 35 | ## DEIM LR-Scheduler 36 | epoches: 68 # 72 + 2n # Increase to search for the optimal ema 37 | 38 | lrsheduler: flatcosine 39 | lr_gamma: 0.5 40 | warmup_iter: 2000 41 | flat_epoch: 34 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 42 | no_aug_epoch: 8 43 | 44 | ## Optimizer 45 | optimizer: 46 | type: AdamW 47 | params: 48 | - 49 | # except norm/bn/bias in self.dinov3 50 | params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' 51 | lr: 0.0000125 52 | - 53 | # including norm/bn/bias in self.dinov3 54 | params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' 55 | lr: 0.0000125 56 | weight_decay: 0. 57 | - 58 | # including norm/bn/bias except for the self.dinov3 59 | params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 60 | weight_decay: 0. 
61 | 62 | lr: 0.0005 63 | betas: [0.9, 0.999] 64 | weight_decay: 0.000125 65 | 66 | 67 | ## Dense O2O: Mosaic + Mixup + CopyBlend 68 | train_dataloader: 69 | dataset: 70 | transforms: 71 | ops: 72 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 73 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 74 | - {type: RandomPhotometricDistort, p: 0.5} 75 | - {type: RandomZoomOut, fill: 0} 76 | - {type: RandomIoUCrop, p: 0.8} 77 | - {type: SanitizeBoundingBoxes, min_size: 1} 78 | - {type: RandomHorizontalFlip} 79 | - {type: Resize, size: [640, 640], } 80 | - {type: SanitizeBoundingBoxes, min_size: 1} 81 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 82 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 83 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 84 | policy: 85 | epoch: [4, 34, 60] # list 86 | 87 | collate_fn: 88 | mixup_epochs: [4, 34] 89 | stop_epoch: 60 90 | copyblend_epochs: [4, 60] 91 | base_size_repeat: 3 92 | 93 | val_dataloader: 94 | dataset: 95 | transforms: 96 | ops: 97 | - {type: Resize, size: [640, 640], } 98 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 99 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 100 | 101 | ## DEIM Loss 102 | DEIMCriterion: 103 | matcher: 104 | matcher_change_epoch: 50 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_dinov3_m_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_dinov3_m_coco 10 | 11 | DEIM: 12 | backbone: DINOv3STAs 13 | 14 | DINOv3STAs: 15 | name: vit_tinyplus 16 | embed_dim: 256 17 | weights_path: ./ckpts/vittplus_distill.pt 18 | interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32] 19 | num_heads: 4 20 | 21 | HybridEncoder: 22 | in_channels: [256, 256, 256] 23 | depth_mult: 1 24 | expansion: 0.67 25 | hidden_dim: 256 26 | dim_feedforward: 512 27 | 28 | 29 | DEIMTransformer: 30 | feat_channels: [256, 256, 256] 31 | hidden_dim: 256 32 | dim_feedforward: 512 33 | num_layers: 4 # 4 5 6 34 | eval_idx: -1 # -2 -3 -4 35 | 36 | optimizer: 37 | type: AdamW 38 | 39 | params: 40 | - 41 | # except norm/bn/bias in self.dinov3 42 | params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' 43 | lr: 0.000025 44 | - 45 | # including norm/bn/bias in self.dinov3 46 | params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' 47 | lr: 0.000025 48 | weight_decay: 0. 49 | - 50 | # including norm/bn/bias except for the self.dinov3 51 | params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 52 | weight_decay: 0. 
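# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): unlike the
# HGNetv2 recipes, every DINOv3-backbone config adds a `Normalize` op with
# ImageNet mean/std to both the train and val pipelines, presumably because
# the ViT weights were pretrained on normalized inputs. Any custom inference
# script for these models must apply the same normalization.
# ---------------------------------------------------------------------------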
53 | 54 | lr: 0.0005 55 | betas: [0.9, 0.999] 56 | weight_decay: 0.0001 57 | 58 | epoches: 102 # 120 + 4n 59 | 60 | ## Our LR-Scheduler 61 | flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 62 | no_aug_epoch: 12 63 | 64 | 65 | ## Our DataAug 66 | train_dataloader: 67 | dataset: 68 | transforms: 69 | ops: 70 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 71 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 72 | - {type: RandomPhotometricDistort, p: 0.5} 73 | - {type: RandomZoomOut, fill: 0} 74 | - {type: RandomIoUCrop, p: 0.8} 75 | - {type: SanitizeBoundingBoxes, min_size: 1} 76 | - {type: RandomHorizontalFlip} 77 | - {type: Resize, size: [640, 640], } 78 | - {type: SanitizeBoundingBoxes, min_size: 1} 79 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 80 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 81 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 82 | policy: 83 | epoch: [4, 49, 90] # list 84 | 85 | collate_fn: 86 | mixup_prob: 0.5 87 | ema_restart_decay: 0.9999 88 | base_size_repeat: 6 89 | mixup_epochs: [4, 49] 90 | stop_epoch: 90 91 | copyblend_epochs: [4, 90] 92 | 93 | 94 | val_dataloader: 95 | dataset: 96 | transforms: 97 | ops: 98 | - {type: Resize, size: [640, 640], } 99 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 100 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 101 | 102 | DEIMCriterion: 103 | matcher: 104 | # new matcher 105 | change_matcher: True 106 | iou_order_alpha: 4.0 107 | matcher_change_epoch: 80 108 | -------------------------------------------------------------------------------- /configs/deimv2/deimv2_dinov3_s_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_dinov3_s_coco 10 | 11 | DEIM: 12 | backbone: DINOv3STAs 13 | 14 | DINOv3STAs: 15 | name: vit_tiny 16 | embed_dim: 192 17 | weights_path: ./ckpts/vitt_distill.pt 18 | interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32] 19 | num_heads: 3 20 | 21 | HybridEncoder: 22 | in_channels: [192, 192, 192] 23 | depth_mult: 0.67 24 | expansion: 0.34 25 | hidden_dim: 192 26 | dim_feedforward: 512 27 | 28 | DEIMTransformer: 29 | feat_channels: [192, 192, 192] 30 | hidden_dim: 192 31 | dim_feedforward: 512 32 | num_layers: 4 # 4 5 6 33 | eval_idx: -1 # -2 -3 -4 34 | 35 | 36 | ## Optimizer 37 | optimizer: 38 | type: AdamW 39 | 40 | params: 41 | - 42 | # except norm/bn/bias in self.dinov3 43 | params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' 44 | lr: 0.000025 45 | - 46 | # including all norm/bn/bias in self.dinov3 47 | params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' 48 | lr: 0.000025 49 | weight_decay: 0. 50 | - 51 | # including all norm/bn/bias except for the self.dinov3 52 | params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 53 | weight_decay: 0. 
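# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): a plain ViT
# is single-scale, so `interaction_indexes` picks three transformer blocks
# (here [3, 7, 11] of the 12-block vit_tiny) whose outputs are adapted into
# the 1/8, 1/16 and 1/32 feature maps the HybridEncoder expects, per the
# inline comment above; the adapter code is in
# engine/backbone/dinov3_adapter.py.
# ---------------------------------------------------------------------------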
54 | 55 | lr: 0.0005 56 | betas: [0.9, 0.999] 57 | weight_decay: 0.0001 58 | 59 | # Increase to search for the optimal ema 60 | epoches: 132 # 120 + 4n 61 | 62 | ## Our LR-Scheduler 63 | flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 64 | no_aug_epoch: 12 65 | 66 | ## Our DataAug 67 | train_dataloader: 68 | dataset: 69 | transforms: 70 | ops: 71 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 72 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 73 | - {type: RandomPhotometricDistort, p: 0.5} 74 | - {type: RandomZoomOut, fill: 0} 75 | - {type: RandomIoUCrop, p: 0.8} 76 | - {type: SanitizeBoundingBoxes, min_size: 1} 77 | - {type: RandomHorizontalFlip} 78 | - {type: Resize, size: [640, 640], } 79 | - {type: SanitizeBoundingBoxes, min_size: 1} 80 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 81 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 82 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 83 | policy: 84 | epoch: [4, 64, 120] # list 85 | 86 | collate_fn: 87 | base_size: 640 88 | mixup_prob: 0.5 89 | ema_restart_decay: 0.9999 90 | base_size_repeat: 20 91 | mixup_epochs: [4, 64] 92 | stop_epoch: 120 93 | copyblend_epochs: [4, 120] 94 | 95 | val_dataloader: 96 | dataset: 97 | transforms: 98 | ops: 99 | - {type: Resize, size: [640, 640], } 100 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 101 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 102 | 103 | DEIMCriterion: 104 | matcher: 105 | # change matcher 106 | change_matcher: True 107 | iou_order_alpha: 4.0 108 | matcher_change_epoch: 100 109 | -------------------------------------------------------------------------------- /configs/deimv2/deimv2_dinov3_x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/deimv2_dinov3_x_coco 11 | 12 | DEIM: 13 | backbone: DINOv3STAs 14 | 15 | DINOv3STAs: 16 | name: dinov3_vits16plus 17 | weights_path: ./ckpts/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth 18 | interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32] 19 | finetune: True 20 | conv_inplane: 64 21 | hidden_dim: 256 22 | 23 | HybridEncoder: 24 | in_channels: [256, 256, 256] 25 | # intra 26 | hidden_dim: 256 27 | dim_feedforward: 1024 28 | 29 | # cross 30 | expansion: 1.25 31 | depth_mult: 1.37 32 | 33 | DEIMTransformer: 34 | num_layers: 6 35 | eval_idx: -1 36 | feat_channels: [256, 256, 256] 37 | # reg_scale: 8 38 | hidden_dim: 256 39 | dim_feedforward: 2048 40 | 41 | optimizer: 42 | type: AdamW 43 | params: 44 | - 45 | # except norm/bn/bias in self.dinov3 46 | params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' 47 | lr: 0.00001 48 | - 49 | # including norm/bn/bias in self.dinov3 50 | params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' 51 | lr: 0.00001 52 | weight_decay: 0. 53 | - 54 | # including norm/bn/bias except for the self.dinov3 55 | params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 56 | weight_decay: 0. 
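# In the Dense O2O block below, `policy.epoch: [4, 29, 50]` stages the strong
# augmentations over training: off during the first 4 warmup epochs, on
# through the middle of the run, and disabled from epoch 50 onward (mirroring
# `mixup_epochs: [4, 29]` and `stop_epoch: 50`). A hedged sketch of the window
# test only -- the real policy lives in the dataloader/collate code:
#
#   def in_aug_window(epoch, start=4, stop=50):
#       return start <= epoch < stop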
57 | 58 | lr: 0.0005 59 | betas: [0.9, 0.999] 60 | weight_decay: 0.000125 61 | 62 | ## Dense O2O: Mosaic + Mixup + CopyBlend 63 | train_dataloader: 64 | dataset: 65 | transforms: 66 | ops: 67 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 68 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 69 | - {type: RandomPhotometricDistort, p: 0.5} 70 | - {type: RandomZoomOut, fill: 0} 71 | - {type: RandomIoUCrop, p: 0.8} 72 | - {type: SanitizeBoundingBoxes, min_size: 1} 73 | - {type: RandomHorizontalFlip} 74 | - {type: Resize, size: [640, 640], } 75 | - {type: SanitizeBoundingBoxes, min_size: 1} 76 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 77 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 78 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 79 | policy: 80 | epoch: [4, 29, 50] # list 81 | 82 | collate_fn: 83 | mixup_epochs: [4, 29] 84 | stop_epoch: 50 85 | copyblend_epochs: [4, 50] 86 | base_size_repeat: 3 87 | 88 | val_dataloader: 89 | dataset: 90 | transforms: 91 | ops: 92 | - {type: Resize, size: [640, 640], } 93 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 94 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_atto_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_atto_coco 10 | 11 | DEIM: 12 | encoder: LiteEncoder 13 | 14 | HGNetv2: 15 | name: 'Atto' 16 | return_idx: [2] 17 | freeze_at: -1 18 | freeze_norm: False 19 | use_lab: True 20 | 21 | LiteEncoder: 22 | in_channels: [256] 23 | feat_strides: [16] 24 | # intra 25 | hidden_dim: 64 26 | 27 | # cross 28 | expansion: 0.34 29 | depth_mult: 0.5 30 | act: 'silu' 31 | 32 | 33 | DEIMTransformer: 34 | feat_channels: [64, 64] 35 | feat_strides: [16, 32] 36 | hidden_dim: 64 37 | num_levels: 2 38 | num_points: [4, 2] 39 | 40 | num_layers: 3 41 | eval_idx: -1 42 | num_queries: 100 43 | 44 | # FFN 45 | dim_feedforward: 160 46 | 47 | # New options for DEIMv2 48 | share_bbox_head: True 49 | use_gateway: False 50 | 51 | # Increase to search for the optimal ema 52 | epoches: 500 # 468 + 32 53 | 54 | ## Our LR-Scheduler 55 | warmup_iter: 4000 56 | flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 57 | no_aug_epoch: 32 58 | lr_gamma: 0.5 59 | 60 | optimizer: 61 | type: AdamW 62 | params: 63 | - params: '^(?=.*backbone)(?!.*norm|bn).*$' 64 | lr: 0.001 65 | - params: '^(?=.*backbone)(?=.*norm|bn).*$' 66 | lr: 0.001 67 | weight_decay: 0. 68 | - params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # except bias 69 | weight_decay: 0. 
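# Sizing note for the pipeline below: Mosaic's `output_size: 160` is half of
# the final `Resize: [320, 320]`, so a 2x2 grid of 160-px mosaic tiles fills
# the 320-px training canvas exactly (4 * 160^2 = 320^2). The same 2x ratio
# holds in the other configs (320 -> 640 for the 640-px models), which is why
# this reading -- `output_size` as the per-tile size -- seems likely; it is an
# inference from the configs, not a documented contract.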
70 | 71 | lr: 0.002 72 | betas: [0.9, 0.999] 73 | weight_decay: 0.0001 74 | 75 | eval_spatial_size: [320, 320] 76 | train_dataloader: 77 | total_batch_size: 128 78 | dataset: 79 | transforms: 80 | ops: 81 | - {type: Mosaic, output_size: 160, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 82 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 83 | - {type: RandomPhotometricDistort, p: 0.5} 84 | - {type: RandomZoomOut, fill: 0} 85 | - {type: RandomIoUCrop, p: 0.8} 86 | - {type: SanitizeBoundingBoxes, min_size: 12} 87 | - {type: RandomHorizontalFlip} 88 | - {type: Resize, size: [320, 320], } 89 | - {type: SanitizeBoundingBoxes, min_size: 12} 90 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 91 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 92 | policy: 93 | epoch: [4, 250, 400] # list 94 | mosaic_prob: 0.3 95 | 96 | collate_fn: 97 | mixup_prob: 0.0 98 | mixup_epochs: [40000, 15000] 99 | copyblend_prob: 0.0 100 | copyblend_epochs: [40000, 15000] 101 | 102 | stop_epoch: 468 # 468 + 32 103 | ema_restart_decay: 0.9999 104 | base_size: 320 105 | base_size_repeat: ~ 106 | 107 | val_dataloader: 108 | total_batch_size: 256 109 | dataset: 110 | transforms: 111 | ops: 112 | - {type: Resize, size: [320, 320], } 113 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 114 | shuffle: False 115 | num_workers: 16 116 | 117 | 118 | DEIMCriterion: 119 | losses: ['mal', 'boxes'] # , 'local' 120 | use_uni_set: False 121 | 122 | matcher: 123 | matcher_change_epoch: 450 # FIX This -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_femto_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_femto_coco 10 | 11 | DEIM: 12 | encoder: LiteEncoder 13 | 14 | HGNetv2: 15 | name: 'Femto' 16 | return_idx: [2] 17 | freeze_at: -1 18 | freeze_norm: False 19 | use_lab: True 20 | 21 | LiteEncoder: 22 | in_channels: [512] 23 | feat_strides: [16] 24 | 25 | # intra 26 | hidden_dim: 96 27 | 28 | # cross 29 | expansion: 0.34 30 | depth_mult: 0.5 31 | act: 'silu' 32 | 33 | 34 | DEIMTransformer: 35 | feat_channels: [96, 96] 36 | feat_strides: [16, 32] 37 | hidden_dim: 96 38 | num_levels: 2 39 | num_points: [4, 2] 40 | 41 | num_layers: 3 42 | eval_idx: -1 43 | num_queries: 150 44 | 45 | # FFN 46 | dim_feedforward: 256 47 | 48 | # New options for DEIMv2 49 | share_bbox_head: True 50 | use_gateway: False 51 | 52 | # Increase to search for the optimal ema 53 | epoches: 500 # 468 + 32 54 | 55 | ## Our LR-Scheduler 56 | warmup_iter: 4000 57 | flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 58 | no_aug_epoch: 32 59 | lr_gamma: 0.5 60 | 61 | optimizer: 62 | type: AdamW 63 | params: 64 | - 65 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 66 | lr: 0.0008 67 | - 68 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 69 | lr: 0.0008 70 | weight_decay: 0. 71 | - # not opt 72 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 73 | weight_decay: 0. 
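# In the `collate_fn` further below, MixUp and CopyBlend are switched off two
# ways at once: their probabilities are 0.0 and their epoch windows
# ([40000, 15000]) sit far beyond the 500-epoch run, so neither op can ever
# fire. The smallest variants lean on Mosaic alone (mosaic_prob: 0.5).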
74 | 75 | lr: 0.0016 76 | betas: [0.9, 0.999] 77 | weight_decay: 0.0001 78 | 79 | eval_spatial_size: [416, 416] 80 | train_dataloader: 81 | total_batch_size: 128 82 | dataset: 83 | transforms: 84 | ops: 85 | - {type: Mosaic, output_size: 208, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 86 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 87 | - {type: RandomPhotometricDistort, p: 0.5} 88 | - {type: RandomZoomOut, fill: 0} 89 | - {type: RandomIoUCrop, p: 0.8} 90 | - {type: SanitizeBoundingBoxes, min_size: 10} 91 | - {type: RandomHorizontalFlip} 92 | - {type: Resize, size: [416, 416], } 93 | - {type: SanitizeBoundingBoxes, min_size: 10} 94 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 95 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 96 | policy: 97 | epoch: [4, 250, 400] # list 98 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 99 | mosaic_prob: 0.5 100 | 101 | collate_fn: 102 | mixup_prob: 0.0 103 | mixup_epochs: [40000, 15000] 104 | copyblend_prob: 0.0 105 | copyblend_epochs: [40000, 15000] 106 | 107 | stop_epoch: 468 # 468 + 32 108 | ema_restart_decay: 0.9999 109 | base_size: 416 110 | base_size_repeat: ~ 111 | 112 | val_dataloader: 113 | total_batch_size: 256 114 | dataset: 115 | transforms: 116 | ops: 117 | - {type: Resize, size: [416, 416], } 118 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 119 | shuffle: False 120 | num_workers: 16 121 | 122 | 123 | DEIMCriterion: 124 | losses: ['mal', 'boxes'] # , 'local' 125 | use_uni_set: False 126 | 127 | matcher: 128 | matcher_change_epoch: 450 # FIX This -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_l_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_l_coco 10 | 11 | 12 | optimizer: 13 | type: AdamW 14 | params: 15 | - 16 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 17 | lr: 0.000025 18 | - 19 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 20 | weight_decay: 0. 21 | 22 | lr: 0.0005 23 | betas: [0.9, 0.999] 24 | weight_decay: 0.000125 25 | -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_m_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_m_coco 10 | 11 | HGNetv2: 12 | name: 'B2' 13 | return_idx: [1, 2, 3] 14 | freeze_at: -1 15 | freeze_norm: False 16 | use_lab: True 17 | 18 | HybridEncoder: 19 | in_channels: [384, 768, 1536] 20 | hidden_dim: 256 21 | depth_mult: 0.67 22 | 23 | DEIMTransformer: 24 | num_layers: 4 # 5 6 25 | eval_idx: -1 # -2 -3 26 | 27 | optimizer: 28 | type: AdamW 29 | params: 30 | - 31 | params: '^(?=.*backbone)(?!.*bn).*$' 32 | lr: 0.00004 33 | - 34 | params: '^(?=.*(?:norm|bn)).*$' 35 | weight_decay: 0. 
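# CopyBlend, configured via `collate_fn` below (copyblend_prob,
# area_threshold, num_objects, with_expand, expand_ratios), pastes objects
# between images in a batch. A rough sketch under stated assumptions -- the
# real implementation lives in the collate code, and every name here is
# hypothetical:
#
#   import random
#   def copy_blend(dst_img, src_img, src_boxes, num_objects=3,
#                  area_threshold=100, expand_ratios=(0.1, 0.25)):
#       k = min(num_objects, len(src_boxes))
#       for x1, y1, x2, y2 in random.sample(list(src_boxes), k):
#           if (x2 - x1) * (y2 - y1) < area_threshold:
#               continue  # drop tiny source objects
#           r = random.uniform(*expand_ratios)  # with_expand: pad the crop
#           dx, dy = r * (x2 - x1), r * (y2 - y1)
#           patch = src_img.crop((x1 - dx, y1 - dy, x2 + dx, y2 + dy))
#           dst_img.paste(patch, (int(x1 - dx), int(y1 - dy)))
#       return dst_img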
36 | 37 | lr: 0.0004 38 | betas: [0.9, 0.999] 39 | weight_decay: 0.0001 40 | 41 | # Increase to search for the optimal ema 42 | epoches: 102 # 120 + 4n 43 | 44 | ## Our LR-Scheduler 45 | flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 46 | no_aug_epoch: 12 47 | 48 | ## Our DataAug 49 | train_dataloader: 50 | dataset: 51 | transforms: 52 | policy: 53 | epoch: [4, 49, 90] # list 54 | 55 | collate_fn: 56 | ema_restart_decay: 0.9999 57 | base_size_repeat: 6 58 | mixup_epochs: [4, 49] 59 | stop_epoch: 90 60 | copyblend_prob: 0.5 61 | copyblend_epochs: [4, 90] 62 | area_threshold: 100 63 | num_objects: 3 64 | with_expand: True 65 | expand_ratios: [0.1, 0.25] 66 | 67 | DEIMCriterion: 68 | matcher: 69 | # new matcher 70 | change_matcher: True 71 | iou_order_alpha: 4.0 72 | matcher_change_epoch: 80 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_n_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_n_coco 10 | 11 | HGNetv2: 12 | name: 'B0' 13 | return_idx: [2, 3] 14 | freeze_at: -1 15 | freeze_norm: False 16 | use_lab: True 17 | 18 | HybridEncoder: 19 | in_channels: [512, 1024] 20 | feat_strides: [16, 32] 21 | 22 | # intra 23 | hidden_dim: 128 24 | use_encoder_idx: [1] 25 | dim_feedforward: 512 26 | 27 | # cross 28 | expansion: 0.34 29 | depth_mult: 0.5 30 | 31 | version: 'dfine' 32 | 33 | DEIMTransformer: 34 | feat_channels: [128, 128] 35 | feat_strides: [16, 32] 36 | hidden_dim: 128 37 | num_levels: 2 38 | num_points: [6, 6] 39 | 40 | num_layers: 3 41 | eval_idx: -1 42 | 43 | # FFN 44 | dim_feedforward: 512 45 | 46 | optimizer: 47 | type: AdamW 48 | params: 49 | - 50 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 51 | lr: 0.0004 52 | - 53 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 54 | lr: 0.0004 55 | weight_decay: 0. 56 | - 57 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 58 | weight_decay: 0. 
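# Architecture note (a reading of the values above, not a measured claim):
# this N variant runs a two-level pyramid -- feat_strides [16, 32],
# num_levels 2 -- dropping the stride-8 map that the larger models keep, and
# samples num_points [6, 6] deformable points per level. That trades some
# small-object recall for a substantial cut in encoder/decoder compute.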
59 | 60 | lr: 0.0008 61 | betas: [0.9, 0.999] 62 | weight_decay: 0.0001 63 | 64 | # Increase to search for the optimal ema 65 | epoches: 160 # 148 + 12 66 | 67 | ## Our LR-Scheduler 68 | flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 69 | no_aug_epoch: 12 70 | lr_gamma: 1.0 71 | 72 | ## Our DataAug 73 | train_dataloader: 74 | dataset: 75 | transforms: 76 | policy: 77 | epoch: [4, 78, 148] # list 78 | 79 | collate_fn: 80 | ema_restart_decay: 0.9999 81 | base_size_repeat: ~ 82 | mixup_epochs: [4, 78] 83 | stop_epoch: 148 84 | copyblend_prob: 0.4 85 | copyblend_epochs: [4, 78] # CP half 86 | area_threshold: 100 87 | num_objects: 3 88 | with_expand: True 89 | expand_ratios: [0.1, 0.25] 90 | 91 | DEIMCriterion: 92 | matcher: 93 | # new matcher 94 | change_matcher: True 95 | iou_order_alpha: 4.0 96 | matcher_change_epoch: 136 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_pico_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_pico_coco 10 | 11 | DEIM: 12 | encoder: LiteEncoder 13 | decoder: DEIMTransformer 14 | 15 | HGNetv2: 16 | name: 'Pico' 17 | return_idx: [2] 18 | freeze_at: -1 19 | freeze_norm: False 20 | use_lab: True 21 | 22 | LiteEncoder: 23 | in_channels: [512] 24 | feat_strides: [16] 25 | 26 | # intra 27 | hidden_dim: 112 28 | 29 | # cross 30 | expansion: 0.34 31 | depth_mult: 0.5 32 | act: 'silu' 33 | 34 | 35 | DEIMTransformer: 36 | feat_channels: [112, 112] 37 | feat_strides: [16, 32] 38 | hidden_dim: 112 39 | num_levels: 2 40 | num_points: [4, 2] 41 | 42 | num_layers: 3 43 | eval_idx: -1 44 | num_queries: 200 45 | 46 | # FFN 47 | dim_feedforward: 320 48 | 49 | # New options for DEIMv2 50 | share_bbox_head: True 51 | use_gateway: False 52 | 53 | # Increase to search for the optimal ema 54 | epoches: 500 # 468 + 32 55 | 56 | ## Our LR-Scheduler 57 | warmup_iter: 4000 58 | flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 59 | no_aug_epoch: 32 60 | lr_gamma: 0.5 61 | 62 | optimizer: 63 | type: AdamW 64 | params: 65 | - 66 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 67 | lr: 0.0008 68 | - 69 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 70 | lr: 0.0008 71 | weight_decay: 0. 72 | - # not opt 73 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 74 | weight_decay: 0. 
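# Across the tiny variants, `SanitizeBoundingBoxes.min_size` shrinks as the
# training canvas grows: 12 px at 320 (atto), 10 px at 416 (femto), 8 px at
# 640 (pico, below). Relative to image size that is 3.8% -> 2.4% -> 1.3%, so
# the higher-resolution models keep proportionally smaller boxes after
# augmentation (an inference from the three configs, not a documented rule).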
75 | 76 | lr: 0.0016 77 | betas: [0.9, 0.999] 78 | weight_decay: 0.0001 79 | 80 | eval_spatial_size: [640, 640] 81 | train_dataloader: 82 | total_batch_size: 128 83 | dataset: 84 | transforms: 85 | ops: 86 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 87 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 88 | - {type: RandomPhotometricDistort, p: 0.5} 89 | - {type: RandomZoomOut, fill: 0} 90 | - {type: RandomIoUCrop, p: 0.8} 91 | - {type: SanitizeBoundingBoxes, min_size: 8} 92 | - {type: RandomHorizontalFlip} 93 | - {type: Resize, size: [640, 640], } 94 | - {type: SanitizeBoundingBoxes, min_size: 8} 95 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 96 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 97 | policy: 98 | epoch: [4, 250, 400] # list 99 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 100 | mosaic_prob: 0.5 101 | 102 | collate_fn: 103 | mixup_prob: 0.0 104 | mixup_epochs: [40000, 15000] 105 | copyblend_prob: 0.0 106 | copyblend_epochs: [40000, 15000] 107 | stop_epoch: 468 # 468 + 32 108 | ema_restart_decay: 0.9999 109 | base_size: 640 110 | base_size_repeat: ~ 111 | 112 | val_dataloader: 113 | total_batch_size: 256 114 | dataset: 115 | transforms: 116 | ops: 117 | - {type: Resize, size: [640, 640], } 118 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 119 | shuffle: False 120 | num_workers: 16 121 | 122 | 123 | DEIMCriterion: 124 | losses: ['mal', 'boxes'] # , 'local' 125 | use_uni_set: False 126 | 127 | matcher: 128 | matcher_change_epoch: 450 # FIX This -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_s_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_s_coco 10 | 11 | HGNetv2: 12 | name: 'B0' 13 | return_idx: [1, 2, 3] 14 | freeze_at: -1 15 | freeze_norm: False 16 | use_lab: True 17 | 18 | HybridEncoder: 19 | in_channels: [256, 512, 1024] 20 | hidden_dim: 256 21 | depth_mult: 0.34 22 | expansion: 0.5 23 | 24 | version: 'dfine' 25 | 26 | DEIMTransformer: 27 | num_layers: 3 # 4 5 6 28 | eval_idx: -1 # -2 -3 -4 29 | 30 | optimizer: 31 | type: AdamW 32 | params: 33 | - 34 | params: '^(?=.*backbone)(?!.*bn).*$' 35 | lr: 0.0002 36 | - 37 | params: '^(?=.*(?:norm|bn)).*$' # except bias 38 | weight_decay: 0. 
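# The DEIMCriterion block below swaps the matcher late in training
# (`change_matcher: True`, `matcher_change_epoch: 100`): standard Hungarian
# matching first, then an IoU-ordered variant weighted by
# `iou_order_alpha: 4.0`. A hedged sketch of the epoch gate only; the cost
# terms themselves live in engine/deim/matcher.py:
#
#   def pick_matcher(epoch, change_matcher=True, change_epoch=100):
#       return 'iou_ordered' if change_matcher and epoch >= change_epoch else 'hungarian'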
39 | 40 | lr: 0.0004 41 | betas: [0.9, 0.999] 42 | weight_decay: 0.0001 43 | 44 | # Increase to search for the optimal ema 45 | epoches: 132 # 120 + 4n 46 | 47 | ## Our LR-Scheduler 48 | flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 49 | no_aug_epoch: 12 50 | 51 | ## Our DataAug 52 | train_dataloader: 53 | dataset: 54 | transforms: 55 | policy: 56 | epoch: [4, 64, 120] # list 57 | 58 | collate_fn: 59 | ema_restart_decay: 0.9999 60 | base_size_repeat: 20 61 | mixup_epochs: [4, 64] 62 | stop_epoch: 120 63 | copyblend_prob: 0.5 64 | # copyblend_epochs: [4, 64] # from v11 to v12: copy-paste continues only half epochs 65 | copyblend_epochs: [4, 120] 66 | area_threshold: 100 67 | num_objects: 3 68 | with_expand: True 69 | expand_ratios: [0.1, 0.25] 70 | 71 | DEIMCriterion: 72 | matcher: 73 | # new matcher 74 | change_matcher: True 75 | iou_order_alpha: 4.0 76 | matcher_change_epoch: 100 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_x_coco 10 | 11 | 12 | HGNetv2: 13 | name: 'B5' 14 | return_idx: [1, 2, 3] 15 | freeze_stem_only: True 16 | freeze_at: 0 17 | freeze_norm: True 18 | 19 | HybridEncoder: 20 | # intra 21 | hidden_dim: 384 22 | dim_feedforward: 2048 23 | 24 | DEIMTransformer: 25 | feat_channels: [384, 384, 384] # [256, 256, 256] 26 | reg_scale: 8 # 4 27 | 28 | # FFN 29 | dim_feedforward: 2048 30 | 31 | optimizer: 32 | type: AdamW 33 | params: 34 | - 35 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 36 | lr: 0.000005 37 | - 38 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 39 | weight_decay: 0. 40 | 41 | lr: 0.0005 42 | betas: [0.9, 0.999] 43 | weight_decay: 0.000125 44 | 45 | # Increase to search for the optimal ema 46 | epoches: 58 # 72 + 2n 47 | 48 | ## Our LR-Scheduler 49 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 50 | no_aug_epoch: 8 51 | 52 | train_dataloader: 53 | dataset: 54 | transforms: 55 | policy: 56 | epoch: [4, 29, 50] # list 57 | 58 | collate_fn: 59 | ema_restart_decay: 0.9998 60 | base_size_repeat: 3 61 | -------------------------------------------------------------------------------- /configs/runtime.yml: -------------------------------------------------------------------------------- 1 | print_freq: 500 2 | output_dir: './logs' 3 | checkpoint_freq: 12 4 | 5 | 6 | sync_bn: True 7 | find_unused_parameters: True 8 | 9 | 10 | use_amp: False 11 | scaler: 12 | type: GradScaler 13 | enabled: True 14 | 15 | 16 | use_ema: False 17 | ema: 18 | type: ModelEMA 19 | decay: 0.9999 20 | warmups: 1000 21 | -------------------------------------------------------------------------------- /engine/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 3 | """ 4 | 5 | # for register purpose 6 | from . import optim 7 | from . import data 8 | from . 
import deim 9 | 10 | from .backbone import * 11 | 12 | from .backbone import ( 13 | get_activation, 14 | FrozenBatchNorm2d, 15 | freeze_batch_norm2d, 16 | ) -------------------------------------------------------------------------------- /engine/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | from .common import ( 7 | get_activation, 8 | FrozenBatchNorm2d, 9 | freeze_batch_norm2d, 10 | ) 11 | from .presnet import PResNet 12 | from .test_resnet import MResNet 13 | 14 | from .timm_model import TimmModel 15 | from .torchvision_model import TorchVisionModel 16 | 17 | from .csp_resnet import CSPResNet 18 | from .csp_darknet import CSPDarkNet, CSPPAN 19 | 20 | from .hgnetv2 import HGNetv2 21 | 22 | from .dinov3_adapter import * 23 | -------------------------------------------------------------------------------- /engine/backbone/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class ConvNormLayer(nn.Module): 11 | def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None): 12 | super().__init__() 13 | self.conv = nn.Conv2d( 14 | ch_in, 15 | ch_out, 16 | kernel_size, 17 | stride, 18 | padding=(kernel_size-1)//2 if padding is None else padding, 19 | bias=bias) 20 | self.norm = nn.BatchNorm2d(ch_out) 21 | self.act = nn.Identity() if act is None else get_activation(act) 22 | 23 | def forward(self, x): 24 | return self.act(self.norm(self.conv(x))) 25 | 26 | 27 | class FrozenBatchNorm2d(nn.Module): 28 | """copy and modified from https://github.com/facebookresearch/detr/blob/master/models/backbone.py 29 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 30 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 31 | without which any other models than torchvision.models.resnet[18,34,50,101] 32 | produce nans. 
33 | """ 34 | def __init__(self, num_features, eps=1e-5): 35 | super(FrozenBatchNorm2d, self).__init__() 36 | n = num_features 37 | self.register_buffer("weight", torch.ones(n)) 38 | self.register_buffer("bias", torch.zeros(n)) 39 | self.register_buffer("running_mean", torch.zeros(n)) 40 | self.register_buffer("running_var", torch.ones(n)) 41 | self.eps = eps 42 | self.num_features = n 43 | 44 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 45 | missing_keys, unexpected_keys, error_msgs): 46 | num_batches_tracked_key = prefix + 'num_batches_tracked' 47 | if num_batches_tracked_key in state_dict: 48 | del state_dict[num_batches_tracked_key] 49 | 50 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 51 | state_dict, prefix, local_metadata, strict, 52 | missing_keys, unexpected_keys, error_msgs) 53 | 54 | def forward(self, x): 55 | # move reshapes to the beginning 56 | # to make it fuser-friendly 57 | w = self.weight.reshape(1, -1, 1, 1) 58 | b = self.bias.reshape(1, -1, 1, 1) 59 | rv = self.running_var.reshape(1, -1, 1, 1) 60 | rm = self.running_mean.reshape(1, -1, 1, 1) 61 | scale = w * (rv + self.eps).rsqrt() 62 | bias = b - rm * scale 63 | return x * scale + bias 64 | 65 | def extra_repr(self): 66 | return ( 67 | "{num_features}, eps={eps}".format(**self.__dict__) 68 | ) 69 | 70 | def freeze_batch_norm2d(module: nn.Module) -> nn.Module: 71 | if isinstance(module, nn.BatchNorm2d): 72 | module = FrozenBatchNorm2d(module.num_features) 73 | else: 74 | for name, child in module.named_children(): 75 | _child = freeze_batch_norm2d(child) 76 | if _child is not child: 77 | setattr(module, name, _child) 78 | return module 79 | 80 | 81 | def get_activation(act: str, inplace: bool=True): 82 | """get activation 83 | """ 84 | if act is None: 85 | return nn.Identity() 86 | 87 | elif isinstance(act, nn.Module): 88 | return act 89 | 90 | act = act.lower() 91 | 92 | if act == 'silu' or act == 'swish': 93 | m = nn.SiLU() 94 | 95 | elif act == 'relu': 96 | m = nn.ReLU() 97 | 98 | elif act == 'leaky_relu': 99 | m = nn.LeakyReLU() 100 | 101 | elif act == 'silu': 102 | m = nn.SiLU() 103 | 104 | elif act == 'gelu': 105 | m = nn.GELU() 106 | 107 | elif act == 'hardsigmoid': 108 | m = nn.Hardsigmoid() 109 | 110 | else: 111 | raise RuntimeError('') 112 | 113 | if hasattr(m, 'inplace'): 114 | m.inplace = inplace 115 | 116 | return m 117 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/__init__.py: -------------------------------------------------------------------------------- 1 | from .vision_transformer import DinoVisionTransformer -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 
5 | 6 | from .attention import CausalSelfAttention, LinearKMaskedBias, SelfAttention 7 | from .block import CausalSelfAttentionBlock, SelfAttentionBlock 8 | from .ffn_layers import Mlp, SwiGLUFFN 9 | from .fp8_linear import convert_linears_to_fp8 10 | from .layer_scale import LayerScale 11 | from .patch_embed import PatchEmbed 12 | from .rms_norm import RMSNorm 13 | from .rope_position_encoding import RopePositionEmbedding 14 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn.init import trunc_normal_ 9 | 10 | 11 | class DINOHead(nn.Module): 12 | def __init__( 13 | self, 14 | in_dim, 15 | out_dim, 16 | use_bn=False, 17 | nlayers=3, 18 | hidden_dim=2048, 19 | bottleneck_dim=256, 20 | mlp_bias=True, 21 | ): 22 | super().__init__() 23 | nlayers = max(nlayers, 1) 24 | self.mlp = _build_mlp( 25 | nlayers, 26 | in_dim, 27 | bottleneck_dim, 28 | hidden_dim=hidden_dim, 29 | use_bn=use_bn, 30 | bias=mlp_bias, 31 | ) 32 | self.last_layer = nn.Linear(bottleneck_dim, out_dim, bias=False) 33 | 34 | def init_weights(self) -> None: 35 | self.apply(self._init_weights) 36 | 37 | def _init_weights(self, m): 38 | if isinstance(m, nn.Linear): 39 | trunc_normal_(m.weight, std=0.02) 40 | if m.bias is not None: 41 | nn.init.constant_(m.bias, 0) 42 | 43 | def forward(self, x, no_last_layer=False, only_last_layer=False): 44 | if not only_last_layer: 45 | x = self.mlp(x) 46 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 47 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 48 | if not no_last_layer: 49 | x = self.last_layer(x) 50 | return x 51 | 52 | 53 | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): 54 | if nlayers == 1: 55 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 56 | else: 57 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 58 | if use_bn: 59 | layers.append(nn.BatchNorm1d(hidden_dim)) 60 | layers.append(nn.GELU()) 61 | for _ in range(nlayers - 2): 62 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 63 | if use_bn: 64 | layers.append(nn.BatchNorm1d(hidden_dim)) 65 | layers.append(nn.GELU()) 66 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 67 | return nn.Sequential(*layers) 68 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/ffn_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 
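# Annotation (not part of the original file): the SwiGLUFFN below computes
# w3(silu(w1(x)) * w2(x)). Since the gate w2 doubles the parameters per hidden
# unit, the hidden width is scaled to ~2/3 of the requested size and rounded
# up to a multiple of `align_to`:
#
#   d = int(hidden_features * 2 / 3)
#   swiglu_hidden_features = d + (-d % align_to)
#   # e.g. hidden_features=1024, align_to=8: d = 682, 682 + 6 = 688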
5 | 6 | from typing import Callable, List, Optional 7 | 8 | import torch.nn.functional as F 9 | from torch import Tensor, nn 10 | 11 | from ..utils import cat_keep_shapes, uncat_with_shapes 12 | 13 | 14 | class ListForwardMixin(object): 15 | def forward(self, x: Tensor): 16 | raise NotImplementedError 17 | 18 | def forward_list(self, x_list: List[Tensor]) -> List[Tensor]: 19 | x_flat, shapes, num_tokens = cat_keep_shapes(x_list) 20 | x_flat = self.forward(x_flat) 21 | return uncat_with_shapes(x_flat, shapes, num_tokens) 22 | 23 | 24 | class Mlp(nn.Module, ListForwardMixin): 25 | def __init__( 26 | self, 27 | in_features: int, 28 | hidden_features: Optional[int] = None, 29 | out_features: Optional[int] = None, 30 | act_layer: Callable[..., nn.Module] = nn.GELU, 31 | drop: float = 0.0, 32 | bias: bool = True, 33 | device=None, 34 | ) -> None: 35 | super().__init__() 36 | out_features = out_features or in_features 37 | hidden_features = hidden_features or in_features 38 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, device=device) 39 | self.act = act_layer() 40 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, device=device) 41 | self.drop = nn.Dropout(drop) 42 | 43 | def forward(self, x: Tensor) -> Tensor: 44 | x = self.fc1(x) 45 | x = self.act(x) 46 | x = self.drop(x) 47 | x = self.fc2(x) 48 | x = self.drop(x) 49 | return x 50 | 51 | 52 | class SwiGLUFFN(nn.Module, ListForwardMixin): 53 | def __init__( 54 | self, 55 | in_features: int, 56 | hidden_features: Optional[int] = None, 57 | out_features: Optional[int] = None, 58 | act_layer: Optional[Callable[..., nn.Module]] = None, 59 | drop: float = 0.0, 60 | bias: bool = True, 61 | align_to: int = 8, 62 | device=None, 63 | ) -> None: 64 | super().__init__() 65 | out_features = out_features or in_features 66 | hidden_features = hidden_features or in_features 67 | d = int(hidden_features * 2 / 3) 68 | swiglu_hidden_features = d + (-d % align_to) 69 | self.w1 = nn.Linear(in_features, swiglu_hidden_features, bias=bias, device=device) 70 | self.w2 = nn.Linear(in_features, swiglu_hidden_features, bias=bias, device=device) 71 | self.w3 = nn.Linear(swiglu_hidden_features, out_features, bias=bias, device=device) 72 | 73 | def forward(self, x: Tensor) -> Tensor: 74 | x1 = self.w1(x) 75 | x2 = self.w2(x) 76 | hidden = F.silu(x1) * x2 77 | return self.w3(hidden) 78 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/fp8_linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import re 7 | 8 | import torch 9 | 10 | from ..layers.attention import LinearKMaskedBias 11 | from ..utils import named_replace 12 | 13 | # avoid division by zero when calculating scale 14 | EPS = 1e-12 15 | 16 | 17 | def scale(t, amax_t): 18 | max_v = torch.finfo(torch.float8_e4m3fn).max 19 | scale_t = torch.clamp(amax_t.float(), min=EPS) / max_v 20 | t_fp8 = (t / scale_t).to(torch.float8_e4m3fn) 21 | return t_fp8, scale_t 22 | 23 | 24 | def matmul(first, amax_first, second_t, amax_second_t, bias): 25 | first_fp8, scale_first = scale(first, amax_first) 26 | second_t_fp8, scale_second_t = scale(second_t, amax_second_t) 27 | # PyTorch's row-wise scaled matmul kernel is based on CUTLASS and is quite 28 | # slow. 
Hence we fall back to an "unscaled" matmul, which uses cuBLAS, and 29 | # apply the scale manually afterwards. 30 | output = torch._scaled_mm( 31 | first_fp8, 32 | second_t_fp8.t(), 33 | scale_a=scale_first.new_ones((1, 1)), 34 | scale_b=scale_second_t.t().new_ones((1, 1)), 35 | bias=None, 36 | out_dtype=torch.bfloat16, 37 | use_fast_accum=False, 38 | ) 39 | output = (output * scale_first * scale_second_t.t()).to(torch.bfloat16) 40 | if bias is not None: 41 | output = output + bias 42 | return output 43 | 44 | 45 | @torch.compiler.allow_in_graph 46 | class Fp8LinearFn(torch.autograd.Function): 47 | @staticmethod 48 | def forward(ctx, a, b_t, bias): 49 | amax_a = a.abs().amax(dim=-1, keepdim=True) 50 | amax_b_t = b_t.abs().amax(dim=-1, keepdim=True) 51 | out = matmul(a, amax_a, b_t, amax_b_t, bias) 52 | 53 | ctx.a_requires_grad = a.requires_grad 54 | ctx.b_requires_grad = b_t.requires_grad 55 | ctx.bias_requires_grad = bias.requires_grad if bias is not None else False 56 | 57 | ctx.save_for_backward(a, b_t, amax_b_t.max()) 58 | 59 | return out 60 | 61 | @staticmethod 62 | def backward(ctx, grad_out): 63 | a, b_t, amax_b = ctx.saved_tensors 64 | 65 | if ctx.a_requires_grad: 66 | b = b_t.t().contiguous() 67 | amax_grad_out = grad_out.abs().amax(dim=-1, keepdim=True) 68 | amax_b = amax_b.repeat(b.shape[0], 1) 69 | grad_a = matmul(grad_out, amax_grad_out, b, amax_b, None) 70 | else: 71 | grad_a = None 72 | if ctx.b_requires_grad: 73 | grad_b = grad_out.t() @ a 74 | else: 75 | grad_b = None 76 | if ctx.bias_requires_grad: 77 | grad_bias = grad_out.sum(dim=0) 78 | else: 79 | grad_bias = None 80 | 81 | return grad_a, grad_b, grad_bias 82 | 83 | 84 | class Fp8Linear(torch.nn.Linear): 85 | def forward(self, input: torch.Tensor) -> torch.Tensor: 86 | out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, self.bias) 87 | out = out.unflatten(0, input.shape[:-1]) 88 | return out 89 | 90 | 91 | class Fp8LinearKMaskedBias(LinearKMaskedBias): 92 | def forward(self, input: torch.Tensor) -> torch.Tensor: 93 | masked_bias = self.bias * self.bias_mask if self.bias is not None else None 94 | out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, masked_bias) 95 | out = out.unflatten(0, input.shape[:-1]) 96 | return out 97 | 98 | 99 | def convert_linears_to_fp8(root_module: torch.nn.Module, *, filter: str) -> torch.nn.Module: 100 | filter_re = re.compile(filter) 101 | total_count = 0 102 | 103 | def replace(module: torch.nn.Module, name: str) -> torch.nn.Module: 104 | nonlocal total_count 105 | if not isinstance(module, torch.nn.Linear) or not filter_re.search(name): 106 | return module 107 | if type(module) == torch.nn.Linear: 108 | new_cls = Fp8Linear 109 | elif type(module) == LinearKMaskedBias: 110 | new_cls = Fp8LinearKMaskedBias 111 | else: 112 | assert False, str(type(module)) 113 | if module.in_features % 64 != 0 or module.out_features % 64 != 0: 114 | # This is not a strict requirement, but H100 TensorCores for fp8 115 | # operate on tiles of 64 elements anyways, and Inductor sometimes 116 | # pads inner dims to become multiples of 64. Also, if one day we 117 | # switch back to cuBLAS, it artificially requires dims to be 118 | # multiples of 16. 
119 | raise RuntimeError( 120 | "fp8 requires all dimensions to be multiples of 64 " "(consider using ffn_layer=swiglu64 or higher)" 121 | ) 122 | new_module = new_cls( 123 | in_features=module.in_features, 124 | out_features=module.out_features, 125 | bias=module.bias is not None, 126 | dtype=module.weight.dtype, 127 | device=module.weight.device, 128 | ) 129 | new_module.weight = module.weight 130 | new_module.bias = module.bias 131 | total_count += 1 132 | return new_module 133 | 134 | out = named_replace(replace, root_module) 135 | assert total_count > 0, "fp8: no layer found to convert" 136 | # Force re-compile everything 137 | torch._dynamo.reset_code_caches() 138 | from torch._inductor.cudagraph_trees import reset_cudagraph_trees 139 | 140 | reset_cudagraph_trees() 141 | return out 142 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | from typing import Union 7 | 8 | import torch 9 | from torch import Tensor, nn 10 | 11 | 12 | class LayerScale(nn.Module): 13 | def __init__( 14 | self, 15 | dim: int, 16 | init_values: Union[float, Tensor] = 1e-5, 17 | inplace: bool = False, 18 | device=None, 19 | ) -> None: 20 | super().__init__() 21 | self.inplace = inplace 22 | self.gamma = nn.Parameter(torch.empty(dim, device=device)) 23 | self.init_values = init_values 24 | 25 | def reset_parameters(self): 26 | nn.init.constant_(self.gamma, self.init_values) 27 | 28 | def forward(self, x: Tensor) -> Tensor: 29 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 30 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import math 7 | from typing import Callable, Tuple, Union 8 | 9 | from torch import Tensor, nn 10 | 11 | 12 | def make_2tuple(x): 13 | if isinstance(x, tuple): 14 | assert len(x) == 2 15 | return x 16 | 17 | assert isinstance(x, int) 18 | return (x, x) 19 | 20 | 21 | class PatchEmbed(nn.Module): 22 | """ 23 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 24 | 25 | Args: 26 | img_size: Image size. 27 | patch_size: Patch token size. 28 | in_chans: Number of input image channels. 29 | embed_dim: Number of linear projection output channels. 30 | norm_layer: Normalization layer. 
31 | """ 32 | 33 | def __init__( 34 | self, 35 | img_size: Union[int, Tuple[int, int]] = 224, 36 | patch_size: Union[int, Tuple[int, int]] = 16, 37 | in_chans: int = 3, 38 | embed_dim: int = 768, 39 | norm_layer: Callable | None = None, 40 | flatten_embedding: bool = True, 41 | ) -> None: 42 | super().__init__() 43 | 44 | image_HW = make_2tuple(img_size) 45 | patch_HW = make_2tuple(patch_size) 46 | patch_grid_size = ( 47 | image_HW[0] // patch_HW[0], 48 | image_HW[1] // patch_HW[1], 49 | ) 50 | 51 | self.img_size = image_HW 52 | self.patch_size = patch_HW 53 | self.patches_resolution = patch_grid_size 54 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 55 | 56 | self.in_chans = in_chans 57 | self.embed_dim = embed_dim 58 | 59 | self.flatten_embedding = flatten_embedding 60 | 61 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 62 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 63 | 64 | def forward(self, x: Tensor) -> Tensor: 65 | _, _, H, W = x.shape 66 | # patch_H, patch_W = self.patch_size 67 | # assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 68 | # assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 69 | 70 | x = self.proj(x) # B C H W 71 | H, W = x.size(2), x.size(3) 72 | x = x.flatten(2).transpose(1, 2) # B HW C 73 | x = self.norm(x) 74 | if not self.flatten_embedding: 75 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 76 | return x 77 | 78 | def flops(self) -> float: 79 | Ho, Wo = self.patches_resolution 80 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 81 | if self.norm is not None: 82 | flops += Ho * Wo * self.embed_dim 83 | return flops 84 | 85 | def reset_parameters(self): 86 | k = 1 / (self.in_chans * (self.patch_size[0] ** 2)) 87 | nn.init.uniform_(self.proj.weight, -math.sqrt(k), math.sqrt(k)) 88 | if self.proj.bias is not None: 89 | nn.init.uniform_(self.proj.bias, -math.sqrt(k), math.sqrt(k)) 90 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import torch 7 | from torch import Tensor, nn 8 | 9 | 10 | class RMSNorm(nn.Module): 11 | def __init__(self, dim: int, eps: float = 1e-5): 12 | super().__init__() 13 | self.weight = nn.Parameter(torch.ones(dim)) 14 | self.eps = eps 15 | 16 | def reset_parameters(self) -> None: 17 | nn.init.constant_(self.weight, 1) 18 | 19 | def _norm(self, x: Tensor) -> Tensor: 20 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 21 | 22 | def forward(self, x: Tensor) -> Tensor: 23 | output = self._norm(x.float()).type_as(x) 24 | return output * self.weight 25 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/sparse_linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 
5 | 6 | import logging 7 | from typing import Callable 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import xformers.ops as xops 13 | 14 | from ..utils import named_apply, named_replace 15 | 16 | logger = logging.getLogger("dinov3") 17 | 18 | 19 | class LinearW24(torch.nn.Linear): 20 | ALGO = "largest_abs_values_greedy" 21 | 22 | def __init__(self, *args, **kwargs) -> None: 23 | super().__init__(*args, **kwargs) 24 | self.sparsity_enabled = False 25 | 26 | def forward(self, input: torch.Tensor) -> torch.Tensor: 27 | if not self.sparsity_enabled: 28 | return super().forward(input) 29 | 30 | input_shape = input.shape 31 | input = input.flatten(end_dim=-2) 32 | dim0 = input.shape[0] 33 | if dim0 % 8 != 0: 34 | # NOTE: This should be torch-compiled away 35 | input = F.pad(input, [0, 0, 0, -dim0 % 8]) 36 | w_sparse = xops.sparsify24( 37 | self.weight, 38 | algo=self.ALGO, 39 | gradient="ste", 40 | backend="cusparselt", 41 | ) 42 | return F.linear(input, w_sparse, self.bias,)[ 43 | :dim0 44 | ].unflatten(dim=0, sizes=input_shape[:-1]) 45 | 46 | 47 | def replace_linears_with_sparse_linear(root_module: nn.Module, *, filter_fn: Callable[[str], bool]) -> nn.Module: 48 | total_count = 0 49 | 50 | def replace(module: nn.Module, name: str) -> nn.Module: 51 | nonlocal total_count 52 | if not isinstance(module, nn.Linear) or not filter_fn(name): 53 | return module 54 | assert type(module) == nn.Linear, "Subtypes not supported" 55 | new_module = LinearW24( 56 | in_features=module.in_features, 57 | out_features=module.out_features, 58 | bias=module.bias is not None, 59 | dtype=module.weight.dtype, 60 | device=module.weight.device, 61 | ) 62 | new_module.weight = module.weight 63 | new_module.bias = module.bias 64 | total_count += 1 65 | return new_module 66 | 67 | out = named_replace(replace, root_module) 68 | assert total_count > 0, "2:4 sparsity: no layer found to sparsify" 69 | return out 70 | 71 | 72 | def update_24sparsity(root_module: nn.Module, enabled: bool) -> int: 73 | num_modified = 0 74 | 75 | def maybe_apply_sparsity(module: nn.Module, name: str) -> nn.Module: 76 | nonlocal num_modified 77 | if not isinstance(module, LinearW24): 78 | return module 79 | num_modified += 1 80 | module.sparsity_enabled = enabled 81 | logger.info(f"- {'' if module.sparsity_enabled else 'de'}sparsifying {name}") 82 | return module 83 | 84 | named_apply(maybe_apply_sparsity, root_module) 85 | # Force re-compile everything 86 | torch._dynamo.reset_code_caches() 87 | from torch._inductor.cudagraph_trees import reset_cudagraph_trees 88 | 89 | reset_cudagraph_trees() 90 | return num_modified 91 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | from .dtype import as_torch_dtype 7 | from .utils import ( 8 | cat_keep_shapes, 9 | count_parameters, 10 | fix_random_seeds, 11 | get_conda_env, 12 | get_sha, 13 | named_apply, 14 | named_replace, 15 | uncat_with_shapes, 16 | ) 17 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import os 7 | from enum import Enum 8 | from pathlib import Path 9 | from typing import Any, Dict, Optional 10 | 11 | 12 | class ClusterType(Enum): 13 | CW = "cw" 14 | 15 | 16 | def _guess_cluster_type() -> ClusterType: 17 | return ClusterType.CW 18 | 19 | 20 | def get_cluster_type( 21 | cluster_type: Optional[ClusterType] = None, 22 | ) -> Optional[ClusterType]: 23 | if cluster_type is None: 24 | return _guess_cluster_type() 25 | 26 | return cluster_type 27 | 28 | 29 | def get_slurm_account(cluster_type: Optional[ClusterType] = None) -> Optional[str]: 30 | cluster_type = get_cluster_type(cluster_type) 31 | if cluster_type is None: 32 | return None 33 | return { 34 | ClusterType.CW: "fair_amaia_cw_explore", 35 | }[cluster_type] 36 | 37 | 38 | def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: 39 | cluster_type = get_cluster_type(cluster_type) 40 | if cluster_type is None: 41 | return None 42 | 43 | CHECKPOINT_DIRNAMES = { 44 | ClusterType.CW: "", 45 | } 46 | return Path("/") / CHECKPOINT_DIRNAMES[cluster_type] 47 | 48 | 49 | def get_user_checkpoint_path( 50 | cluster_type: Optional[ClusterType] = None, 51 | ) -> Optional[Path]: 52 | checkpoint_path = get_checkpoint_path(cluster_type) 53 | if checkpoint_path is None: 54 | return None 55 | 56 | username = os.environ.get("USER") 57 | assert username is not None 58 | return checkpoint_path / username 59 | 60 | 61 | def get_slurm_qos(cluster_type: Optional[ClusterType] = None) -> Optional[str]: 62 | cluster_type = get_cluster_type(cluster_type) 63 | if cluster_type is None: 64 | return None 65 | 66 | return { 67 | ClusterType.CW: "explore", 68 | }.get(cluster_type) 69 | 70 | 71 | def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]: 72 | cluster_type = get_cluster_type(cluster_type) 73 | if cluster_type is None: 74 | return None 75 | 76 | SLURM_PARTITIONS = { 77 | ClusterType.CW: "learn", 78 | } 79 | return SLURM_PARTITIONS[cluster_type] 80 | 81 | 82 | def get_slurm_executor_parameters( 83 | nodes: int, 84 | num_gpus_per_node: int, 85 | cluster_type: Optional[ClusterType] = None, 86 | **kwargs, 87 | ) -> Dict[str, Any]: 88 | # create default parameters 89 | params = { 90 | "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html 91 | "gpus_per_node": num_gpus_per_node, 92 | "tasks_per_node": num_gpus_per_node, # one task per GPU 93 | "cpus_per_task": 10, 94 | "nodes": nodes, 95 | "slurm_partition": get_slurm_partition(cluster_type), 96 | } 97 | # apply cluster-specific adjustments 98 | cluster_type = get_cluster_type(cluster_type) 99 | if cluster_type == ClusterType.CW: 100 | params["cpus_per_task"] = 16 101 | # set additional parameters / apply overrides 102 | params.update(kwargs) 103 | return params 104 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/custom_callable.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 
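# Usage sketch (annotation; the path and symbol below are hypothetical):
#
#   fn = load_custom_callable("plugins/my_backbone.py", "build_model")
#   model = fn(num_classes=80)
#
# The helper resolves the file, temporarily prepends its directory to
# sys.path, imports the module by filename (reloading if a module of the same
# name was already imported from elsewhere), and returns the named attribute
# -- see the implementation just below.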
5 | 
6 | import contextlib
7 | import importlib
8 | import inspect
9 | import os
10 | import sys
11 | from pathlib import Path
12 | 
13 | 
14 | @contextlib.contextmanager
15 | def _load_modules_from_dir(dir_: str):
16 |     sys.path.insert(0, dir_)
17 |     yield
18 |     sys.path.pop(0)
19 | 
20 | 
21 | def load_custom_callable(module_path: str | Path, callable_name: str):
22 |     module_full_path = os.path.realpath(module_path)
23 |     assert os.path.exists(module_full_path), f"module {module_full_path} does not exist"
24 |     module_dir, module_filename = os.path.split(module_full_path)
25 |     module_name, _ = os.path.splitext(module_filename)
26 | 
27 |     with _load_modules_from_dir(module_dir):
28 |         module = importlib.import_module(module_name)
29 |         if inspect.getfile(module) != module_full_path:
30 |             importlib.reload(module)
31 |         callable_ = getattr(module, callable_name)
32 | 
33 |     return callable_
34 | 
35 | 
36 | @contextlib.contextmanager
37 | def change_working_dir_and_pythonpath(new_dir):
38 |     old_dir = Path.cwd()
39 |     new_dir = Path(new_dir).expanduser().resolve().as_posix()
40 |     old_pythonpath = sys.path.copy()
41 |     sys.path.insert(0, new_dir)
42 |     os.chdir(new_dir)
43 |     try:
44 |         yield
45 |     finally:
46 |         os.chdir(old_dir)
47 |         sys.path = old_pythonpath
48 | 
-------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/dtype.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This software may be used and distributed in accordance with
4 | # the terms of the DINOv3 License Agreement.
5 | 
6 | from typing import Dict, Union
7 | 
8 | import numpy as np
9 | import torch
10 | 
11 | TypeSpec = Union[str, np.dtype, torch.dtype]
12 | 
13 | 
14 | _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
15 |     np.dtype("bool"): torch.bool,
16 |     np.dtype("uint8"): torch.uint8,
17 |     np.dtype("int8"): torch.int8,
18 |     np.dtype("int16"): torch.int16,
19 |     np.dtype("int32"): torch.int32,
20 |     np.dtype("int64"): torch.int64,
21 |     np.dtype("float16"): torch.float16,
22 |     np.dtype("float32"): torch.float32,
23 |     np.dtype("float64"): torch.float64,
24 |     np.dtype("complex64"): torch.complex64,
25 |     np.dtype("complex128"): torch.complex128,
26 | }
27 | 
28 | 
29 | def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
30 |     if isinstance(dtype, torch.dtype):
31 |         return dtype
32 |     if isinstance(dtype, str):
33 |         dtype = np.dtype(dtype)
34 |     assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}"
35 |     return _NUMPY_TO_TORCH_DTYPE[dtype]
36 | 
-------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/utils.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This software may be used and distributed in accordance with
4 | # the terms of the DINOv3 License Agreement.
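# Annotation: cat_keep_shapes / uncat_with_shapes below form a lossless round
# trip -- a list of token tensors is flattened into one (N_total, C) tensor
# plus bookkeeping, then split and reshaped back. For example:
#
#   import torch
#   a, b = torch.rand(2, 5, 64), torch.rand(1, 9, 64)
#   flat, shapes, counts = cat_keep_shapes([a, b])    # flat: (19, 64)
#   a2, b2 = uncat_with_shapes(flat, shapes, counts)  # (2, 5, 64), (1, 9, 64)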
5 | 6 | import logging 7 | import os 8 | import random 9 | import subprocess 10 | from typing import Callable, List, Optional, Tuple 11 | 12 | import numpy as np 13 | import torch 14 | from torch import Tensor, nn 15 | 16 | logger = logging.getLogger("dinov3") 17 | 18 | 19 | def cat_keep_shapes(x_list: List[Tensor]) -> Tuple[Tensor, List[Tuple[int]], List[int]]: 20 | shapes = [x.shape for x in x_list] 21 | num_tokens = [x.select(dim=-1, index=0).numel() for x in x_list] 22 | flattened = torch.cat([x.flatten(0, -2) for x in x_list]) 23 | return flattened, shapes, num_tokens 24 | 25 | 26 | def uncat_with_shapes(flattened: Tensor, shapes: List[Tuple[int]], num_tokens: List[int]) -> List[Tensor]: 27 | outputs_splitted = torch.split_with_sizes(flattened, num_tokens, dim=0) 28 | shapes_adjusted = [shape[:-1] + torch.Size([flattened.shape[-1]]) for shape in shapes] 29 | outputs_reshaped = [o.reshape(shape) for o, shape in zip(outputs_splitted, shapes_adjusted)] 30 | return outputs_reshaped 31 | 32 | 33 | def named_replace( 34 | fn: Callable, 35 | module: nn.Module, 36 | name: str = "", 37 | depth_first: bool = True, 38 | include_root: bool = False, 39 | ) -> nn.Module: 40 | if not depth_first and include_root: 41 | module = fn(module=module, name=name) 42 | for child_name_o, child_module in list(module.named_children()): 43 | child_name = ".".join((name, child_name_o)) if name else child_name_o 44 | new_child = named_replace( 45 | fn=fn, 46 | module=child_module, 47 | name=child_name, 48 | depth_first=depth_first, 49 | include_root=True, 50 | ) 51 | setattr(module, child_name_o, new_child) 52 | 53 | if depth_first and include_root: 54 | module = fn(module=module, name=name) 55 | return module 56 | 57 | 58 | def named_apply( 59 | fn: Callable, 60 | module: nn.Module, 61 | name: str = "", 62 | depth_first: bool = True, 63 | include_root: bool = False, 64 | ) -> nn.Module: 65 | if not depth_first and include_root: 66 | fn(module=module, name=name) 67 | for child_name, child_module in module.named_children(): 68 | child_name = ".".join((name, child_name)) if name else child_name 69 | named_apply( 70 | fn=fn, 71 | module=child_module, 72 | name=child_name, 73 | depth_first=depth_first, 74 | include_root=True, 75 | ) 76 | if depth_first and include_root: 77 | fn(module=module, name=name) 78 | return module 79 | 80 | 81 | def fix_random_seeds(seed: int = 31): 82 | """ 83 | Fix random seeds. 
84 | """ 85 | torch.manual_seed(seed) 86 | torch.cuda.manual_seed_all(seed) 87 | np.random.seed(seed) 88 | random.seed(seed) 89 | 90 | 91 | def get_sha() -> str: 92 | cwd = os.path.dirname(os.path.abspath(__file__)) 93 | 94 | def _run(command): 95 | return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() 96 | 97 | sha = "N/A" 98 | diff = "clean" 99 | branch = "N/A" 100 | try: 101 | sha = _run(["git", "rev-parse", "HEAD"]) 102 | subprocess.check_output(["git", "diff"], cwd=cwd) 103 | diff = _run(["git", "diff-index", "HEAD"]) 104 | diff = "has uncommited changes" if diff else "clean" 105 | branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) 106 | except Exception: 107 | pass 108 | message = f"sha: {sha}, status: {diff}, branch: {branch}" 109 | return message 110 | 111 | 112 | def get_conda_env() -> Tuple[Optional[str], Optional[str]]: 113 | conda_env_name = os.environ.get("CONDA_DEFAULT_ENV") 114 | conda_env_path = os.environ.get("CONDA_PREFIX") 115 | return conda_env_name, conda_env_path 116 | 117 | 118 | def count_parameters(module: nn.Module) -> int: 119 | c = 0 120 | for m in module.parameters(): 121 | c += m.nelement() 122 | return c 123 | 124 | 125 | def has_batchnorms(model: nn.Module) -> bool: 126 | bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm) 127 | for _, module in model.named_modules(): 128 | if isinstance(module, bn_types): 129 | return True 130 | return False 131 | -------------------------------------------------------------------------------- /engine/backbone/test_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from collections import OrderedDict 6 | 7 | 8 | from ..core import register 9 | 10 | 11 | class BasicBlock(nn.Module): 12 | expansion = 1 13 | 14 | def __init__(self, in_planes, planes, stride=1): 15 | super(BasicBlock, self).__init__() 16 | 17 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 18 | self.bn1 = nn.BatchNorm2d(planes) 19 | 20 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,stride=1, padding=1, bias=False) 21 | self.bn2 = nn.BatchNorm2d(planes) 22 | 23 | self.shortcut = nn.Sequential() 24 | if stride != 1 or in_planes != self.expansion*planes: 25 | self.shortcut = nn.Sequential( 26 | nn.Conv2d(in_planes, self.expansion*planes,kernel_size=1, stride=stride, bias=False), 27 | nn.BatchNorm2d(self.expansion*planes) 28 | ) 29 | def forward(self, x): 30 | out = F.relu(self.bn1(self.conv1(x))) 31 | out = self.bn2(self.conv2(out)) 32 | out += self.shortcut(x) 33 | out = F.relu(out) 34 | return out 35 | 36 | 37 | 38 | class _ResNet(nn.Module): 39 | def __init__(self, block, num_blocks, num_classes=10): 40 | super().__init__() 41 | self.in_planes = 64 42 | 43 | self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 44 | self.bn1 = nn.BatchNorm2d(64) 45 | 46 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 47 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 48 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 49 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 50 | 51 | self.linear = nn.Linear(512 * block.expansion, num_classes) 52 | 53 | def _make_layer(self, block, planes, num_blocks, stride): 54 | strides = [stride] + [1]*(num_blocks-1) 55 | layers = [] 56 | for stride in strides: 57 | layers.append(block(self.in_planes, 
planes, stride)) 58 | self.in_planes = planes * block.expansion 59 | return nn.Sequential(*layers) 60 | 61 | def forward(self, x): 62 | out = F.relu(self.bn1(self.conv1(x))) 63 | out = self.layer1(out) 64 | out = self.layer2(out) 65 | out = self.layer3(out) 66 | out = self.layer4(out) 67 | out = F.avg_pool2d(out, 4) 68 | out = out.view(out.size(0), -1) 69 | out = self.linear(out) 70 | return out 71 | 72 | 73 | @register() 74 | class MResNet(nn.Module): 75 | def __init__(self, num_classes=10, num_blocks=[2, 2, 2, 2]) -> None: 76 | super().__init__() 77 | self.model = _ResNet(BasicBlock, num_blocks, num_classes) 78 | 79 | def forward(self, x): 80 | return self.model(x) 81 | -------------------------------------------------------------------------------- /engine/backbone/timm_model.py: -------------------------------------------------------------------------------- 1 | """Copyright(c) 2023 lyuwenyu. All Rights Reserved. 2 | 3 | https://towardsdatascience.com/getting-started-with-pytorch-image-models-timm-a-practitioners-guide-4e77b4bf9055#0583 4 | """ 5 | import torch 6 | from torchvision.models.feature_extraction import get_graph_node_names, create_feature_extractor 7 | 8 | from .utils import IntermediateLayerGetter 9 | from ..core import register 10 | 11 | 12 | @register() 13 | class TimmModel(torch.nn.Module): 14 | def __init__(self, \ 15 | name, 16 | return_layers, 17 | pretrained=False, 18 | exportable=True, 19 | features_only=True, 20 | **kwargs) -> None: 21 | 22 | super().__init__() 23 | 24 | import timm 25 | model = timm.create_model( 26 | name, 27 | pretrained=pretrained, 28 | exportable=exportable, 29 | features_only=features_only, 30 | **kwargs 31 | ) 32 | # nodes, _ = get_graph_node_names(model) 33 | # print(nodes) 34 | # features = {'': ''} 35 | # model = create_feature_extractor(model, return_nodes=features) 36 | 37 | assert set(return_layers).issubset(model.feature_info.module_name()), \ 38 | f'return_layers should be a subset of {model.feature_info.module_name()}' 39 | 40 | # self.model = model 41 | self.model = IntermediateLayerGetter(model, return_layers) 42 | 43 | return_idx = [model.feature_info.module_name().index(name) for name in return_layers] 44 | self.strides = [model.feature_info.reduction()[i] for i in return_idx] 45 | self.channels = [model.feature_info.channels()[i] for i in return_idx] 46 | self.return_idx = return_idx 47 | self.return_layers = return_layers 48 | 49 | def forward(self, x: torch.Tensor): 50 | outputs = self.model(x) 51 | # outputs = [outputs[i] for i in self.return_idx] 52 | return outputs 53 | 54 | 55 | if __name__ == '__main__': 56 | 57 | model = TimmModel(name='resnet34', return_layers=['layer2', 'layer3']) 58 | data = torch.rand(1, 3, 640, 640) 59 | outputs = model(data) 60 | 61 | for output in outputs: 62 | print(output.shape) 63 | 64 | """ 65 | model: 66 | type: TimmModel 67 | name: resnet34 68 | return_layers: ['layer2', 'layer4'] 69 | """ 70 | -------------------------------------------------------------------------------- /engine/backbone/torchvision_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | import torch 7 | import torchvision 8 | 9 | from ..core import register 10 | from .utils import IntermediateLayerGetter 11 | 12 | __all__ = ['TorchVisionModel'] 13 | 14 | @register() 15 | class TorchVisionModel(torch.nn.Module): 16 | def __init__(self, name, return_layers, weights=None, **kwargs) -> None: 17 | super().__init__() 18 | 19 | if weights is not None: 20 | weights = getattr(torchvision.models.get_model_weights(name), weights) 21 | 22 | model = torchvision.models.get_model(name, weights=weights, **kwargs) 23 | 24 | if hasattr(model, 'features'): 25 | model = IntermediateLayerGetter(model.features, return_layers) 26 | else: 27 | model = IntermediateLayerGetter(model, return_layers) 28 | 29 | self.model = model 30 | 31 | def forward(self, x): 32 | return self.model(x) 33 | 34 | 35 | # TorchVisionModel('swin_t', return_layers=['5', '7']) 36 | # TorchVisionModel('resnet34', return_layers=['layer2','layer3', 'layer4']) 37 | 38 | # TorchVisionModel: 39 | # name: swin_t 40 | # return_layers: ['5', '7'] 41 | # weights: DEFAULT 42 | 43 | 44 | # model: 45 | # type: TorchVisionModel 46 | # name: resnet34 47 | # return_layers: ['layer2','layer3', 'layer4'] 48 | # weights: DEFAULT 49 | -------------------------------------------------------------------------------- /engine/backbone/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | https://github.com/pytorch/vision/blob/main/torchvision/models/_utils.py 3 | 4 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 5 | """ 6 | 7 | from collections import OrderedDict 8 | from typing import Dict, List 9 | 10 | 11 | import torch.nn as nn 12 | 13 | 14 | class IntermediateLayerGetter(nn.ModuleDict): 15 | """ 16 | Module wrapper that returns intermediate layers from a model 17 | 18 | It has a strong assumption that the modules have been registered 19 | into the model in the same order as they are used. 20 | This means that one should **not** reuse the same nn.Module 21 | twice in the forward if you want this to work. 22 | 23 | Additionally, it is only able to query submodules that are directly 24 | assigned to the model. So if `model` is passed, `model.feature1` can 25 | be returned, but not `model.feature1.layer2`. 26 | """ 27 | 28 | _version = 3 29 | 30 | def __init__(self, model: nn.Module, return_layers: List[str]) -> None: 31 | if not set(return_layers).issubset([name for name, _ in model.named_children()]): 32 | raise ValueError("return_layers are not present in model. {}"\ 33 | .format([name for name, _ in model.named_children()])) 34 | orig_return_layers = return_layers 35 | return_layers = {str(k): str(k) for k in return_layers} 36 | layers = OrderedDict() 37 | for name, module in model.named_children(): 38 | layers[name] = module 39 | if name in return_layers: 40 | del return_layers[name] 41 | if not return_layers: 42 | break 43 | 44 | super().__init__(layers) 45 | self.return_layers = orig_return_layers 46 | 47 | def forward(self, x): 48 | outputs = [] 49 | for name, module in self.items(): 50 | x = module(x) 51 | if name in self.return_layers: 52 | outputs.append(x) 53 | 54 | return outputs 55 | -------------------------------------------------------------------------------- /engine/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | from .workspace import GLOBAL_CONFIG, register, create 7 | from .yaml_utils import * 8 | from ._config import BaseConfig 9 | from .yaml_config import YAMLConfig 10 | -------------------------------------------------------------------------------- /engine/core/yaml_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import os 7 | import copy 8 | import yaml 9 | from typing import Any, Dict, Optional, List 10 | 11 | from .workspace import GLOBAL_CONFIG 12 | 13 | __all__ = [ 14 | 'load_config', 15 | 'merge_config', 16 | 'merge_dict', 17 | 'parse_cli', 18 | ] 19 | 20 | 21 | INCLUDE_KEY = '__include__' 22 | 23 | 24 | def load_config(file_path, cfg=dict()): 25 | """load config 26 | """ 27 | _, ext = os.path.splitext(file_path) 28 | assert ext in ['.yml', '.yaml'], "only support yaml files" 29 | 30 | with open(file_path) as f: 31 | file_cfg = yaml.load(f, Loader=yaml.Loader) 32 | if file_cfg is None: 33 | return {} 34 | 35 | if INCLUDE_KEY in file_cfg: 36 | base_yamls = list(file_cfg[INCLUDE_KEY]) 37 | for base_yaml in base_yamls: 38 | if base_yaml.startswith('~'): 39 | base_yaml = os.path.expanduser(base_yaml) 40 | 41 | if not base_yaml.startswith('/'): 42 | base_yaml = os.path.join(os.path.dirname(file_path), base_yaml) 43 | 44 | with open(base_yaml) as f: 45 | base_cfg = load_config(base_yaml, cfg) 46 | merge_dict(cfg, base_cfg) 47 | 48 | return merge_dict(cfg, file_cfg) 49 | 50 | 51 | def merge_dict(dct, another_dct, inplace=True) -> Dict: 52 | """merge another_dct into dct 53 | """ 54 | def _merge(dct, another) -> Dict: 55 | for k in another: 56 | if (k in dct and isinstance(dct[k], dict) and isinstance(another[k], dict)): 57 | _merge(dct[k], another[k]) 58 | else: 59 | dct[k] = another[k] 60 | 61 | return dct 62 | 63 | if not inplace: 64 | dct = copy.deepcopy(dct) 65 | 66 | return _merge(dct, another_dct) 67 | 68 | 69 | def dictify(s: str, v: Any) -> Dict: 70 | if '.' 
75 | 
76 | def parse_cli(nargs: List[str]) -> Dict:
77 | """
78 | parse command-line arguments
79 | convert `a.c=3 b=10` to `{'a': {'c': 3}, 'b': 10}`
80 | """
81 | cfg = {}
82 | if nargs is None or len(nargs) == 0:
83 | return cfg
84 | 
85 | for s in nargs:
86 | s = s.strip()
87 | k, v = s.split('=', 1)
88 | d = dictify(k, yaml.load(v, Loader=yaml.Loader))
89 | cfg = merge_dict(cfg, d)
90 | 
91 | return cfg
92 | 
93 | 
94 | 
95 | def merge_config(cfg, another_cfg=GLOBAL_CONFIG, inplace: bool=False, overwrite: bool=False):
96 | """
97 | Merge another_cfg into cfg, return the merged config
98 | 
99 | Example:
100 | 
101 | cfg1 = load_config('./dfine_r18vd_6x_coco.yml')
102 | cfg1 = merge_config(cfg1, inplace=True)
103 | 
104 | cfg2 = load_config('./dfine_r50vd_6x_coco.yml')
105 | cfg2 = merge_config(cfg2, inplace=True)
106 | 
107 | model1 = create(cfg1['model'], cfg1)
108 | model2 = create(cfg2['model'], cfg2)
109 | """
110 | def _merge(dct, another):
111 | for k in another:
112 | if k not in dct:
113 | dct[k] = another[k]
114 | 
115 | elif isinstance(dct[k], dict) and isinstance(another[k], dict):
116 | _merge(dct[k], another[k])
117 | 
118 | elif overwrite:
119 | dct[k] = another[k]
120 | 
121 | return cfg
122 | 
123 | if not inplace:
124 | cfg = copy.deepcopy(cfg)
125 | 
126 | return _merge(cfg, another_cfg)
127 | 
--------------------------------------------------------------------------------
/engine/data/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4 | """
5 | 
6 | from .dataset import *
7 | from .transforms import *
8 | from .dataloader import *
9 | 
10 | from ._misc import convert_to_tv_tensor
11 | 
12 | 
13 | 
14 | 
15 | # def set_epoch(self, epoch) -> None:
16 | #     self.epoch = epoch
17 | # def _set_epoch_func(datasets):
18 | #     """Add `set_epoch` for datasets
19 | #     """
20 | #     from ..core import register
21 | #     for ds in datasets:
22 | #         register(ds)(set_epoch)
23 | # _set_epoch_func([CIFAR10, VOCDetection, CocoDetection])
--------------------------------------------------------------------------------
/engine/data/_misc.py:
--------------------------------------------------------------------------------
1 | """
2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4 | """ 5 | 6 | import importlib.metadata 7 | from torch import Tensor 8 | 9 | if '0.15.2' in importlib.metadata.version('torchvision'): 10 | import torchvision 11 | torchvision.disable_beta_transforms_warning() 12 | 13 | from torchvision.datapoints import BoundingBox as BoundingBoxes 14 | from torchvision.datapoints import BoundingBoxFormat, Mask, Image, Video 15 | from torchvision.transforms.v2 import SanitizeBoundingBox as SanitizeBoundingBoxes 16 | _boxes_keys = ['format', 'spatial_size'] 17 | 18 | elif '0.17' > importlib.metadata.version('torchvision') >= '0.16': 19 | import torchvision 20 | torchvision.disable_beta_transforms_warning() 21 | 22 | from torchvision.transforms.v2 import SanitizeBoundingBoxes 23 | from torchvision.tv_tensors import ( 24 | BoundingBoxes, BoundingBoxFormat, Mask, Image, Video) 25 | _boxes_keys = ['format', 'canvas_size'] 26 | 27 | elif importlib.metadata.version('torchvision') >= '0.17': 28 | import torchvision 29 | from torchvision.transforms.v2 import SanitizeBoundingBoxes 30 | from torchvision.tv_tensors import ( 31 | BoundingBoxes, BoundingBoxFormat, Mask, Image, Video) 32 | _boxes_keys = ['format', 'canvas_size'] 33 | 34 | else: 35 | raise RuntimeError('Please make sure torchvision version >= 0.15.2') 36 | 37 | 38 | 39 | def convert_to_tv_tensor(tensor: Tensor, key: str, box_format='xyxy', spatial_size=None) -> Tensor: 40 | """ 41 | Args: 42 | tensor (Tensor): input tensor 43 | key (str): transform to key 44 | 45 | Return: 46 | Dict[str, TV_Tensor] 47 | """ 48 | assert key in ('boxes', 'masks', ), "Only support 'boxes' and 'masks'" 49 | 50 | if key == 'boxes': 51 | box_format = getattr(BoundingBoxFormat, box_format.upper()) 52 | _kwargs = dict(zip(_boxes_keys, [box_format, spatial_size])) 53 | return BoundingBoxes(tensor, **_kwargs) 54 | 55 | if key == 'masks': 56 | return Mask(tensor) 57 | -------------------------------------------------------------------------------- /engine/data/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | # from ._dataset import DetDataset 7 | from .coco_dataset import CocoDetection 8 | from .coco_dataset import ( 9 | mscoco_category2name, 10 | mscoco_category2label, 11 | mscoco_label2category, 12 | ) 13 | from .coco_eval import CocoEvaluator 14 | from .coco_utils import get_coco_api_from_dataset 15 | from .voc_detection import VOCDetection 16 | from .voc_eval import VOCEvaluator 17 | -------------------------------------------------------------------------------- /engine/data/dataset/_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | import torch 7 | import torch.utils.data as data 8 | 9 | class DetDataset(data.Dataset): 10 | def __getitem__(self, index): 11 | img, target = self.load_item(index) 12 | if self.transforms is not None: 13 | img, target, _ = self.transforms(img, target, self) 14 | return img, target 15 | 16 | def load_item(self, index): 17 | raise NotImplementedError("Please implement this function to return item before `transforms`.") 18 | 19 | def set_epoch(self, epoch) -> None: 20 | self._epoch = epoch 21 | 22 | @property 23 | def epoch(self): 24 | return self._epoch if hasattr(self, '_epoch') else -1 25 | -------------------------------------------------------------------------------- /engine/data/dataset/voc_detection.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | from sympy import im 7 | import torch 8 | import torchvision 9 | import torchvision.transforms.functional as TVF 10 | 11 | import os 12 | from PIL import Image 13 | from typing import Optional, Callable 14 | 15 | try: 16 | from defusedxml.ElementTree import parse as ET_parse 17 | except ImportError: 18 | from xml.etree.ElementTree import parse as ET_parse 19 | 20 | from ._dataset import DetDataset 21 | from .._misc import convert_to_tv_tensor 22 | from ...core import register 23 | 24 | @register() 25 | class VOCDetection(torchvision.datasets.VOCDetection, DetDataset): 26 | __inject__ = ['transforms', ] 27 | 28 | def __init__(self, root: str, ann_file: str = "trainval.txt", label_file: str = "label_list.txt", transforms: Optional[Callable] = None): 29 | 30 | with open(os.path.join(root, ann_file), 'r') as f: 31 | lines = [x.strip() for x in f.readlines()] 32 | lines = [x.split(' ') for x in lines] 33 | 34 | self.images = [os.path.join(root, lin[0]) for lin in lines] 35 | self.targets = [os.path.join(root, lin[1]) for lin in lines] 36 | assert len(self.images) == len(self.targets) 37 | 38 | with open(os.path.join(root + label_file), 'r') as f: 39 | labels = f.readlines() 40 | labels = [lab.strip() for lab in labels] 41 | 42 | self.transforms = transforms 43 | self.labels_map = {lab: i for i, lab in enumerate(labels)} 44 | 45 | def __getitem__(self, index: int): 46 | image, target = self.load_item(index) 47 | if self.transforms is not None: 48 | image, target, _ = self.transforms(image, target, self) 49 | # target["orig_size"] = torch.tensor(TVF.get_image_size(image)) 50 | return image, target 51 | 52 | def load_item(self, index: int): 53 | image = Image.open(self.images[index]).convert("RGB") 54 | target = self.parse_voc_xml(ET_parse(self.annotations[index]).getroot()) 55 | 56 | output = {} 57 | output["image_id"] = torch.tensor([index]) 58 | for k in ['area', 'boxes', 'labels', 'iscrowd']: 59 | output[k] = [] 60 | 61 | for blob in target['annotation']['object']: 62 | box = [float(v) for v in blob['bndbox'].values()] 63 | output["boxes"].append(box) 64 | output["labels"].append(blob['name']) 65 | output["area"].append((box[2] - box[0]) * (box[3] - box[1])) 66 | output["iscrowd"].append(0) 67 | 68 | w, h = image.size 69 | boxes = torch.tensor(output["boxes"]) if len(output["boxes"]) > 0 else torch.zeros(0, 4) 70 | output['boxes'] = convert_to_tv_tensor(boxes, 'boxes', box_format='xyxy', spatial_size=[h, w]) 71 | output['labels'] = torch.tensor([self.labels_map[lab] for lab in output["labels"]]) 72 | output['area'] = torch.tensor(output['area']) 73 | 
output["iscrowd"] = torch.tensor(output["iscrowd"]) 74 | output["orig_size"] = torch.tensor([w, h]) 75 | 76 | return image, output 77 | -------------------------------------------------------------------------------- /engine/data/dataset/voc_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torchvision 8 | 9 | 10 | class VOCEvaluator(object): 11 | def __init__(self) -> None: 12 | pass 13 | -------------------------------------------------------------------------------- /engine/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | 7 | from ._transforms import ( 8 | EmptyTransform, 9 | RandomPhotometricDistort, 10 | RandomZoomOut, 11 | RandomIoUCrop, 12 | RandomHorizontalFlip, 13 | Resize, 14 | PadToSize, 15 | SanitizeBoundingBoxes, 16 | RandomCrop, 17 | Normalize, 18 | ConvertBoxes, 19 | ConvertPILImage, 20 | ) 21 | from .container import Compose 22 | from .mosaic import Mosaic -------------------------------------------------------------------------------- /engine/data/transforms/_transforms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | import torchvision 10 | import torchvision.transforms.v2 as T 11 | import torchvision.transforms.v2.functional as F 12 | 13 | import PIL 14 | import PIL.Image 15 | 16 | from typing import Any, Dict, List, Optional 17 | 18 | from .._misc import convert_to_tv_tensor, _boxes_keys 19 | from .._misc import Image, Video, Mask, BoundingBoxes 20 | from .._misc import SanitizeBoundingBoxes 21 | 22 | from ...core import register 23 | torchvision.disable_beta_transforms_warning() 24 | 25 | 26 | RandomPhotometricDistort = register()(T.RandomPhotometricDistort) 27 | RandomZoomOut = register()(T.RandomZoomOut) 28 | RandomHorizontalFlip = register()(T.RandomHorizontalFlip) 29 | Resize = register()(T.Resize) 30 | # ToImageTensor = register()(T.ToImageTensor) 31 | # ConvertDtype = register()(T.ConvertDtype) 32 | # PILToTensor = register()(T.PILToTensor) 33 | SanitizeBoundingBoxes = register(name='SanitizeBoundingBoxes')(SanitizeBoundingBoxes) 34 | RandomCrop = register()(T.RandomCrop) 35 | Normalize = register()(T.Normalize) 36 | 37 | 38 | @register() 39 | class EmptyTransform(T.Transform): 40 | def __init__(self, ) -> None: 41 | super().__init__() 42 | 43 | def forward(self, *inputs): 44 | inputs = inputs if len(inputs) > 1 else inputs[0] 45 | return inputs 46 | 47 | 48 | @register() 49 | class PadToSize(T.Pad): 50 | _transformed_types = ( 51 | PIL.Image.Image, 52 | Image, 53 | Video, 54 | Mask, 55 | BoundingBoxes, 56 | ) 57 | def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: 58 | sp = F.get_spatial_size(flat_inputs[0]) 59 | h, w = self.size[1] - sp[0], self.size[0] - sp[1] 60 | self.padding = [0, 0, w, h] 61 | return dict(padding=self.padding) 62 | 63 | def __init__(self, size, fill=0, padding_mode='constant') -> None: 64 | if isinstance(size, int): 65 | size = (size, size) 66 | self.size = size 67 | super().__init__(0, fill, padding_mode) 68 | 
69 | def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: 70 | fill = self._fill[type(inpt)] 71 | padding = params['padding'] 72 | return F.pad(inpt, padding=padding, fill=fill, padding_mode=self.padding_mode) # type: ignore[arg-type] 73 | 74 | def __call__(self, *inputs: Any) -> Any: 75 | outputs = super().forward(*inputs) 76 | if len(outputs) > 1 and isinstance(outputs[1], dict): 77 | outputs[1]['padding'] = torch.tensor(self.padding) 78 | return outputs 79 | 80 | 81 | @register() 82 | class RandomIoUCrop(T.RandomIoUCrop): 83 | def __init__(self, min_scale: float = 0.3, max_scale: float = 1, min_aspect_ratio: float = 0.5, max_aspect_ratio: float = 2, sampler_options: Optional[List[float]] = None, trials: int = 40, p: float = 1.0): 84 | super().__init__(min_scale, max_scale, min_aspect_ratio, max_aspect_ratio, sampler_options, trials) 85 | self.p = p 86 | 87 | def __call__(self, *inputs: Any) -> Any: 88 | if torch.rand(1) >= self.p: 89 | return inputs if len(inputs) > 1 else inputs[0] 90 | 91 | return super().forward(*inputs) 92 | 93 | 94 | @register() 95 | class ConvertBoxes(T.Transform): 96 | _transformed_types = ( 97 | BoundingBoxes, 98 | ) 99 | def __init__(self, fmt='', normalize=False) -> None: 100 | super().__init__() 101 | self.fmt = fmt 102 | self.normalize = normalize 103 | 104 | def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: 105 | spatial_size = getattr(inpt, _boxes_keys[1]) 106 | if self.fmt: 107 | in_fmt = inpt.format.value.lower() 108 | inpt = torchvision.ops.box_convert(inpt, in_fmt=in_fmt, out_fmt=self.fmt.lower()) 109 | inpt = convert_to_tv_tensor(inpt, key='boxes', box_format=self.fmt.upper(), spatial_size=spatial_size) 110 | 111 | if self.normalize: 112 | inpt = inpt / torch.tensor(spatial_size[::-1]).tile(2)[None] 113 | 114 | return inpt 115 | 116 | 117 | @register() 118 | class ConvertPILImage(T.Transform): 119 | _transformed_types = ( 120 | PIL.Image.Image, 121 | ) 122 | def __init__(self, dtype='float32', scale=True) -> None: 123 | super().__init__() 124 | self.dtype = dtype 125 | self.scale = scale 126 | 127 | def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: 128 | inpt = F.pil_to_tensor(inpt) 129 | if self.dtype == 'float32': 130 | inpt = inpt.float() 131 | 132 | if self.scale: 133 | inpt = inpt / 255. 134 | 135 | inpt = Image(inpt) 136 | 137 | return inpt 138 | -------------------------------------------------------------------------------- /engine/data/transforms/container.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIM: DETR with Improved Matching for Fast Convergence 3 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from D-FINE (https://github.com/Peterande/D-FINE) 6 | Copyright (c) 2024 D-FINE authors. All Rights Reserved. 
7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | import torchvision 13 | import torchvision.transforms.v2 as T 14 | 15 | from typing import Any, Dict, List, Optional 16 | 17 | from ._transforms import EmptyTransform 18 | from ...core import register, GLOBAL_CONFIG 19 | torchvision.disable_beta_transforms_warning() 20 | import random 21 | 22 | 23 | @register() 24 | class Compose(T.Compose): 25 | def __init__(self, ops, policy=None, mosaic_prob=-0.1) -> None: 26 | transforms = [] 27 | if ops is not None: 28 | for op in ops: 29 | if isinstance(op, dict): 30 | name = op.pop('type') 31 | transform = getattr(GLOBAL_CONFIG[name]['_pymodule'], GLOBAL_CONFIG[name]['_name'])(**op) 32 | transforms.append(transform) 33 | op['type'] = name 34 | print(" ### Transform @{} ### ".format(type(transform).__name__)) 35 | 36 | elif isinstance(op, nn.Module): 37 | transforms.append(op) 38 | 39 | else: 40 | raise ValueError('') 41 | else: 42 | transforms =[EmptyTransform(), ] 43 | 44 | super().__init__(transforms=transforms) 45 | 46 | self.mosaic_prob = mosaic_prob 47 | if policy is None: 48 | policy = {'name': 'default'} 49 | else: 50 | if self.mosaic_prob > 0: 51 | print(" ### Mosaic with Prob.@{} and ZoomOut/IoUCrop existed ### ".format(self.mosaic_prob)) 52 | print(" ### ImgTransforms Epochs: {} ### ".format(policy['epoch'])) 53 | print(' ### Policy_ops@{} ###'.format(policy['ops'])) 54 | self.global_samples = 0 55 | self.policy = policy 56 | 57 | def forward(self, *inputs: Any) -> Any: 58 | return self.get_forward(self.policy['name'])(*inputs) 59 | 60 | def get_forward(self, name): 61 | forwards = { 62 | 'default': self.default_forward, 63 | 'stop_epoch': self.stop_epoch_forward, 64 | 'stop_sample': self.stop_sample_forward, 65 | } 66 | return forwards[name] 67 | 68 | def default_forward(self, *inputs: Any) -> Any: 69 | sample = inputs if len(inputs) > 1 else inputs[0] 70 | for transform in self.transforms: 71 | sample = transform(sample) 72 | return sample 73 | 74 | def stop_epoch_forward(self, *inputs: Any): 75 | sample = inputs if len(inputs) > 1 else inputs[0] 76 | dataset = sample[-1] 77 | cur_epoch = dataset.epoch 78 | policy_ops = self.policy['ops'] 79 | policy_epoch = self.policy['epoch'] 80 | 81 | if isinstance(policy_epoch, list) and len(policy_epoch) == 3: # 4-stages 82 | if policy_epoch[0] <= cur_epoch < policy_epoch[1]: 83 | with_mosaic = random.random() <= self.mosaic_prob # Probility for Mosaic 84 | else: 85 | with_mosaic = False 86 | for transform in self.transforms: 87 | if (type(transform).__name__ in policy_ops and cur_epoch < policy_epoch[0]): # first stage: NoAug 88 | pass 89 | elif (type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch[-1]): # last stage: NoAug 90 | pass 91 | else: 92 | # Using Mosaic for [policy_epoch[0], policy_epoch[1]] with probability 93 | if (type(transform).__name__ == 'Mosaic' and not with_mosaic): 94 | pass 95 | # Mosaic and Zoomout/IoUCrop can not be co-existed in the same sample 96 | elif (type(transform).__name__ == 'RandomZoomOut' or type(transform).__name__ == 'RandomIoUCrop') and with_mosaic: 97 | pass 98 | else: 99 | sample = transform(sample) 100 | else: # the default data scheduler 101 | for transform in self.transforms: 102 | if type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch: 103 | pass 104 | else: 105 | sample = transform(sample) 106 | 107 | return sample 108 | 109 | 110 | def stop_sample_forward(self, *inputs: Any): 111 | sample = inputs if len(inputs) > 1 else inputs[0] 112 | dataset = 
sample[-1] 113 | 114 | cur_epoch = dataset.epoch 115 | policy_ops = self.policy['ops'] 116 | policy_sample = self.policy['sample'] 117 | 118 | for transform in self.transforms: 119 | if type(transform).__name__ in policy_ops and self.global_samples >= policy_sample: 120 | pass 121 | else: 122 | sample = transform(sample) 123 | 124 | self.global_samples += 1 125 | 126 | return sample 127 | -------------------------------------------------------------------------------- /engine/deim/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIM: DETR with Improved Matching for Fast Convergence 3 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 6 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 7 | """ 8 | 9 | 10 | from .deim import DEIM 11 | 12 | from .matcher import HungarianMatcher 13 | 14 | from .hybrid_encoder import HybridEncoder 15 | from .lite_encoder import LiteEncoder 16 | 17 | 18 | from .dfine_decoder import DFINETransformer 19 | from .rtdetrv2_decoder import RTDETRTransformerv2 20 | 21 | from .postprocessor import PostProcessor 22 | from .deim_criterion import DEIMCriterion 23 | from .deim_decoder import DEIMTransformer -------------------------------------------------------------------------------- /engine/deim/box_ops.py: -------------------------------------------------------------------------------- 1 | """ 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | https://github.com/facebookresearch/detr/blob/main/util/box_ops.py 4 | """ 5 | 6 | import torch 7 | from torch import Tensor 8 | from torchvision.ops.boxes import box_area 9 | 10 | 11 | def box_cxcywh_to_xyxy(x): 12 | x_c, y_c, w, h = x.unbind(-1) 13 | b = [(x_c - 0.5 * w.clamp(min=0.0)), (y_c - 0.5 * h.clamp(min=0.0)), 14 | (x_c + 0.5 * w.clamp(min=0.0)), (y_c + 0.5 * h.clamp(min=0.0))] 15 | return torch.stack(b, dim=-1) 16 | 17 | 18 | def box_xyxy_to_cxcywh(x: Tensor) -> Tensor: 19 | x0, y0, x1, y1 = x.unbind(-1) 20 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 21 | (x1 - x0), (y1 - y0)] 22 | return torch.stack(b, dim=-1) 23 | 24 | 25 | # modified from torchvision to also return the union 26 | def box_iou(boxes1: Tensor, boxes2: Tensor): 27 | area1 = box_area(boxes1) 28 | area2 = box_area(boxes2) 29 | 30 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 31 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 32 | 33 | wh = (rb - lt).clamp(min=0) # [N,M,2] 34 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 35 | 36 | union = area1[:, None] + area2 - inter 37 | 38 | iou = inter / union 39 | return iou, union 40 | 41 | 42 | def generalized_box_iou(boxes1, boxes2): 43 | """ 44 | Generalized IoU from https://giou.stanford.edu/ 45 | 46 | The boxes should be in [x0, y0, x1, y1] format 47 | 48 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 49 | and M = len(boxes2) 50 | """ 51 | # degenerate boxes gives inf / nan results 52 | # so do an early check 53 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 54 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 55 | iou, union = box_iou(boxes1, boxes2) 56 | 57 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 58 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 59 | 60 | wh = (rb - lt).clamp(min=0) # [N,M,2] 61 | area = wh[:, :, 0] * wh[:, :, 1] 62 | 63 | return iou - (area - union) / area 64 | 65 | 66 | def masks_to_boxes(masks): 67 
| """Compute the bounding boxes around the provided masks 68 | 69 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 70 | 71 | Returns a [N, 4] tensors, with the boxes in xyxy format 72 | """ 73 | if masks.numel() == 0: 74 | return torch.zeros((0, 4), device=masks.device) 75 | 76 | h, w = masks.shape[-2:] 77 | 78 | y = torch.arange(0, h, dtype=torch.float) 79 | x = torch.arange(0, w, dtype=torch.float) 80 | y, x = torch.meshgrid(y, x) 81 | 82 | x_mask = (masks * x.unsqueeze(0)) 83 | x_max = x_mask.flatten(1).max(-1)[0] 84 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 85 | 86 | y_mask = (masks * y.unsqueeze(0)) 87 | y_max = y_mask.flatten(1).max(-1)[0] 88 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 89 | 90 | return torch.stack([x_min, y_min, x_max, y_max], 1) -------------------------------------------------------------------------------- /engine/deim/deim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 3 | """ 4 | 5 | import torch.nn as nn 6 | from ..core import register 7 | 8 | 9 | __all__ = ['DEIM', ] 10 | 11 | 12 | @register() 13 | class DEIM(nn.Module): 14 | __inject__ = ['backbone', 'encoder', 'decoder', ] 15 | 16 | def __init__(self, \ 17 | backbone: nn.Module, 18 | encoder: nn.Module, 19 | decoder: nn.Module, 20 | ): 21 | super().__init__() 22 | self.backbone = backbone 23 | self.decoder = decoder 24 | self.encoder = encoder 25 | 26 | def forward(self, x, targets=None): 27 | x = self.backbone(x) 28 | x = self.encoder(x) 29 | x = self.decoder(x, targets) 30 | 31 | return x 32 | 33 | def deploy(self, ): 34 | self.eval() 35 | for m in self.modules(): 36 | if hasattr(m, 'convert_to_deploy'): 37 | m.convert_to_deploy() 38 | return self 39 | -------------------------------------------------------------------------------- /engine/deim/deim_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from .utils import get_activation, bias_init_with_prob 7 | 8 | 9 | class RMSNorm(nn.Module): 10 | def __init__(self, dim: int, eps: float = 1e-6): 11 | super().__init__() 12 | self.dim = dim 13 | self.eps = eps 14 | self.scale = nn.Parameter(torch.ones(dim)) 15 | 16 | def _norm(self, x): 17 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 18 | 19 | def forward(self, x): 20 | output = self._norm(x.float()).type_as(x) 21 | output = output * self.scale 22 | return output 23 | 24 | def extra_repr(self) -> str: 25 | return f'dim={self.dim}, eps={self.eps}' 26 | 27 | # default 3-layer MLP 28 | class MLP(nn.Module): 29 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers=3, act='relu'): 30 | super().__init__() 31 | self.num_layers = num_layers 32 | h = [hidden_dim] * (num_layers - 1) 33 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 34 | self.act = get_activation(act) 35 | 36 | def forward(self, x): 37 | for i, layer in enumerate(self.layers): 38 | x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x) 39 | return x 40 | 41 | # Taken from: https://github.com/facebookresearch/dinov2/blob/main/dinov2/layers/swiglu_ffn.py#L14-L34 42 | class SwiGLUFFN(nn.Module): 43 | def __init__( 44 | self, 45 | in_features: int, 46 | hidden_features: int, 47 | 
out_features: int, 48 | bias: bool = True, 49 | ) -> None: 50 | super().__init__() 51 | out_features = out_features or in_features 52 | hidden_features = hidden_features or in_features 53 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 54 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 55 | self._reset_parameters() 56 | 57 | def _reset_parameters(self): 58 | init.xavier_uniform_(self.w12.weight) 59 | init.constant_(self.w12.bias, 0) 60 | init.xavier_uniform_(self.w3.weight) 61 | init.constant_(self.w3.bias, 0) 62 | 63 | def forward(self, x): 64 | x12 = self.w12(x) 65 | x1, x2 = x12.chunk(2, dim=-1) 66 | hidden = F.silu(x1) * x2 67 | return self.w3(hidden) 68 | 69 | 70 | class Gate(nn.Module): 71 | def __init__(self, d_model, use_rmsnorm=False): 72 | super(Gate, self).__init__() 73 | self.gate = nn.Linear(2 * d_model, 2 * d_model) 74 | bias = bias_init_with_prob(0.5) 75 | init.constant_(self.gate.bias, bias) 76 | init.constant_(self.gate.weight, 0) 77 | self.norm = RMSNorm(d_model) if use_rmsnorm else nn.LayerNorm(d_model) 78 | 79 | def forward(self, x1, x2): 80 | gate_input = torch.cat([x1, x2], dim=-1) 81 | gates = torch.sigmoid(self.gate(gate_input)) 82 | gate1, gate2 = gates.chunk(2, dim=-1) 83 | return self.norm(gate1 * x1 + gate2 * x2) -------------------------------------------------------------------------------- /engine/deim/denoising.py: -------------------------------------------------------------------------------- 1 | """Copyright(c) 2023 lyuwenyu. All Rights Reserved. 2 | Modifications Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 3 | """ 4 | 5 | import torch 6 | 7 | from .utils import inverse_sigmoid 8 | from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh 9 | 10 | 11 | 12 | def get_contrastive_denoising_training_group(targets, 13 | num_classes, 14 | num_queries, 15 | class_embed, 16 | num_denoising=100, 17 | label_noise_ratio=0.5, 18 | box_noise_scale=1.0,): 19 | """cnd""" 20 | if num_denoising <= 0: 21 | return None, None, None, None 22 | 23 | num_gts = [len(t['labels']) for t in targets] 24 | device = targets[0]['labels'].device 25 | 26 | max_gt_num = max(num_gts) 27 | if max_gt_num == 0: 28 | return None, None, None, None 29 | 30 | num_group = num_denoising // max_gt_num 31 | num_group = 1 if num_group == 0 else num_group 32 | # pad gt to max_num of a batch 33 | bs = len(num_gts) 34 | 35 | input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device) 36 | input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device) 37 | pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device) 38 | 39 | for i in range(bs): 40 | num_gt = num_gts[i] 41 | if num_gt > 0: 42 | input_query_class[i, :num_gt] = targets[i]['labels'] 43 | input_query_bbox[i, :num_gt] = targets[i]['boxes'] 44 | pad_gt_mask[i, :num_gt] = 1 45 | # each group has positive and negative queries. 
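# layout after the tiles below: [pos_0 .. pos_{max_gt_num-1}, neg_0 .. neg_{max_gt_num-1}]
# repeated num_group times along dim 1; positives receive small box jitter while
# negatives get boxes shifted by 1-2x their half size (the rand_part + 1.0 branch
# under box_noise_scale).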
46 | input_query_class = input_query_class.tile([1, 2 * num_group]) 47 | input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) 48 | pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) 49 | # positive and negative mask 50 | negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device) 51 | negative_gt_mask[:, max_gt_num:] = 1 52 | negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) 53 | positive_gt_mask = 1 - negative_gt_mask 54 | # contrastive denoising training positive index 55 | positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask 56 | dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] 57 | dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts]) 58 | # total denoising queries 59 | num_denoising = int(max_gt_num * 2 * num_group) 60 | 61 | if label_noise_ratio > 0: 62 | mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5) 63 | # randomly put a new one here 64 | new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype) 65 | input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class) 66 | 67 | if box_noise_scale > 0: 68 | known_bbox = box_cxcywh_to_xyxy(input_query_bbox) 69 | diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale 70 | rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 71 | rand_part = torch.rand_like(input_query_bbox) 72 | rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask) 73 | known_bbox += (rand_sign * rand_part * diff) 74 | known_bbox = torch.clip(known_bbox, min=0.0, max=1.0) 75 | input_query_bbox = box_xyxy_to_cxcywh(known_bbox) 76 | input_query_bbox[input_query_bbox < 0] *= -1 77 | input_query_bbox_unact = inverse_sigmoid(input_query_bbox) 78 | 79 | input_query_logits = class_embed(input_query_class) 80 | 81 | tgt_size = num_denoising + num_queries 82 | attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device) 83 | # match query cannot see the reconstruction 84 | attn_mask[num_denoising:, :num_denoising] = True 85 | 86 | # reconstruct cannot see each other 87 | for i in range(num_group): 88 | if i == 0: 89 | attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True 90 | if i == num_group - 1: 91 | attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True 92 | else: 93 | attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True 94 | attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True 95 | 96 | dn_meta = { 97 | "dn_positive_idx": dn_positive_idx, 98 | "dn_num_group": num_group, 99 | "dn_num_split": [num_denoising, num_queries] 100 | } 101 | 102 | # print(input_query_class.shape) # torch.Size([4, 196, 256]) 103 | # print(input_query_bbox.shape) # torch.Size([4, 196, 4]) 104 | # print(attn_mask.shape) # torch.Size([496, 496]) 105 | 106 | return input_query_logits, input_query_bbox_unact, attn_mask, dn_meta 107 | -------------------------------------------------------------------------------- /engine/deim/lite_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIM: DETR with Improved Matching for Fast Convergence 3 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 
4 | --------------------------------------------------------------------------------- 5 | Modified from D-FINE (https://github.com/Peterande/D-FINE/) 6 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved. 7 | """ 8 | 9 | import copy 10 | from collections import OrderedDict 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from functools import partial 16 | 17 | from .utils import get_activation 18 | 19 | from ..core import register 20 | from .hybrid_encoder import ConvNormLayer_fuse 21 | from .hybrid_encoder import RepNCSPELAN4 22 | 23 | __all__ = ['LiteEncoder'] 24 | 25 | 26 | # Copy from https://github.com/meituan/YOLOv6/blob/main/yolov6/layers/common.py#L695 27 | class GAP_Fusion(nn.Module): 28 | '''BiFusion Block in PAN''' 29 | def __init__(self, in_channels, out_channels, act=None): 30 | super().__init__() 31 | self.cv = ConvNormLayer_fuse(out_channels, out_channels, 1, 1, act=act) 32 | 33 | def forward(self, x): 34 | # global average pooling 35 | gap = F.adaptive_avg_pool2d(x, 1) 36 | x = x + gap 37 | return self.cv(x) 38 | 39 | # Two-scale encoder 40 | @register() 41 | class LiteEncoder(nn.Module): 42 | __share__ = ['eval_spatial_size', ] 43 | 44 | def __init__(self, 45 | in_channels=[512], 46 | feat_strides=[16], 47 | hidden_dim=256, 48 | expansion=1.0, 49 | depth_mult=1.0, 50 | act='silu', 51 | eval_spatial_size=None, 52 | csp_type='csp2', 53 | ): 54 | super().__init__() 55 | self.in_channels = in_channels 56 | self.feat_strides = feat_strides 57 | self.hidden_dim = hidden_dim 58 | self.eval_spatial_size = eval_spatial_size 59 | self.out_channels = [hidden_dim for _ in range(len(in_channels))] 60 | self.out_strides = feat_strides 61 | 62 | # channel projection: unify the channel dimension of the input features 63 | self.input_proj = nn.ModuleList() 64 | for in_channel in in_channels: 65 | proj = nn.Sequential(OrderedDict([ 66 | ('conv', nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False)), 67 | ('norm', nn.BatchNorm2d(hidden_dim)) 68 | ])) 69 | 70 | self.input_proj.append(proj) 71 | 72 | # get the small-scale feature 73 | down_sample = nn.Sequential( # avg pooling 74 | nn.AvgPool2d(kernel_size=3, stride=2, padding=1), 75 | nn.Conv2d(hidden_dim, hidden_dim, 1, 1, bias=False), 76 | nn.BatchNorm2d(hidden_dim), 77 | get_activation(act) 78 | ) 79 | self.down_sample1 = copy.deepcopy(down_sample) 80 | self.down_sample2 = copy.deepcopy(down_sample) 81 | 82 | # Bi-Fusion 83 | self.bi_fusion = GAP_Fusion(hidden_dim, hidden_dim, act=act) 84 | 85 | # fuse block 86 | c1, c2, c3, c4, num_blocks = hidden_dim, hidden_dim, hidden_dim*2, round(expansion * hidden_dim // 2), round(3 * depth_mult) 87 | fuse_block = RepNCSPELAN4(c1=c1, c2=c2, c3=c3, c4=c4, n=num_blocks, act=act, csp_type=csp_type) 88 | self.fpn_block = copy.deepcopy(fuse_block) 89 | self.pan_block = copy.deepcopy(fuse_block) 90 | 91 | def forward(self, feats): 92 | assert len(feats) == len(self.in_channels) 93 | proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] 94 | proj_feats.append(self.down_sample1(proj_feats[-1])) # get the small-scale feature 95 | 96 | # fuse the global feature and the small-scale feature 97 | proj_feats[-1] = self.bi_fusion(proj_feats[-1]) 98 | 99 | outs = [] 100 | # fpn 101 | fuse_feat = proj_feats[0] + F.interpolate(proj_feats[1], scale_factor=2., mode='nearest') 102 | outs.append(self.fpn_block(fuse_feat)) 103 | 104 | fuse_feat = proj_feats[1] + self.down_sample2(outs[-1]) 105 | outs.append(self.pan_block(fuse_feat)) 106 | 107 | 
return outs
--------------------------------------------------------------------------------
/engine/deim/postprocessor.py:
--------------------------------------------------------------------------------
1 | """
2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4 | """
5 | 
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | 
10 | import torchvision
11 | 
12 | from ..core import register
13 | 
14 | 
15 | __all__ = ['PostProcessor']
16 | 
17 | 
18 | def mod(a, b):
19 | out = a - a // b * b  # equivalent to a % b, written with explicit ops to keep exported graphs simple
20 | return out
21 | 
22 | 
23 | @register()
24 | class PostProcessor(nn.Module):
25 | __share__ = [
26 | 'num_classes',
27 | 'use_focal_loss',
28 | 'num_top_queries',
29 | 'remap_mscoco_category'
30 | ]
31 | 
32 | def __init__(
33 | self,
34 | num_classes=80,
35 | use_focal_loss=True,
36 | num_top_queries=300,
37 | remap_mscoco_category=False
38 | ) -> None:
39 | super().__init__()
40 | self.use_focal_loss = use_focal_loss
41 | self.num_top_queries = num_top_queries
42 | self.num_classes = int(num_classes)
43 | self.remap_mscoco_category = remap_mscoco_category
44 | self.deploy_mode = False
45 | 
46 | def extra_repr(self) -> str:
47 | return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}'
48 | 
49 | # def forward(self, outputs, orig_target_sizes):
50 | def forward(self, outputs, orig_target_sizes: torch.Tensor):
51 | logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
52 | # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
53 | 
54 | bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
55 | bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
56 | 
57 | if self.use_focal_loss:
58 | scores = F.sigmoid(logits)
59 | scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
60 | # labels = index % self.num_classes
61 | labels = mod(index, self.num_classes)  # class id within the flattened (query, class) index
62 | index = index // self.num_classes  # query id within the flattened (query, class) index
63 | boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]))
64 | 
65 | else:
66 | scores = F.softmax(logits, dim=-1)[:, :, :-1]
67 | scores, labels = scores.max(dim=-1)
68 | boxes = bbox_pred  # keep this branch consistent with the focal branch (rescaled xyxy boxes)
69 | if scores.shape[1] > self.num_top_queries:
70 | scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
71 | labels = torch.gather(labels, dim=1, index=index)
72 | boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
73 | 
74 | if self.deploy_mode:
75 | return labels, boxes, scores
76 | 
77 | if self.remap_mscoco_category:
78 | from ..data.dataset import mscoco_label2category
79 | labels = torch.tensor([mscoco_label2category[int(x.item())] for x in labels.flatten()])\
80 | .to(boxes.device).reshape(labels.shape)
81 | 
82 | results = []
83 | for lab, box, sco in zip(labels, boxes, scores):
84 | result = dict(labels=lab, boxes=box, scores=sco)
85 | results.append(result)
86 | 
87 | return results
88 | 
89 | 
90 | def deploy(self, ):
91 | self.eval()
92 | self.deploy_mode = True
93 | return self
94 | 
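A small, self-contained illustration of the flattened top-k decoding used in the focal branch above; the tensor sizes here are made up:

import torch

num_queries, num_classes = 5, 80
scores = torch.rand(1, num_queries, num_classes)

top_scores, index = torch.topk(scores.flatten(1), k=3, dim=-1)
labels = index % num_classes      # class id, what mod(index, num_classes) computes
query_idx = index // num_classes  # which query each top score came from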
4 | """ 5 | 6 | from .logger import * 7 | from .visualizer import * 8 | from .dist_utils import setup_seed, setup_print 9 | from .profiler_utils import stats 10 | -------------------------------------------------------------------------------- /engine/misc/box_ops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torchvision 8 | from torch import Tensor 9 | from typing import List, Tuple 10 | 11 | 12 | def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: 13 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 14 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 15 | return torchvision.ops.generalized_box_iou(boxes1, boxes2) 16 | 17 | 18 | # elementwise 19 | def elementwise_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: 20 | """ 21 | Args: 22 | boxes1, [N, 4] 23 | boxes2, [N, 4] 24 | Returns: 25 | iou, [N, ] 26 | union, [N, ] 27 | """ 28 | area1 = torchvision.ops.box_area(boxes1) # [N, ] 29 | area2 = torchvision.ops.box_area(boxes2) # [N, ] 30 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N, 2] 31 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2] 32 | wh = (rb - lt).clamp(min=0) # [N, 2] 33 | inter = wh[:, 0] * wh[:, 1] # [N, ] 34 | union = area1 + area2 - inter 35 | iou = inter / union 36 | return iou, union 37 | 38 | 39 | def elementwise_generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: 40 | """ 41 | Args: 42 | boxes1, [N, 4] with [x1, y1, x2, y2] 43 | boxes2, [N, 4] with [x1, y1, x2, y2] 44 | Returns: 45 | giou, [N, ] 46 | """ 47 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 48 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 49 | iou, union = elementwise_box_iou(boxes1, boxes2) 50 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) # [N, 2] 51 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2] 52 | wh = (rb - lt).clamp(min=0) # [N, 2] 53 | area = wh[:, 0] * wh[:, 1] 54 | return iou - (area - union) / area 55 | 56 | 57 | def check_point_inside_box(points: Tensor, boxes: Tensor, eps=1e-9) -> Tensor: 58 | """ 59 | Args: 60 | points, [K, 2], (x, y) 61 | boxes, [N, 4], (x1, y1, y2, y2) 62 | Returns: 63 | Tensor (bool), [K, N] 64 | """ 65 | x, y = [p.unsqueeze(-1) for p in points.unbind(-1)] 66 | x1, y1, x2, y2 = [x.unsqueeze(0) for x in boxes.unbind(-1)] 67 | 68 | l = x - x1 69 | t = y - y1 70 | r = x2 - x 71 | b = y2 - y 72 | 73 | ltrb = torch.stack([l, t, r, b], dim=-1) 74 | mask = ltrb.min(dim=-1).values > eps 75 | 76 | return mask 77 | 78 | 79 | def point_box_distance(points: Tensor, boxes: Tensor) -> Tensor: 80 | """ 81 | Args: 82 | boxes, [N, 4], (x1, y1, x2, y2) 83 | points, [N, 2], (x, y) 84 | Returns: 85 | Tensor (N, 4), (l, t, r, b) 86 | """ 87 | x1y1, x2y2 = torch.split(boxes, 2, dim=-1) 88 | lt = points - x1y1 89 | rb = x2y2 - points 90 | return torch.concat([lt, rb], dim=-1) 91 | 92 | 93 | def point_distance_box(points: Tensor, distances: Tensor) -> Tensor: 94 | """ 95 | Args: 96 | points (Tensor), [N, 2], (x, y) 97 | distances (Tensor), [N, 4], (l, t, r, b) 98 | Returns: 99 | boxes (Tensor), (N, 4), (x1, y1, x2, y2) 100 | """ 101 | lt, rb = torch.split(distances, 2, dim=-1) 102 | x1y1 = -lt + points 103 | x2y2 = rb + points 104 | boxes = torch.concat([x1y1, x2y2], dim=-1) 105 | return boxes 106 | -------------------------------------------------------------------------------- /engine/misc/lazy_loader.py: 
--------------------------------------------------------------------------------
/engine/misc/lazy_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/util/lazy_loader.py
3 | """
4 | 
5 | 
6 | import types
7 | import importlib
8 | 
9 | class LazyLoader(types.ModuleType):
10 | """Lazily import a module, mainly to avoid pulling in large dependencies.
11 | 
12 | `paddle` and `ffmpeg` are examples of modules that are large and not always
13 | needed, and this allows them to only be loaded when they are used.
14 | """
15 | 
16 | # The lint error here is incorrect.
17 | def __init__(self, local_name, parent_module_globals, name, warning=None):
18 | self._local_name = local_name
19 | self._parent_module_globals = parent_module_globals
20 | self._warning = warning
21 | 
22 | # These members allow doctest to correctly process this module member without
23 | # triggering self._load(). self._load() mutates parent_module_globals and
24 | # triggers a 'dict mutated during iteration' error from doctest.py.
25 | # - for from_module()
26 | self.__module__ = name.rsplit(".", 1)[0]
27 | # - for is_routine()
28 | self.__wrapped__ = None
29 | 
30 | super(LazyLoader, self).__init__(name)
31 | 
32 | def _load(self):
33 | """Load the module and insert it into the parent's globals."""
34 | # Import the target module and insert it into the parent's namespace
35 | module = importlib.import_module(self.__name__)
36 | self._parent_module_globals[self._local_name] = module
37 | 
38 | # Emit a warning if one was specified
39 | if self._warning:
40 | # logging.warning(self._warning)
41 | # Make sure to only warn once.
42 | self._warning = None
43 | 
44 | # Update this object's dict so that if someone keeps a reference to the
45 | # LazyLoader, lookups are efficient (__getattr__ is only called on lookups
46 | # that fail).
47 | self.__dict__.update(module.__dict__)
48 | 
49 | return module
50 | 
51 | def __getattr__(self, item):
52 | module = self._load()
53 | return getattr(module, item)
54 | 
55 | def __repr__(self):
56 | # Careful not to trigger _load, since repr may be called in very
57 | # sensitive places.
58 | return f"<LazyLoader {self._local_name} for module '{self.__name__}'>"
59 | 
60 | def __dir__(self):
61 | module = self._load()
62 | return dir(module)
63 | 
64 | 
65 | # import paddle.nn as nn
66 | # nn = LazyLoader("nn", globals(), "paddle.nn")
67 | 
68 | # class M(nn.Layer):
69 | #     def __init__(self) -> None:
70 | #         super().__init__()
71 | 
--------------------------------------------------------------------------------
/engine/misc/profiler_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3 | """ 4 | 5 | import copy 6 | from calflops import calculate_flops 7 | from typing import Tuple 8 | 9 | def stats( 10 | cfg, 11 | input_shape: Tuple=(1, 3, 640, 640), ) -> Tuple[int, dict]: 12 | 13 | base_size = cfg.train_dataloader.collate_fn.base_size 14 | input_shape = (1, 3, base_size, base_size) 15 | 16 | model_for_info = copy.deepcopy(cfg.model).deploy() 17 | 18 | flops, macs, _ = calculate_flops(model=model_for_info, 19 | input_shape=input_shape, 20 | output_as_string=True, 21 | output_precision=4, 22 | print_detailed=False) 23 | params = sum(p.numel() for p in model_for_info.parameters()) 24 | del model_for_info 25 | 26 | return params, {"Model FLOPs:%s MACs:%s Params:%s" %(flops, macs, params)} 27 | -------------------------------------------------------------------------------- /engine/misc/visualizer.py: -------------------------------------------------------------------------------- 1 | """" 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import PIL 7 | import torch 8 | import torch.utils.data 9 | import torchvision 10 | torchvision.disable_beta_transforms_warning() 11 | 12 | __all__ = ['show_sample'] 13 | 14 | def show_sample(sample): 15 | """for coco dataset/dataloader 16 | """ 17 | import matplotlib.pyplot as plt 18 | from torchvision.transforms.v2 import functional as F 19 | from torchvision.utils import draw_bounding_boxes 20 | 21 | image, target = sample 22 | if isinstance(image, PIL.Image.Image): 23 | image = F.to_image_tensor(image) 24 | 25 | image = F.convert_dtype(image, torch.uint8) 26 | annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3) 27 | 28 | fig, ax = plt.subplots() 29 | ax.imshow(annotated_image.permute(1, 2, 0).numpy()) 30 | ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) 31 | fig.tight_layout() 32 | fig.show() 33 | plt.show() 34 | -------------------------------------------------------------------------------- /engine/optim/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | from .ema import * 7 | from .optim import * 8 | from .amp import * 9 | from .warmup import * 10 | -------------------------------------------------------------------------------- /engine/optim/amp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | 7 | import torch.cuda.amp as amp 8 | 9 | from ..core import register 10 | 11 | 12 | __all__ = ['GradScaler'] 13 | 14 | GradScaler = register()(amp.grad_scaler.GradScaler) 15 | -------------------------------------------------------------------------------- /engine/optim/ema.py: -------------------------------------------------------------------------------- 1 | """ 2 | D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement 3 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 6 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
7 | """ 8 | 9 | 10 | import torch 11 | import torch.nn as nn 12 | 13 | import math 14 | from copy import deepcopy 15 | 16 | from ..core import register 17 | from ..misc import dist_utils 18 | 19 | __all__ = ['ModelEMA'] 20 | 21 | 22 | @register() 23 | class ModelEMA(object): 24 | """ 25 | Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models 26 | Keep a moving average of everything in the model state_dict (parameters and buffers). 27 | This is intended to allow functionality like 28 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 29 | A smoothed version of the weights is necessary for some training schemes to perform well. 30 | This class is sensitive where it is initialized in the sequence of model init, 31 | GPU assignment and distributed training wrappers. 32 | """ 33 | def __init__(self, model: nn.Module, decay: float=0.9999, warmups: int=1000, start: int=0): 34 | super().__init__() 35 | 36 | self.module = deepcopy(dist_utils.de_parallel(model)).eval() 37 | # if next(model.parameters()).device.type != 'cpu': 38 | # self.module.half() # FP16 EMA 39 | 40 | self.decay = decay 41 | self.warmups = warmups 42 | self.before_start = 0 43 | self.start = start 44 | self.updates = 0 # number of EMA updates 45 | if warmups == 0: 46 | self.decay_fn = lambda x: decay 47 | else: 48 | self.decay_fn = lambda x: decay * (1 - math.exp(-x / warmups)) # decay exponential ramp (to help early epochs) 49 | 50 | for p in self.module.parameters(): 51 | p.requires_grad_(False) 52 | 53 | 54 | def update(self, model: nn.Module): 55 | if self.before_start < self.start: 56 | self.before_start += 1 57 | return 58 | # Update EMA parameters 59 | with torch.no_grad(): 60 | self.updates += 1 61 | d = self.decay_fn(self.updates) 62 | msd = dist_utils.de_parallel(model).state_dict() 63 | for k, v in self.module.state_dict().items(): 64 | if v.dtype.is_floating_point: 65 | v *= d 66 | v += (1 - d) * msd[k].detach() 67 | 68 | def to(self, *args, **kwargs): 69 | self.module = self.module.to(*args, **kwargs) 70 | return self 71 | 72 | def state_dict(self, ): 73 | return dict(module=self.module.state_dict(), updates=self.updates) 74 | 75 | def load_state_dict(self, state, strict=True): 76 | self.module.load_state_dict(state['module'], strict=strict) 77 | if 'updates' in state: 78 | self.updates = state['updates'] 79 | 80 | def forwad(self, ): 81 | raise RuntimeError('ema...') 82 | 83 | def extra_repr(self) -> str: 84 | return f'decay={self.decay}, warmups={self.warmups}' 85 | 86 | 87 | 88 | class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel): 89 | """Maintains moving averages of model parameters using an exponential decay. 90 | ``ema_avg = decay * avg_model_param + (1 - decay) * model_param`` 91 | `torch.optim.swa_utils.AveragedModel `_ 92 | is used to compute the EMA. 
93 | """ 94 | def __init__(self, model, decay, device="cpu", use_buffers=True): 95 | 96 | self.decay_fn = lambda x: decay * (1 - math.exp(-x / 2000)) 97 | 98 | def ema_avg(avg_model_param, model_param, num_averaged): 99 | decay = self.decay_fn(num_averaged) 100 | return decay * avg_model_param + (1 - decay) * model_param 101 | 102 | super().__init__(model, device, ema_avg, use_buffers=use_buffers) 103 | -------------------------------------------------------------------------------- /engine/optim/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIM: DETR with Improved Matching for Fast Convergence 3 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 4 | """ 5 | 6 | import math 7 | from functools import partial 8 | 9 | 10 | def flat_cosine_schedule(total_iter, warmup_iter, flat_iter, no_aug_iter, current_iter, init_lr, min_lr): 11 | """ 12 | Computes the learning rate using a warm-up, flat, and cosine decay schedule. 13 | 14 | Args: 15 | total_iter (int): Total number of iterations. 16 | warmup_iter (int): Number of iterations for warm-up phase. 17 | flat_iter (int): Number of iterations for flat phase. 18 | no_aug_iter (int): Number of iterations for no-augmentation phase. 19 | current_iter (int): Current iteration. 20 | init_lr (float): Initial learning rate. 21 | min_lr (float): Minimum learning rate. 22 | 23 | Returns: 24 | float: Calculated learning rate. 25 | """ 26 | if current_iter <= warmup_iter: 27 | return init_lr * (current_iter / float(warmup_iter)) ** 2 28 | elif warmup_iter < current_iter <= flat_iter: 29 | return init_lr 30 | elif current_iter >= total_iter - no_aug_iter: 31 | return min_lr 32 | else: 33 | cosine_decay = 0.5 * (1 + math.cos(math.pi * (current_iter - flat_iter) / 34 | (total_iter - flat_iter - no_aug_iter))) 35 | return min_lr + (init_lr - min_lr) * cosine_decay 36 | 37 | 38 | class FlatCosineLRScheduler: 39 | """ 40 | Learning rate scheduler with warm-up, optional flat phase, and cosine decay following RTMDet. 41 | 42 | Args: 43 | optimizer (torch.optim.Optimizer): Optimizer instance. 44 | lr_gamma (float): Scaling factor for the minimum learning rate. 45 | iter_per_epoch (int): Number of iterations per epoch. 46 | total_epochs (int): Total number of training epochs. 47 | warmup_epochs (int): Number of warm-up epochs. 48 | flat_epochs (int): Number of flat epochs (for flat-cosine scheduler). 49 | no_aug_epochs (int): Number of no-augmentation epochs. 50 | """ 51 | def __init__(self, optimizer, lr_gamma, iter_per_epoch, total_epochs, 52 | warmup_iter, flat_epochs, no_aug_epochs, scheduler_type="cosine"): 53 | self.base_lrs = [group["initial_lr"] for group in optimizer.param_groups] 54 | self.min_lrs = [base_lr * lr_gamma for base_lr in self.base_lrs] 55 | 56 | total_iter = int(iter_per_epoch * total_epochs) 57 | no_aug_iter = int(iter_per_epoch * no_aug_epochs) 58 | flat_iter = int(iter_per_epoch * flat_epochs) 59 | 60 | print(self.base_lrs, self.min_lrs, total_iter, warmup_iter, flat_iter, no_aug_iter) 61 | self.lr_func = partial(flat_cosine_schedule, total_iter, warmup_iter, flat_iter, no_aug_iter) 62 | 63 | def step(self, current_iter, optimizer): 64 | """ 65 | Updates the learning rate of the optimizer at the current iteration. 66 | 67 | Args: 68 | current_iter (int): Current iteration. 69 | optimizer (torch.optim.Optimizer): Optimizer instance. 
70 | """ 71 | for i, group in enumerate(optimizer.param_groups): 72 | group["lr"] = self.lr_func(current_iter, self.base_lrs[i], self.min_lrs[i]) 73 | return optimizer 74 | -------------------------------------------------------------------------------- /engine/optim/optim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | 7 | import torch.optim as optim 8 | import torch.optim.lr_scheduler as lr_scheduler 9 | 10 | from ..core import register 11 | 12 | 13 | __all__ = ['AdamW', 'SGD', 'Adam', 'MultiStepLR', 'CosineAnnealingLR', 'OneCycleLR', 'LambdaLR'] 14 | 15 | 16 | 17 | SGD = register()(optim.SGD) 18 | Adam = register()(optim.Adam) 19 | AdamW = register()(optim.AdamW) 20 | 21 | 22 | MultiStepLR = register()(lr_scheduler.MultiStepLR) 23 | CosineAnnealingLR = register()(lr_scheduler.CosineAnnealingLR) 24 | OneCycleLR = register()(lr_scheduler.OneCycleLR) 25 | LambdaLR = register()(lr_scheduler.LambdaLR) 26 | -------------------------------------------------------------------------------- /engine/optim/warmup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | from torch.optim.lr_scheduler import LRScheduler 7 | 8 | from ..core import register 9 | 10 | 11 | class Warmup(object): 12 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int=-1) -> None: 13 | self.lr_scheduler = lr_scheduler 14 | self.warmup_end_values = [pg['lr'] for pg in lr_scheduler.optimizer.param_groups] 15 | self.last_step = last_step 16 | self.warmup_duration = warmup_duration 17 | self.step() 18 | 19 | def state_dict(self): 20 | return {k: v for k, v in self.__dict__.items() if k != 'lr_scheduler'} 21 | 22 | def load_state_dict(self, state_dict): 23 | self.__dict__.update(state_dict) 24 | 25 | def get_warmup_factor(self, step, **kwargs): 26 | raise NotImplementedError 27 | 28 | def step(self, ): 29 | self.last_step += 1 30 | if self.last_step >= self.warmup_duration: 31 | return 32 | factor = self.get_warmup_factor(self.last_step) 33 | for i, pg in enumerate(self.lr_scheduler.optimizer.param_groups): 34 | pg['lr'] = factor * self.warmup_end_values[i] 35 | 36 | def finished(self, ): 37 | if self.last_step >= self.warmup_duration: 38 | return True 39 | return False 40 | 41 | 42 | @register() 43 | class LinearWarmup(Warmup): 44 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int = -1) -> None: 45 | super().__init__(lr_scheduler, warmup_duration, last_step) 46 | 47 | def get_warmup_factor(self, step): 48 | return min(1.0, (step + 1) / self.warmup_duration) 49 | -------------------------------------------------------------------------------- /engine/solver/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | from ._solver import BaseSolver 7 | from .clas_solver import ClasSolver 8 | from .det_solver import DetSolver 9 | 10 | 11 | 12 | from typing import Dict 13 | 14 | TASKS :Dict[str, BaseSolver] = { 15 | 'classification': ClasSolver, 16 | 'detection': DetSolver, 17 | } 18 | -------------------------------------------------------------------------------- /engine/solver/clas_engine.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from ..misc import (MetricLogger, SmoothedValue, reduce_dict) 10 | 11 | 12 | def train_one_epoch(model: nn.Module, criterion: nn.Module, dataloader, optimizer, ema, epoch, device): 13 | """ 14 | """ 15 | model.train() 16 | 17 | metric_logger = MetricLogger(delimiter=" ") 18 | metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}')) 19 | print_freq = 100 20 | header = 'Epoch: [{}]'.format(epoch) 21 | 22 | for imgs, labels in metric_logger.log_every(dataloader, print_freq, header): 23 | imgs = imgs.to(device) 24 | labels = labels.to(device) 25 | 26 | preds = model(imgs) 27 | loss: torch.Tensor = criterion(preds, labels, epoch) 28 | 29 | optimizer.zero_grad() 30 | loss.backward() 31 | optimizer.step() 32 | 33 | if ema is not None: 34 | ema.update(model) 35 | 36 | loss_reduced_values = {k: v.item() for k, v in reduce_dict({'loss': loss}).items()} 37 | metric_logger.update(**loss_reduced_values) 38 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 39 | 40 | metric_logger.synchronize_between_processes() 41 | print("Averaged stats:", metric_logger) 42 | 43 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 44 | return stats 45 | 46 | 47 | 48 | @torch.no_grad() 49 | def evaluate(model, criterion, dataloader, device): 50 | model.eval() 51 | 52 | metric_logger = MetricLogger(delimiter=" ") 53 | # metric_logger.add_meter('acc', SmoothedValue(window_size=1, fmt='{global_avg:.4f}')) 54 | # metric_logger.add_meter('loss', SmoothedValue(window_size=1, fmt='{value:.2f}')) 55 | metric_logger.add_meter('acc', SmoothedValue(window_size=1)) 56 | metric_logger.add_meter('loss', SmoothedValue(window_size=1)) 57 | 58 | header = 'Test:' 59 | for imgs, labels in metric_logger.log_every(dataloader, 10, header): 60 | imgs, labels = imgs.to(device), labels.to(device) 61 | preds = model(imgs) 62 | 63 | acc = (preds.argmax(dim=-1) == labels).sum() / preds.shape[0] 64 | loss = criterion(preds, labels) 65 | 66 | dict_reduced = reduce_dict({'acc': acc, 'loss': loss}) 67 | reduced_values = {k: v.item() for k, v in dict_reduced.items()} 68 | metric_logger.update(**reduced_values) 69 | 70 | metric_logger.synchronize_between_processes() 71 | print("Averaged stats:", metric_logger) 72 | 73 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 74 | return stats 75 | -------------------------------------------------------------------------------- /engine/solver/clas_solver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | import time 7 | import json 8 | import datetime 9 | from pathlib import Path 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | from ..misc import dist_utils 15 | from ._solver import BaseSolver 16 | from .clas_engine import train_one_epoch, evaluate 17 | 18 | 19 | class ClasSolver(BaseSolver): 20 | 21 | def fit(self, ): 22 | print("Start training") 23 | self.train() 24 | args = self.cfg 25 | 26 | n_parameters = sum(p.numel() for p in self.model.parameters() if p.requires_grad) 27 | print('Number of params:', n_parameters) 28 | 29 | output_dir = Path(args.output_dir) 30 | output_dir.mkdir(exist_ok=True) 31 | 32 | start_time = time.time() 33 | start_epoch = self.last_epoch + 1 34 | for epoch in range(start_epoch, args.epoches): 35 | 36 | if dist_utils.is_dist_available_and_initialized(): 37 | self.train_dataloader.sampler.set_epoch(epoch) 38 | 39 | train_stats = train_one_epoch(self.model, 40 | self.criterion, 41 | self.train_dataloader, 42 | self.optimizer, 43 | self.ema, 44 | epoch=epoch, 45 | device=self.device) 46 | self.lr_scheduler.step() 47 | self.last_epoch += 1 48 | 49 | if output_dir: 50 | checkpoint_paths = [output_dir / 'checkpoint.pth'] 51 | # extra checkpoint before LR drop and every 100 epochs 52 | if (epoch + 1) % args.checkpoint_freq == 0: 53 | checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') 54 | for checkpoint_path in checkpoint_paths: 55 | dist_utils.save_on_master(self.state_dict(epoch), checkpoint_path) 56 | 57 | module = self.ema.module if self.ema else self.model 58 | test_stats = evaluate(module, self.criterion, self.val_dataloader, self.device) 59 | 60 | log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, 61 | **{f'test_{k}': v for k, v in test_stats.items()}, 62 | 'epoch': epoch, 63 | 'n_parameters': n_parameters} 64 | 65 | if output_dir and dist_utils.is_main_process(): 66 | with (output_dir / "log.txt").open("a") as f: 67 | f.write(json.dumps(log_stats) + "\n") 68 | 69 | total_time = time.time() - start_time 70 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 71 | print('Training time {}'.format(total_time_str)) 72 | -------------------------------------------------------------------------------- /figures/deimv2_coco_AP_vs_GFLOPs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intellindust-AI-Lab/DEIMv2/19d5b19a58c229dd7ad5f079947bbe398e005d01/figures/deimv2_coco_AP_vs_GFLOPs.png -------------------------------------------------------------------------------- /figures/deimv2_coco_AP_vs_Params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intellindust-AI-Lab/DEIMv2/19d5b19a58c229dd7ad5f079947bbe398e005d01/figures/deimv2_coco_AP_vs_Params.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.5.1 2 | torchvision==0.20.1 3 | faster-coco-eval>=1.6.7 4 | PyYAML 5 | tensorboard 6 | scipy 7 | calflops 8 | transformers 9 | -------------------------------------------------------------------------------- /tools/benchmark/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | import os 7 | import glob 8 | from PIL import Image 9 | 10 | import torch 11 | import torch.utils.data as data 12 | import torchvision 13 | import torchvision.transforms as T 14 | import torchvision.transforms.functional as F 15 | 16 | Image.MAX_IMAGE_PIXELS = None 17 | 18 | class ToTensor(T.ToTensor): 19 | def __init__(self) -> None: 20 | super().__init__() 21 | 22 | def __call__(self, pic): 23 | if isinstance(pic, torch.Tensor): 24 | return pic 25 | return super().__call__(pic) 26 | 27 | class PadToSize(T.Pad): 28 | def __init__(self, size, fill=0, padding_mode='constant'): 29 | super().__init__(0, fill, padding_mode) 30 | self.size = size 31 | self.fill = fill 32 | 33 | def __call__(self, img): 34 | """ 35 | Args: 36 | img (PIL Image or Tensor): Image to be padded. 37 | 38 | Returns: 39 | PIL Image or Tensor: Padded image. 40 | """ 41 | w, h = F.get_image_size(img) 42 | padding = (0, 0, self.size[0] - w, self.size[1] - h) 43 | return F.pad(img, padding, self.fill, self.padding_mode) 44 | 45 | 46 | class Dataset(data.Dataset): 47 | def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None: 48 | super().__init__() 49 | 50 | self.device = device 51 | self.size = 640 52 | 53 | self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg'))) 54 | 55 | if preprocess is None: 56 | self.preprocess = T.Compose([ 57 | T.Resize(size=639, max_size=640), 58 | PadToSize(size=(640, 640), fill=114), 59 | ToTensor(), 60 | T.ConvertImageDtype(torch.float), 61 | ]) 62 | else: 63 | self.preprocess = preprocess 64 | 65 | def __len__(self, ): 66 | return len(self.im_path_list) 67 | 68 | def __getitem__(self, index): 69 | # im = Image.open(self.img_path_list[index]).convert('RGB') 70 | im = torchvision.io.read_file(self.im_path_list[index]) 71 | im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device) 72 | _, h, w = im.shape # c,h,w 73 | 74 | im = self.preprocess(im) 75 | 76 | blob = { 77 | 'images': im, 78 | 'im_shape': torch.tensor([self.size, self.size]).to(im.device), 79 | 'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device), 80 | 'orig_target_sizes': torch.tensor([w, h]).to(im.device), 81 | } 82 | 83 | return blob 84 | 85 | @staticmethod 86 | def post_process(): 87 | pass 88 | 89 | @staticmethod 90 | def collate_fn(): 91 | pass 92 | 93 | 94 | def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''): 95 | '''show result 96 | Keys: 97 | 'num_dets', 'det_boxes', 'det_scores', 'det_classes' 98 | ''' 99 | for i in range(blob['image'].shape[0]): 100 | det_scores = outputs['det_scores'][i] 101 | det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold] 102 | 103 | im = (blob['image'][i] * 255).to(torch.uint8) 104 | im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2) 105 | Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg') 106 | -------------------------------------------------------------------------------- /tools/benchmark/get_info.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
3 | """ 4 | 5 | import os 6 | import sys 7 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 8 | 9 | import argparse 10 | from calflops import calculate_flops 11 | from engine.core import YAMLConfig 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | def custom_repr(self): 17 | return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' 18 | original_repr = torch.Tensor.__repr__ 19 | torch.Tensor.__repr__ = custom_repr 20 | 21 | def main(args, ): 22 | """main 23 | """ 24 | cfg = YAMLConfig(args.config, resume=None) 25 | class Model_for_flops(nn.Module): 26 | def __init__(self, ) -> None: 27 | super().__init__() 28 | self.model = cfg.model.deploy() 29 | 30 | def forward(self, images): 31 | outputs = self.model(images) 32 | return outputs 33 | 34 | model = Model_for_flops().eval() 35 | 36 | flops, macs, _ = calculate_flops(model=model, 37 | input_shape=(1, 3, 640, 640), 38 | output_as_string=True, 39 | output_precision=4) 40 | params = sum(p.numel() for p in model.parameters()) 41 | print("Model FLOPs:%s MACs:%s Params:%s \n" %(flops, macs, params)) 42 | 43 | 44 | if __name__ == '__main__': 45 | 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('--config', '-c', default= "configs/dfine/dfine_hgnetv2_l_coco.yml", type=str) 48 | args = parser.parse_args() 49 | 50 | main(args) 51 | -------------------------------------------------------------------------------- /tools/benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | onnxruntime 2 | tensorrt 3 | pycuda 4 | calflops 5 | tqdm 6 | # onnx_graphsurgeon # for YOLOs 7 | -------------------------------------------------------------------------------- /tools/benchmark/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import contextlib 3 | import numpy as np 4 | from PIL import Image 5 | from collections import OrderedDict 6 | 7 | import onnx 8 | import torch 9 | import onnx_graphsurgeon 10 | 11 | 12 | def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'): 13 | '''--loadInputs='image:input_tensor.bin' 14 | ''' 15 | im = Image.open(path).resize(size) 16 | data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255. 
17 | data.tofile(output_name) 18 | 19 | 20 | def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False): 21 | ''' 22 | http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html 23 | https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py 24 | ''' 25 | onnx_model = onnx.load(path) 26 | 27 | if simplify: 28 | from onnxsim import simplify 29 | onnx_model, _ = simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]}) 30 | 31 | graph = onnx_graphsurgeon.import_onnx(onnx_model) 32 | graph.toposort() 33 | graph.fold_constants() 34 | graph.cleanup() 35 | 36 | topk = max_output_boxes 37 | attrs = OrderedDict(plugin_version='1', 38 | background_class=-1, 39 | max_output_boxes=topk, 40 | score_threshold=score_threshold, 41 | iou_threshold=iou_threshold, 42 | score_activation=False, 43 | box_coding=0, ) 44 | 45 | outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]), 46 | onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]), 47 | onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]), 48 | onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])] 49 | 50 | graph.layer(op='EfficientNMS_TRT', 51 | name="batched_nms", 52 | inputs=[graph.outputs[0], 53 | graph.outputs[1]], 54 | outputs=outputs, 55 | attrs=attrs, ) 56 | 57 | graph.outputs = outputs 58 | graph.cleanup().toposort() 59 | 60 | onnx.save(onnx_graphsurgeon.export_onnx(graph), 'yolo_w_nms.onnx') 61 | 62 | 63 | class TimeProfiler(contextlib.ContextDecorator): 64 | def __init__(self, ): 65 | self.total = 0 66 | 67 | def __enter__(self, ): 68 | self.start = self.time() 69 | return self 70 | 71 | def __exit__(self, type, value, traceback): 72 | self.total += self.time() - self.start 73 | 74 | def reset(self, ): 75 | self.total = 0 76 | 77 | def time(self, ): 78 | if torch.cuda.is_available(): 79 | torch.cuda.synchronize() 80 | return time.time() 81 | -------------------------------------------------------------------------------- /tools/deployment/export_onnx.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIMv2: Real-Time Object Detection Meets DINOv3 3 | Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement 6 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 9 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
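Exports a trained detector together with its postprocessor to ONNX, with optional model checking and simplification.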
10 | """ 11 | 12 | import os 13 | import sys 14 | 15 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from engine.core import YAMLConfig 21 | 22 | 23 | def main(args, ): 24 | """main 25 | """ 26 | cfg = YAMLConfig(args.config, resume=args.resume) 27 | 28 | if 'HGNetv2' in cfg.yaml_cfg: 29 | cfg.yaml_cfg['HGNetv2']['pretrained'] = False 30 | 31 | if args.resume: 32 | checkpoint = torch.load(args.resume, map_location='cpu') 33 | if 'ema' in checkpoint: 34 | state = checkpoint['ema']['module'] 35 | else: 36 | state = checkpoint['model'] 37 | 38 | # NOTE load train mode state -> convert to deploy mode 39 | cfg.model.load_state_dict(state) 40 | 41 | else: 42 | # raise AttributeError('Only support resume to load model.state_dict by now.') 43 | print('not load model.state_dict, use default init state dict...') 44 | 45 | class Model(nn.Module): 46 | def __init__(self, ) -> None: 47 | super().__init__() 48 | self.model = cfg.model.deploy() 49 | self.postprocessor = cfg.postprocessor.deploy() 50 | 51 | def forward(self, images, orig_target_sizes): 52 | outputs = self.model(images) 53 | outputs = self.postprocessor(outputs, orig_target_sizes) 54 | return outputs 55 | 56 | model = Model() 57 | 58 | img_size = cfg.yaml_cfg["eval_spatial_size"] 59 | data = torch.rand(32, 3, *img_size) 60 | size = torch.tensor([img_size]) 61 | _ = model(data, size) 62 | 63 | dynamic_axes = { 64 | 'images': {0: 'N', }, 65 | 'orig_target_sizes': {0: 'N'} 66 | } 67 | 68 | output_file = args.resume.replace('.pth', '.onnx') if args.resume else 'model.onnx' 69 | 70 | torch.onnx.export( 71 | model, 72 | (data, size), 73 | output_file, 74 | input_names=['images', 'orig_target_sizes'], 75 | output_names=['labels', 'boxes', 'scores'], 76 | dynamic_axes=dynamic_axes, 77 | opset_version=args.opset, 78 | verbose=False, 79 | do_constant_folding=True, 80 | ) 81 | 82 | if args.check: 83 | import onnx 84 | onnx_model = onnx.load(output_file) 85 | onnx.checker.check_model(onnx_model) 86 | print('Check export onnx model done...') 87 | 88 | if args.simplify: 89 | import onnx 90 | import onnxsim 91 | dynamic = True 92 | # input_shapes = {'images': [1, 3, 640, 640], 'orig_target_sizes': [1, 2]} if dynamic else None 93 | input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None 94 | onnx_model_simplify, check = onnxsim.simplify(output_file, test_input_shapes=input_shapes) 95 | onnx.save(onnx_model_simplify, output_file) 96 | print(f'Simplify onnx model {check}...') 97 | 98 | 99 | if __name__ == '__main__': 100 | 101 | import argparse 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument('--config', '-c', default='configs/dfine/dfine_hgnetv2_l_coco.yml', type=str, ) 104 | parser.add_argument('--resume', '-r', type=str, ) 105 | parser.add_argument('--opset', type=int, default=17,) 106 | parser.add_argument('--check', action='store_true') 107 | parser.add_argument('--simplify', action='store_true') 108 | args = parser.parse_args() 109 | main(args) 110 | -------------------------------------------------------------------------------- /tools/deployment/export_yolo_w_nms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | 4 | import numpy as np 5 | import onnxruntime as ort 6 | 7 | from utils import yolo_insert_nms 8 | 9 | class YOLO11(torch.nn.Module): 10 | def __init__(self, name) -> None: 11 | super().__init__() 12 | from 
ultralytics import YOLO 13 | # Load a model 14 | # build a new model from scratch 15 | # model = YOLO(f'{name}.yaml') 16 | 17 | # load a pretrained model (recommended for training) 18 | model = YOLO("yolo11n.pt") 19 | self.model = model.model 20 | 21 | def forward(self, x): 22 | '''https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216 23 | ''' 24 | pred: torch.Tensor = self.model(x)[0] # n 84 8400, 25 | pred = pred.permute(0, 2, 1) 26 | boxes, scores = pred.split([4, 80], dim=-1) 27 | boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') 28 | 29 | return boxes, scores 30 | 31 | 32 | 33 | def export_onnx(name='yolov8n'): 34 | '''export onnx 35 | ''' 36 | m = YOLO11(name) 37 | 38 | x = torch.rand(1, 3, 640, 640) 39 | dynamic_axes = { 40 | 'image': {0: '-1'} 41 | } 42 | torch.onnx.export(m, x, f'{name}.onnx', 43 | input_names=['image'], 44 | output_names=['boxes', 'scores'], 45 | opset_version=13, 46 | dynamic_axes=dynamic_axes) 47 | 48 | data = np.random.rand(1, 3, 640, 640).astype(np.float32) 49 | sess = ort.InferenceSession(f'{name}.onnx') 50 | _ = sess.run(output_names=None, input_feed={'image': data}) 51 | 52 | import onnx 53 | import onnxslim 54 | model_onnx = onnx.load(f'{name}.onnx') 55 | model_onnx = onnxslim.slim(model_onnx) 56 | onnx.save(model_onnx, f'{name}.onnx') 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | import argparse 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--name', type=str, default='yolo11n_tuned') 64 | parser.add_argument('--score_threshold', type=float, default=0.01) 65 | parser.add_argument('--iou_threshold', type=float, default=0.6) 66 | parser.add_argument('--max_output_boxes', type=int, default=300) 67 | args = parser.parse_args() 68 | 69 | export_onnx(name=args.name) 70 | 71 | yolo_insert_nms(path=f'{args.name}.onnx', 72 | score_threshold=args.score_threshold, 73 | iou_threshold=args.iou_threshold, 74 | max_output_boxes=args.max_output_boxes, ) 75 | -------------------------------------------------------------------------------- /tools/inference/openvino_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | 7 | # please reference: https://github.com/guojin-yan/RT-DETR-OpenVINO 8 | -------------------------------------------------------------------------------- /tools/inference/requirements.txt: -------------------------------------------------------------------------------- 1 | onnxruntime 2 | tensorrt 3 | -------------------------------------------------------------------------------- /tools/inference/torch_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIMv2: Real-Time Object Detection Meets DINOv3 3 | Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from D-FINE (https://github.com/Peterande/D-FINE) 6 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
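Runs PyTorch inference on a single image or a video file and saves the annotated result.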
7 | """ 8 | 9 | import os 10 | import sys 11 | 12 | import cv2 # Added for video processing 13 | import numpy as np 14 | import torch 15 | import torch.nn as nn 16 | import torchvision.transforms as T 17 | from PIL import Image, ImageDraw 18 | 19 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) 20 | from engine.core import YAMLConfig 21 | 22 | 23 | def draw(images, labels, boxes, scores, thrh=0.4): 24 | for i, im in enumerate(images): 25 | draw = ImageDraw.Draw(im) 26 | 27 | scr = scores[i] 28 | lab = labels[i][scr > thrh] 29 | box = boxes[i][scr > thrh] 30 | scrs = scr[scr > thrh] 31 | 32 | for j, b in enumerate(box): 33 | draw.rectangle(list(b), outline='red') 34 | draw.text((b[0], b[1]), text=f"{lab[j].item()} {round(scrs[j].item(), 2)}", fill='blue', ) 35 | 36 | im.save('torch_results.jpg') 37 | 38 | 39 | def process_image(model, device, file_path, size=(640, 640)): 40 | im_pil = Image.open(file_path).convert('RGB') 41 | w, h = im_pil.size 42 | orig_size = torch.tensor([[w, h]]).to(device) 43 | 44 | transforms = T.Compose([ 45 | T.Resize(size), 46 | T.ToTensor(), 47 | ]) 48 | im_data = transforms(im_pil).unsqueeze(0).to(device) 49 | 50 | output = model(im_data, orig_size) 51 | labels, boxes, scores = output 52 | 53 | draw([im_pil], labels, boxes, scores) 54 | 55 | 56 | def process_video(model, device, file_path, size=(640, 640)): 57 | cap = cv2.VideoCapture(file_path) 58 | 59 | # Get video properties 60 | fps = cap.get(cv2.CAP_PROP_FPS) 61 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 62 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 63 | 64 | # Define the codec and create VideoWriter object 65 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 66 | out = cv2.VideoWriter('torch_results.mp4', fourcc, fps, (orig_w, orig_h)) 67 | 68 | transforms = T.Compose([ 69 | T.Resize(size), 70 | T.ToTensor(), 71 | ]) 72 | 73 | frame_count = 0 74 | print("Processing video frames...") 75 | while cap.isOpened(): 76 | ret, frame = cap.read() 77 | if not ret: 78 | break 79 | 80 | # Convert frame to PIL image 81 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 82 | 83 | w, h = frame_pil.size 84 | orig_size = torch.tensor([[w, h]]).to(device) 85 | 86 | im_data = transforms(frame_pil).unsqueeze(0).to(device) 87 | 88 | output = model(im_data, orig_size) 89 | labels, boxes, scores = output 90 | 91 | # Draw detections on the frame 92 | draw([frame_pil], labels, boxes, scores) 93 | 94 | # Convert back to OpenCV image 95 | frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR) 96 | 97 | # Write the frame 98 | out.write(frame) 99 | frame_count += 1 100 | 101 | if frame_count % 10 == 0: 102 | print(f"Processed {frame_count} frames...") 103 | 104 | cap.release() 105 | out.release() 106 | print("Video processing complete. 
Result saved as 'torch_results.mp4'.") 107 | 108 | 109 | def main(args): 110 | """Main function""" 111 | cfg = YAMLConfig(args.config, resume=args.resume) 112 | 113 | if 'HGNetv2' in cfg.yaml_cfg: 114 | cfg.yaml_cfg['HGNetv2']['pretrained'] = False 115 | 116 | if args.resume: 117 | checkpoint = torch.load(args.resume, map_location='cpu') 118 | if 'ema' in checkpoint: 119 | state = checkpoint['ema']['module'] 120 | else: 121 | state = checkpoint['model'] 122 | else: 123 | raise AttributeError('Only support resume to load model.state_dict by now.') 124 | 125 | # Load train mode state and convert to deploy mode 126 | cfg.model.load_state_dict(state) 127 | 128 | class Model(nn.Module): 129 | def __init__(self): 130 | super().__init__() 131 | self.model = cfg.model.deploy() 132 | self.postprocessor = cfg.postprocessor.deploy() 133 | 134 | def forward(self, images, orig_target_sizes): 135 | outputs = self.model(images) 136 | outputs = self.postprocessor(outputs, orig_target_sizes) 137 | return outputs 138 | 139 | device = args.device 140 | model = Model().to(device) 141 | img_size = cfg.yaml_cfg["eval_spatial_size"] 142 | 143 | # Check if the input file is an image or a video 144 | file_path = args.input 145 | if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']: 146 | # Process as image 147 | process_image(model, device, file_path, img_size) 148 | print("Image processing complete.") 149 | else: 150 | # Process as video 151 | process_video(model, device, file_path, img_size) 152 | 153 | 154 | if __name__ == '__main__': 155 | import argparse 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('-c', '--config', type=str, required=True) 158 | parser.add_argument('-r', '--resume', type=str, required=True) 159 | parser.add_argument('-i', '--input', type=str, required=True) 160 | parser.add_argument('-d', '--device', type=str, default='cpu') 161 | args = parser.parse_args() 162 | main(args) 163 | -------------------------------------------------------------------------------- /tools/reference/convert_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import argparse 4 | 5 | def save_only_ema_weights(checkpoint_file): 6 | """Extract and save only the EMA weights.""" 7 | checkpoint = torch.load(checkpoint_file, map_location='cpu') 8 | 9 | weights = {} 10 | if 'ema' in checkpoint: 11 | weights['model'] = checkpoint['ema']['module'] 12 | else: 13 | raise ValueError("The checkpoint does not contain 'ema'.") 14 | 15 | dir_name, base_name = os.path.split(checkpoint_file) 16 | name, ext = os.path.splitext(base_name) 17 | output_file = os.path.join(dir_name, f"{name}_converted{ext}") 18 | 19 | torch.save(weights, output_file) 20 | print(f"EMA weights saved to {output_file}") 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser(description="Extract and save only EMA weights.") 24 | parser.add_argument('checkpoint_dir', type=str, help="Path to the directory containing checkpoint files.") 25 | 26 | args = parser.parse_args() 27 | for file in os.listdir(args.checkpoint_dir): 28 | if '.pth' in file and '_converted' not in file: 29 | save_only_ema_weights(os.path.join(args.checkpoint_dir, file)) 30 | -------------------------------------------------------------------------------- /tools/reference/safe_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to display the menu for selecting model size 4 |
select_model_size() { 5 | echo "Select model size:" 6 | select size in s m l x; do 7 | case $size in 8 | s|m|l|x) 9 | echo "You selected model size: $size" 10 | MODEL_SIZE=$size 11 | break 12 | ;; 13 | *) 14 | echo "Invalid selection. Please try again." 15 | ;; 16 | esac 17 | done 18 | } 19 | 20 | # Function to display the menu for selecting task 21 | select_task() { 22 | echo "Select task:" 23 | select task in obj365 obj2coco coco; do 24 | case $task in 25 | obj365|obj2coco|coco) 26 | echo "You selected task: $task" 27 | TASK=$task 28 | break 29 | ;; 30 | *) 31 | echo "Invalid selection. Please try again." 32 | ;; 33 | esac 34 | done 35 | } 36 | 37 | # Function to ask if the user wants to save logs to a txt file 38 | ask_save_logs() { 39 | while true; do 40 | read -p "Do you want to save logs to a txt file? (y/n): " yn 41 | case $yn in 42 | [Yy]* ) 43 | SAVE_LOGS=true 44 | break 45 | ;; 46 | [Nn]* ) 47 | SAVE_LOGS=false 48 | break 49 | ;; 50 | * ) echo "Please answer yes or no.";; 51 | esac 52 | done 53 | } 54 | 55 | # Call the functions to let the user select 56 | select_model_size 57 | select_task 58 | ask_save_logs 59 | 60 | # Set config file and output directory based on selection 61 | if [ "$TASK" = "coco" ]; then 62 | CONFIG_FILE="configs/dfine/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" 63 | else 64 | CONFIG_FILE="configs/dfine/objects365/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" 65 | fi 66 | 67 | OUTPUT_DIR="output/${MODEL_SIZE}_${TASK}" 68 | 69 | # Construct the training command 70 | TRAIN_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR" 71 | 72 | # Append log redirection if SAVE_LOGS is true 73 | if [ "$SAVE_LOGS" = true ]; then 74 | LOG_FILE="${MODEL_SIZE}_${TASK}.txt" 75 | TRAIN_CMD="$TRAIN_CMD &> \"$LOG_FILE\" 2>&1 &" 76 | else 77 | TRAIN_CMD="$TRAIN_CMD &" 78 | fi 79 | 80 | # Run the training command 81 | eval $TRAIN_CMD 82 | if [ $? -ne 0 ]; then 83 | echo "First training failed, restarting with resume option..." 84 | while true; do 85 | RESUME_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR -r ${OUTPUT_DIR}/last.pth" 86 | if [ "$SAVE_LOGS" = true ]; then 87 | LOG_FILE="${MODEL_SIZE}_${TASK}_2.txt" 88 | RESUME_CMD="$RESUME_CMD &> \"$LOG_FILE\" 2>&1 &" 89 | else 90 | RESUME_CMD="$RESUME_CMD &" 91 | fi 92 | eval $RESUME_CMD 93 | if [ $? -eq 0 ]; then 94 | break 95 | fi 96 | done 97 | fi 98 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIMv2: Real-Time Object Detection Meets DINOv3 3 | Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | DEIM: DETR with Improved Matching for Fast Convergence 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 9 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
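Entry point for training and evaluation; the task solver is selected from the YAML config.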
10 | """ 11 | 12 | import os 13 | import sys 14 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) 15 | 16 | import argparse 17 | 18 | from engine.misc import dist_utils 19 | from engine.core import YAMLConfig, yaml_utils 20 | from engine.solver import TASKS 21 | 22 | debug=False 23 | 24 | if debug: 25 | import torch 26 | def custom_repr(self): 27 | return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' 28 | original_repr = torch.Tensor.__repr__ 29 | torch.Tensor.__repr__ = custom_repr 30 | 31 | def main(args, ) -> None: 32 | """main 33 | """ 34 | dist_utils.setup_distributed(args.print_rank, args.print_method, seed=args.seed) 35 | 36 | assert not all([args.tuning, args.resume]), \ 37 | 'Only support from_scrach or resume or tuning at one time' 38 | 39 | 40 | update_dict = yaml_utils.parse_cli(args.update) 41 | update_dict.update({k: v for k, v in args.__dict__.items() \ 42 | if k not in ['update', ] and v is not None}) 43 | 44 | cfg = YAMLConfig(args.config, **update_dict) 45 | 46 | if args.resume or args.tuning: 47 | if 'HGNetv2' in cfg.yaml_cfg: 48 | cfg.yaml_cfg['HGNetv2']['pretrained'] = False 49 | 50 | print('cfg: ', cfg.__dict__) 51 | 52 | solver = TASKS[cfg.yaml_cfg['task']](cfg) 53 | 54 | if args.test_only: 55 | solver.val() 56 | else: 57 | solver.fit() 58 | 59 | dist_utils.cleanup() 60 | 61 | 62 | if __name__ == '__main__': 63 | 64 | parser = argparse.ArgumentParser() 65 | 66 | # priority 0 67 | parser.add_argument('-c', '--config', type=str, default='') 68 | parser.add_argument('-r', '--resume', type=str, help='resume from checkpoint') 69 | parser.add_argument('-t', '--tuning', type=str, help='tuning from checkpoint') 70 | parser.add_argument('-d', '--device', type=str, help='device',) 71 | parser.add_argument('--seed', type=int, default=0, help='exp reproducibility') 72 | parser.add_argument('--use-amp', action='store_true', help='auto mixed precision training') 73 | parser.add_argument('--output-dir', type=str, help='output directoy') 74 | parser.add_argument('--summary-dir', type=str, help='tensorboard summry') 75 | parser.add_argument('--test-only', action='store_true', default=False,) 76 | 77 | # priority 1 78 | parser.add_argument('-u', '--update', nargs='+', help='update yaml config') 79 | 80 | # env 81 | parser.add_argument('--print-method', type=str, default='builtin', help='print method') 82 | parser.add_argument('--print-rank', type=int, default=0, help='print rank id') 83 | 84 | parser.add_argument('--local-rank', type=int, help='local rank id') 85 | args = parser.parse_args() 86 | 87 | main(args) 88 | --------------------------------------------------------------------------------