├── .gitignore ├── LICENSE ├── README.md ├── configs ├── base │ ├── dataloader.yml │ ├── deim.yml │ ├── deimv2.yml │ ├── dfine_hgnetv2.yml │ ├── optimizer.yml │ ├── rt_deim.yml │ ├── rt_optimizer.yml │ └── rtdetrv2_r50vd.yml ├── dataset │ ├── coco_detection.yml │ ├── crowdhuman_detection.yml │ ├── custom_detection.yml │ ├── obj365_detection.yml │ └── voc_detection.yml ├── deim_dfine │ ├── deim_hgnetv2_l_coco.yml │ ├── deim_hgnetv2_m_coco.yml │ ├── deim_hgnetv2_n_coco.yml │ ├── deim_hgnetv2_s_coco.yml │ ├── deim_hgnetv2_x_coco.yml │ ├── dfine_hgnetv2_l_coco.yml │ ├── dfine_hgnetv2_m_coco.yml │ ├── dfine_hgnetv2_n_coco.yml │ ├── dfine_hgnetv2_s_coco.yml │ ├── dfine_hgnetv2_x_coco.yml │ └── object365 │ │ ├── deim_hgnetv2_x_obj2coco_24e.yml │ │ └── dfine_hgnetv2_x_obj2coco.yml ├── deim_rtdetrv2 │ ├── deim_r101vd_60e_coco.yml │ ├── deim_r18vd_120e_coco.yml │ ├── deim_r34vd_120e_coco.yml │ ├── deim_r50vd_60e_coco.yml │ ├── deim_r50vd_m_60e_coco.yml │ ├── rtdetrv2_r101vd_6x_coco.yml │ ├── rtdetrv2_r18vd_120e_coco.yml │ ├── rtdetrv2_r34vd_120e_coco.yml │ ├── rtdetrv2_r50vd_6x_coco.yml │ └── rtdetrv2_r50vd_m_7x_coco.yml ├── deimv2 │ ├── deimv2_dinov3_l_coco.yml │ ├── deimv2_dinov3_m_coco.yml │ ├── deimv2_dinov3_s_coco.yml │ ├── deimv2_dinov3_x_coco.yml │ ├── deimv2_hgnetv2_atto_coco.yml │ ├── deimv2_hgnetv2_femto_coco.yml │ ├── deimv2_hgnetv2_l_coco.yml │ ├── deimv2_hgnetv2_m_coco.yml │ ├── deimv2_hgnetv2_n_coco.yml │ ├── deimv2_hgnetv2_pico_coco.yml │ ├── deimv2_hgnetv2_s_coco.yml │ └── deimv2_hgnetv2_x_coco.yml └── runtime.yml ├── engine ├── __init__.py ├── backbone │ ├── __init__.py │ ├── common.py │ ├── csp_darknet.py │ ├── csp_resnet.py │ ├── dinov3 │ │ ├── __init__.py │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── dino_head.py │ │ │ ├── ffn_layers.py │ │ │ ├── fp8_linear.py │ │ │ ├── layer_scale.py │ │ │ ├── patch_embed.py │ │ │ ├── rms_norm.py │ │ │ ├── rope_position_encoding.py │ │ │ └── sparse_linear.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── cluster.py │ │ │ ├── custom_callable.py │ │ │ ├── dtype.py │ │ │ └── utils.py │ │ └── vision_transformer.py │ ├── dinov3_adapter.py │ ├── hgnetv2.py │ ├── ms_deform_attn.py │ ├── presnet.py │ ├── test_resnet.py │ ├── timm_model.py │ ├── torchvision_model.py │ ├── utils.py │ └── vit_tiny.py ├── core │ ├── __init__.py │ ├── _config.py │ ├── workspace.py │ ├── yaml_config.py │ └── yaml_utils.py ├── data │ ├── __init__.py │ ├── _misc.py │ ├── dataloader.py │ ├── dataset │ │ ├── __init__.py │ │ ├── _dataset.py │ │ ├── coco_dataset.py │ │ ├── coco_eval.py │ │ ├── coco_utils.py │ │ ├── voc_detection.py │ │ └── voc_eval.py │ └── transforms │ │ ├── __init__.py │ │ ├── _transforms.py │ │ ├── container.py │ │ ├── functional.py │ │ └── mosaic.py ├── deim │ ├── __init__.py │ ├── box_ops.py │ ├── deim.py │ ├── deim_criterion.py │ ├── deim_decoder.py │ ├── deim_utils.py │ ├── denoising.py │ ├── dfine_decoder.py │ ├── dfine_utils.py │ ├── hybrid_encoder.py │ ├── lite_encoder.py │ ├── matcher.py │ ├── postprocessor.py │ ├── rtdetrv2_decoder.py │ └── utils.py ├── misc │ ├── __init__.py │ ├── box_ops.py │ ├── dist_utils.py │ ├── lazy_loader.py │ ├── logger.py │ ├── profiler_utils.py │ └── visualizer.py ├── optim │ ├── __init__.py │ ├── amp.py │ ├── ema.py │ ├── lr_scheduler.py │ ├── optim.py │ └── warmup.py └── solver │ ├── __init__.py │ ├── _solver.py │ ├── clas_engine.py │ ├── clas_solver.py │ ├── det_engine.py │ └── det_solver.py ├── figures ├── deimv2_coco_AP_vs_GFLOPs.png └── deimv2_coco_AP_vs_Params.png ├── 
requirements.txt ├── tools ├── benchmark │ ├── dataset.py │ ├── get_info.py │ ├── requirements.txt │ ├── trt_benchmark.py │ └── utils.py ├── dataset │ ├── remap_obj365.py │ └── resize_obj365.py ├── deployment │ ├── export_onnx.py │ └── export_yolo_w_nms.py ├── inference │ ├── onnx_inf.py │ ├── openvino_inf.py │ ├── requirements.txt │ ├── torch_inf.py │ ├── torch_inf_vis.py │ └── trt_inf.py ├── reference │ ├── convert_weight.py │ └── safe_training.sh └── visualization │ └── fiftyone_vis.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignored Files 2 | outputs/ 3 | ckpts/ 4 | testenv/ 5 | backup 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # PyCharm 137 | .idea 138 | .vscode/ 139 | *.pt 140 | *.pth 141 | *.onnx 142 | *.zip 143 | *.html 144 | .DS_Store 145 | -------------------------------------------------------------------------------- /configs/base/dataloader.yml: -------------------------------------------------------------------------------- 1 | 2 | train_dataloader: 3 | dataset: 4 | transforms: 5 | ops: 6 | - {type: RandomPhotometricDistort, p: 0.5} 7 | - {type: RandomZoomOut, fill: 0} 8 | - {type: RandomIoUCrop, p: 0.8} 9 | - {type: SanitizeBoundingBoxes, min_size: 1} 10 | - {type: RandomHorizontalFlip} 11 | - {type: Resize, size: [640, 640], } 12 | - {type: SanitizeBoundingBoxes, min_size: 1} 13 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 14 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 15 | policy: 16 | name: stop_epoch 17 | epoch: 72 # epoch in [72, ~) stop `ops` 18 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 19 | 20 | collate_fn: 21 | type: BatchImageCollateFunction 22 | base_size: 640 23 | base_size_repeat: 3 24 | stop_epoch: 72 # epoch in [72, ~) stop `multiscales` 25 | 26 | shuffle: True 27 | total_batch_size: 32 # total batch size equals 32 (4 GPUs * 8 per GPU) 28 | num_workers: 4 29 | 30 | 31 | val_dataloader: 32 | dataset: 33 | transforms: 34 | ops: 35 | - {type: Resize, size: [640, 640], } 36 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 37 | shuffle: False 38 | total_batch_size: 64 39 | num_workers: 4 40 | -------------------------------------------------------------------------------- /configs/base/deim.yml: -------------------------------------------------------------------------------- 1 | # Dense O2O 2 | train_dataloader: 3 | dataset: 4 | transforms: 5 | ops: 6 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 7 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 8 | - {type: RandomPhotometricDistort, p: 0.5} 9 | - {type: RandomZoomOut, fill: 0} 10 | - {type: RandomIoUCrop, p: 0.8} 11 | - {type: SanitizeBoundingBoxes, min_size: 1} 12 | - {type: RandomHorizontalFlip} 13 | - {type: Resize, size: [640, 640], } 14 | - {type: SanitizeBoundingBoxes, min_size: 1} 15 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 16 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 17 | policy: 18 | epoch: [4, 29, 50] # list 19 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 20 | mosaic_prob: 0.5 21 | 22 | collate_fn: 23 | mixup_prob: 0.5 24 | mixup_epochs: [4, 29] 25 | stop_epoch: 50 # epoch in [50, ~) stop `multiscales` 26 | 27 | # Unfreezing BN 28 | HGNetv2: 29 | freeze_at: -1 # 0 default 30 | freeze_norm: False # True default 31 | 32 | # Activation 33 | DFINETransformer: 34 | activation: silu 35 | mlp_act: silu 36 | 37 | ## Our LR-Scheduler 38 | lrsheduler: flatcosine 39 | lr_gamma: 0.5 40 | warmup_iter: 2000 41 |
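# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config; semantics
# inferred from the keys and from engine/optim/lr_scheduler.py): the
# `flatcosine` scheduler appears to run in three phases:
#   iter  < warmup_iter  -> linear warmup to the base LR
#   epoch < flat_epoch   -> hold the base LR flat
#   afterwards           -> cosine decay toward base LR * lr_gamma
# with the final `no_aug_epoch` epochs training without the strong
# augmentations listed under `policy.ops`.
# ---------------------------------------------------------------------------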
flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 42 | no_aug_epoch: 8 43 | 44 | ## Our Loss 45 | DEIMCriterion: 46 | weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} 47 | losses: ['mal', 'boxes', 'local'] 48 | gamma: 1.5 -------------------------------------------------------------------------------- /configs/base/deimv2.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | model: DEIM 4 | criterion: DEIMCriterion 5 | postprocessor: PostProcessor 6 | 7 | use_focal_loss: True 8 | eval_spatial_size: [640, 640] # h w 9 | checkpoint_freq: 5 # save freq 10 | 11 | DEIM: 12 | backbone: HGNetv2 13 | encoder: HybridEncoder 14 | decoder: DEIMTransformer 15 | 16 | HGNetv2: 17 | name: 'B4' 18 | return_idx: [1, 2, 3] 19 | freeze_at: -1 # 0 default 20 | freeze_stem_only: True 21 | freeze_norm: False # True default 22 | pretrained: True 23 | local_model_dir: ./weight/hgnetv2/ 24 | 25 | HybridEncoder: 26 | in_channels: [512, 1024, 2048] 27 | feat_strides: [8, 16, 32] 28 | 29 | # intra 30 | hidden_dim: 256 31 | use_encoder_idx: [2] 32 | num_encoder_layers: 1 33 | nhead: 8 34 | dim_feedforward: 1024 35 | dropout: 0. 36 | enc_act: 'gelu' 37 | 38 | # cross 39 | expansion: 1.0 40 | depth_mult: 1 41 | act: 'silu' 42 | 43 | # New 44 | version: deim 45 | csp_type: csp2 46 | fuse_op: sum 47 | 48 | DEIMTransformer: 49 | feat_channels: [256, 256, 256] 50 | feat_strides: [8, 16, 32] 51 | hidden_dim: 256 52 | num_levels: 3 53 | 54 | num_layers: 6 55 | eval_idx: -1 56 | num_queries: 300 57 | 58 | num_denoising: 100 59 | label_noise_ratio: 0.5 60 | box_noise_scale: 1.0 61 | 62 | reg_max: 32 63 | reg_scale: 4 64 | layer_scale: 1 # 2 65 | 66 | num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3] 67 | cross_attn_method: default # default, discrete 68 | query_select_method: default # default, agnostic 69 | 70 | # Act 71 | activation: silu 72 | mlp_act: silu 73 | 74 | # FFN 75 | dim_feedforward: 2048 76 | 77 | PostProcessor: 78 | num_top_queries: 300 79 | 80 | 81 | ## DEIM LR-Scheduler 82 | epoches: 58 # 72 + 2n # Increase to search for the optimal ema 83 | 84 | lrsheduler: flatcosine 85 | lr_gamma: 0.5 86 | warmup_iter: 2000 87 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 88 | no_aug_epoch: 8 89 | 90 | ## Dense O2O: Mosaic + Mixup + CopyBlend 91 | train_dataloader: 92 | dataset: 93 | transforms: 94 | ops: 95 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 96 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 97 | - {type: RandomPhotometricDistort, p: 0.5} 98 | - {type: RandomZoomOut, fill: 0} 99 | - {type: RandomIoUCrop, p: 0.8} 100 | - {type: SanitizeBoundingBoxes, min_size: 1} 101 | - {type: RandomHorizontalFlip} 102 | - {type: Resize, size: [640, 640], } 103 | - {type: SanitizeBoundingBoxes, min_size: 1} 104 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 105 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 106 | # Mosaic options 107 | policy: 108 | epoch: [4, 29, 50] # list 109 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 110 | mosaic_prob: 0.5 111 | 112 | collate_fn: 113 | # Mixup options 114 | mixup_prob: 0.5 115 | mixup_epochs: [4, 29] 116 | stop_epoch: 50 # epoch in [50, ~) stop `multiscales` 117 | # CopyBlend options 118 | copyblend_prob: 0.5 119 | copyblend_epochs: [4, 50] 120 | area_threshold: 100 121 | num_objects: 3 122 | with_expand: True 123 |
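# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): the epoch
# lists above appear to be on/off windows for Dense O2O augmentation:
# Mosaic follows the `policy.epoch` milestones [4, 29, 50], Mixup is applied
# with `mixup_prob` only between `mixup_epochs` [4, 29], and CopyBlend only
# between `copyblend_epochs` [4, 50]; after epoch 50 (and during the final
# `no_aug_epoch` epochs) training runs without these augmentations.
# ---------------------------------------------------------------------------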
expand_ratios: [0.1, 0.25] 124 | 125 | ema_restart_decay: 0.9999 126 | base_size_repeat: 4 127 | 128 | ## DEIM Loss 129 | DEIMCriterion: 130 | weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} 131 | losses: ['mal', 'boxes', 'local'] 132 | gamma: 1.5 133 | alpha: 0.75 134 | reg_max: 32 135 | 136 | matcher: 137 | type: HungarianMatcher 138 | weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} 139 | alpha: 0.25 140 | gamma: 2.0 141 | # change matcher 142 | change_matcher: True 143 | iou_order_alpha: 4.0 144 | matcher_change_epoch: 45 -------------------------------------------------------------------------------- /configs/base/dfine_hgnetv2.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | model: DEIM 4 | criterion: DEIMCriterion 5 | postprocessor: PostProcessor 6 | 7 | use_focal_loss: True 8 | eval_spatial_size: [640, 640] # h w 9 | checkpoint_freq: 4 # save freq 10 | 11 | DEIM: 12 | backbone: HGNetv2 13 | encoder: HybridEncoder 14 | decoder: DFINETransformer 15 | 16 | # Add, default for step lr scheduler 17 | lrsheduler: flatcosine 18 | lr_gamma: 1 19 | warmup_iter: 500 20 | flat_epoch: 4000000 21 | no_aug_epoch: 0 22 | 23 | HGNetv2: 24 | pretrained: True 25 | local_model_dir: ../RT-DETR-main/D-FINE/weight/hgnetv2/ 26 | 27 | HybridEncoder: 28 | in_channels: [512, 1024, 2048] 29 | feat_strides: [8, 16, 32] 30 | 31 | # intra 32 | hidden_dim: 256 33 | use_encoder_idx: [2] 34 | num_encoder_layers: 1 35 | nhead: 8 36 | dim_feedforward: 1024 37 | dropout: 0. 38 | enc_act: 'gelu' 39 | 40 | # cross 41 | expansion: 1.0 42 | depth_mult: 1 43 | act: 'silu' 44 | 45 | 46 | DFINETransformer: 47 | feat_channels: [256, 256, 256] 48 | feat_strides: [8, 16, 32] 49 | hidden_dim: 256 50 | num_levels: 3 51 | 52 | num_layers: 6 53 | eval_idx: -1 54 | num_queries: 300 55 | 56 | num_denoising: 100 57 | label_noise_ratio: 0.5 58 | box_noise_scale: 1.0 59 | 60 | # NEW 61 | reg_max: 32 62 | reg_scale: 4 63 | 64 | # Auxiliary decoder layers dimension scaling 65 | # "eg. If num_layers: 6 eval_idx: -4, 66 | # then layer 3, 4, 5 are auxiliary decoder layers." 67 | layer_scale: 1 # 2 68 | 69 | 70 | num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3] 71 | cross_attn_method: default # default, discrete 72 | query_select_method: default # default, agnostic 73 | 74 | 75 | PostProcessor: 76 | num_top_queries: 300 77 | 78 | 79 | DEIMCriterion: 80 | weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} 81 | losses: ['vfl', 'boxes', 'local'] 82 | alpha: 0.75 83 | gamma: 2.0 84 | reg_max: 32 85 | 86 | matcher: 87 | type: HungarianMatcher 88 | weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} 89 | alpha: 0.25 90 | gamma: 2.0 -------------------------------------------------------------------------------- /configs/base/optimizer.yml: -------------------------------------------------------------------------------- 1 | use_amp: True 2 | use_ema: True 3 | ema: 4 | type: ModelEMA 5 | decay: 0.9999 6 | warmups: 1000 7 | start: 0 8 | 9 | epoches: 72 10 | clip_max_norm: 0.1 11 | 12 | 13 | optimizer: 14 | type: AdamW 15 | params: 16 | - 17 | params: '^(?=.*backbone)(?!.*norm).*$' 18 | lr: 0.0000125 19 | - 20 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 21 | weight_decay: 0. 
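# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): each `params`
# entry above is a regular expression matched against model parameter names
# (resolved in engine/optim). '^(?=.*backbone)(?!.*norm).*$' combines a
# positive lookahead (name contains "backbone") with a negative one (name
# does not contain "norm"), giving the pretrained backbone a 20x smaller LR
# (0.0000125 vs the 0.00025 base LR below).
# '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' selects encoder/decoder
# norm/BN weights and disables weight decay for them. Parameters matching no
# group appear to fall through to the top-level lr/weight_decay. With
# illustrative (hypothetical) parameter names:
#   backbone.stages.0.conv.weight    -> group 1 (lr 0.0000125)
#   decoder.layers.0.norm1.weight    -> group 2 (weight_decay 0)
#   decoder.layers.0.linear1.weight  -> defaults below
# ---------------------------------------------------------------------------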
22 | 23 | lr: 0.00025 24 | betas: [0.9, 0.999] 25 | weight_decay: 0.000125 26 | 27 | 28 | lr_scheduler: 29 | type: MultiStepLR 30 | milestones: [500] 31 | gamma: 0.1 32 | 33 | lr_warmup_scheduler: 34 | type: LinearWarmup 35 | warmup_duration: 500 36 | -------------------------------------------------------------------------------- /configs/base/rt_deim.yml: -------------------------------------------------------------------------------- 1 | # Dense O2O 2 | train_dataloader: 3 | dataset: 4 | transforms: 5 | ops: 6 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 7 | probability: 1.0, fill_value: 0, use_cache: False, max_cached_images: 50, random_pop: True} 8 | - {type: RandomPhotometricDistort, p: 0.5} 9 | - {type: RandomZoomOut, fill: 0} 10 | - {type: RandomIoUCrop, p: 0.8} 11 | - {type: SanitizeBoundingBoxes, min_size: 1} 12 | - {type: RandomHorizontalFlip} 13 | - {type: Resize, size: [640, 640], } 14 | - {type: SanitizeBoundingBoxes, min_size: 1} 15 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 16 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 17 | policy: 18 | epoch: [4, 29, 50] # list 19 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 20 | mosaic_prob: 0.5 21 | 22 | collate_fn: 23 | mixup_prob: 0.5 24 | mixup_epochs: [4, 29] 25 | stop_epoch: 50 # epoch in [50, ~) stop `multiscales` 26 | 27 | # Unfreezing BN 28 | PResNet: 29 | freeze_at: -1 # default 0 30 | freeze_norm: False # default True 31 | 32 | # Activation 33 | RTDETRTransformerv2: 34 | query_pos_method: as_reg 35 | activation: silu 36 | mlp_act: silu 37 | 38 | ## Our LR-Scheduler 39 | lrsheduler: flatcosine 40 | lr_gamma: 0.5 41 | warmup_iter: 2000 42 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 43 | no_aug_epoch: 8 44 | 45 | ## Our Loss 46 | DEIMCriterion: 47 | weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2} 48 | losses: ['mal', 'boxes', ] 49 | gamma: 1.5 -------------------------------------------------------------------------------- /configs/base/rt_optimizer.yml: -------------------------------------------------------------------------------- 1 | use_amp: True 2 | use_ema: True 3 | ema: 4 | type: ModelEMA 5 | decay: 0.9999 6 | warmups: 2000 7 | start: 0 8 | 9 | epoches: 72 10 | clip_max_norm: 0.1 11 | 12 | train_dataloader: 13 | total_batch_size: 16 14 | 15 | optimizer: 16 | type: AdamW 17 | params: 18 | - 19 | params: '^(?=.*backbone)(?!.*norm).*$' 20 | lr: 0.00001 21 | - 22 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 23 | weight_decay: 0.
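# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): the `ema`
# block at the top of this file maintains an exponential moving average of
# the weights (`type: ModelEMA`), typically the copy used for evaluation and
# released checkpoints. With `decay: 0.9999` and `warmups: 2000`, the
# effective decay appears to ramp up over the first ~2000 updates (a common
# schedule is decay * (1 - exp(-updates / warmups))), so the average tracks
# the fast-moving early weights more closely before settling.
# ---------------------------------------------------------------------------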
24 | 25 | lr: 0.0001 26 | betas: [0.9, 0.999] 27 | weight_decay: 0.0001 28 | 29 | lr_scheduler: 30 | type: MultiStepLR 31 | milestones: [1000] 32 | gamma: 0.1 33 | 34 | 35 | lr_warmup_scheduler: 36 | type: LinearWarmup 37 | warmup_duration: 2000 38 | -------------------------------------------------------------------------------- /configs/base/rtdetrv2_r50vd.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | model: DEIM 4 | criterion: DEIMCriterion 5 | postprocessor: PostProcessor 6 | 7 | use_focal_loss: True 8 | eval_spatial_size: [640, 640] # h w 9 | checkpoint_freq: 4 # save freq 10 | 11 | DEIM: 12 | backbone: PResNet 13 | encoder: HybridEncoder 14 | decoder: RTDETRTransformerv2 15 | 16 | 17 | # Add, default for step lr scheduler 18 | lrsheduler: flatcosine 19 | lr_gamma: 1 20 | warmup_iter: 2000 21 | flat_epoch: 4000000 22 | no_aug_epoch: 0 23 | 24 | PResNet: 25 | depth: 50 26 | variant: d 27 | freeze_at: 0 28 | return_idx: [1, 2, 3] 29 | num_stages: 4 30 | freeze_norm: True 31 | pretrained: True 32 | local_model_dir: ../RT-DETR-main/rtdetrv2_pytorch/INK1k/ 33 | 34 | 35 | HybridEncoder: 36 | in_channels: [512, 1024, 2048] 37 | feat_strides: [8, 16, 32] 38 | 39 | # intra 40 | hidden_dim: 256 41 | use_encoder_idx: [2] 42 | num_encoder_layers: 1 43 | nhead: 8 44 | dim_feedforward: 1024 45 | dropout: 0. 46 | enc_act: 'gelu' 47 | 48 | # cross 49 | expansion: 1.0 50 | depth_mult: 1 51 | act: 'silu' 52 | version: rt_detrv2 # pay attention to this 53 | 54 | 55 | RTDETRTransformerv2: 56 | feat_channels: [256, 256, 256] 57 | feat_strides: [8, 16, 32] 58 | hidden_dim: 256 59 | num_levels: 3 60 | 61 | num_layers: 6 62 | num_queries: 300 63 | 64 | num_denoising: 100 65 | label_noise_ratio: 0.5 66 | box_noise_scale: 1.0 # 1.0 0.4 67 | 68 | eval_idx: -1 69 | 70 | # NEW, can be chosen 71 | num_points: [4, 4, 4] # [3,3,3] [2,2,2] 72 | cross_attn_method: default # default, discrete 73 | query_select_method: default # default, agnostic 74 | 75 | 76 | PostProcessor: 77 | num_top_queries: 300 78 | 79 | DEIMCriterion: 80 | weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,} 81 | losses: ['vfl', 'boxes', ] 82 | alpha: 0.75 83 | gamma: 2.0 84 | use_uni_set: False 85 | 86 | matcher: 87 | type: HungarianMatcher 88 | weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} 89 | alpha: 0.25 90 | gamma: 2.0 -------------------------------------------------------------------------------- /configs/dataset/coco_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 80 8 | remap_mscoco_category: True 9 | 10 | train_dataloader: 11 | type: DataLoader 12 | dataset: 13 | type: CocoDetection 14 | img_folder: /datassd/COCO/train2017/ 15 | ann_file: /datassd/COCO/annotations/instances_train2017.json 16 | return_masks: False 17 | transforms: 18 | type: Compose 19 | ops: ~ 20 | shuffle: True 21 | num_workers: 4 22 | drop_last: True 23 | collate_fn: 24 | type: BatchImageCollateFunction 25 | 26 | 27 | val_dataloader: 28 | type: DataLoader 29 | dataset: 30 | type: CocoDetection 31 | img_folder: /datassd/COCO/val2017/ 32 | ann_file: /datassd/COCO/annotations/instances_val2017.json 33 | return_masks: False 34 | transforms: 35 | type: Compose 36 | ops: ~ 37 | shuffle: False 38 | num_workers: 4 39 | drop_last: False 40 | collate_fn: 41 | type: BatchImageCollateFunction 
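NOTE (editor's annotation, not part of the original config): `remap_mscoco_category: True` is needed because COCO's instances_*.json files use 91 non-contiguous category ids (1-90 with gaps; id 12, for instance, is unused), while the head predicts `num_classes: 80` contiguous labels. A minimal sketch of the kind of mapping involved, with a toy id set; the repo's actual tables live under engine/data/dataset/ (coco_dataset.py, coco_utils.py):

```python
# Toy subset of COCO category ids; the real list has 80 ids scattered
# over 1..90, with id 12 among the missing ones.
raw_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13]

# Contiguous training labels 0..N-1, plus the inverse map needed at
# evaluation time, when predictions are reported in original COCO ids.
category2label = {cid: i for i, cid in enumerate(sorted(raw_ids))}
label2category = {lab: cid for cid, lab in category2label.items()}

assert category2label[13] == 11   # raw id 13 -> contiguous label 11
assert label2category[11] == 13   # and back again for COCO evaluation
```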
-------------------------------------------------------------------------------- /configs/dataset/crowdhuman_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 2 # your dataset classes 8 | remap_mscoco_category: False 9 | 10 | train_dataloader: 11 | type: DataLoader 12 | dataset: 13 | type: CocoDetection 14 | img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_train 15 | ann_file: /datassd/coco/crowd_human_coco/Chuman-train.json 16 | return_masks: False 17 | transforms: 18 | type: Compose 19 | ops: ~ 20 | shuffle: True 21 | num_workers: 4 22 | drop_last: True 23 | collate_fn: 24 | type: BatchImageCollateFunction 25 | 26 | 27 | val_dataloader: 28 | type: DataLoader 29 | dataset: 30 | type: CocoDetection 31 | img_folder: /datassd/coco/crowd_human_coco/CrowdHuman_val 32 | ann_file: /datassd/coco/crowd_human_coco/Chuman-val.json 33 | return_masks: False 34 | transforms: 35 | type: Compose 36 | ops: ~ 37 | shuffle: False 38 | num_workers: 4 39 | drop_last: False 40 | collate_fn: 41 | type: BatchImageCollateFunction 42 | -------------------------------------------------------------------------------- /configs/dataset/custom_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 777 # your dataset classes 8 | remap_mscoco_category: False 9 | 10 | train_dataloader: 11 | type: DataLoader 12 | dataset: 13 | type: CocoDetection 14 | img_folder: /data/yourdataset/train 15 | ann_file: /data/yourdataset/train/train.json 16 | return_masks: False 17 | transforms: 18 | type: Compose 19 | ops: ~ 20 | shuffle: True 21 | num_workers: 4 22 | drop_last: True 23 | collate_fn: 24 | type: BatchImageCollateFunction 25 | 26 | 27 | val_dataloader: 28 | type: DataLoader 29 | dataset: 30 | type: CocoDetection 31 | img_folder: /data/yourdataset/val 32 | ann_file: /data/yourdataset/val/val.json 33 | return_masks: False 34 | transforms: 35 | type: Compose 36 | ops: ~ 37 | shuffle: False 38 | num_workers: 4 39 | drop_last: False 40 | collate_fn: 41 | type: BatchImageCollateFunction 42 | -------------------------------------------------------------------------------- /configs/dataset/obj365_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 366 8 | remap_mscoco_category: False 9 | 10 | train_dataloader: 11 | type: DataLoader 12 | dataset: 13 | type: CocoDetection 14 | img_folder: /home/Dataset/objects365/train 15 | ann_file: /home/Dataset/objects365/train/new_zhiyuan_objv2_train_resized640.json 16 | return_masks: False 17 | transforms: 18 | type: Compose 19 | ops: ~ 20 | shuffle: True 21 | num_workers: 4 22 | drop_last: True 23 | collate_fn: 24 | type: BatchImageCollateFunction 25 | 26 | 27 | val_dataloader: 28 | type: DataLoader 29 | dataset: 30 | type: CocoDetection 31 | img_folder: /home/Dataset/objects365/val 32 | ann_file: /home/Dataset/objects365/val/new_zhiyuan_objv2_val_resized640.json 33 | return_masks: False 34 | transforms: 35 | type: Compose 36 | ops: ~ 37 | shuffle: False 38 | num_workers: 4 39 | drop_last: False 40 | collate_fn: 41 | type: BatchImageCollateFunction 42 | -------------------------------------------------------------------------------- 
/configs/dataset/voc_detection.yml: -------------------------------------------------------------------------------- 1 | task: detection 2 | 3 | evaluator: 4 | type: CocoEvaluator 5 | iou_types: ['bbox', ] 6 | 7 | num_classes: 20 8 | 9 | train_dataloader: 10 | type: DataLoader 11 | dataset: 12 | type: VOCDetection 13 | root: ./dataset/voc/ 14 | ann_file: trainval.txt 15 | label_file: label_list.txt 16 | transforms: 17 | type: Compose 18 | ops: ~ 19 | shuffle: True 20 | num_workers: 4 21 | drop_last: True 22 | collate_fn: 23 | type: BatchImageCollateFunction 24 | 25 | 26 | val_dataloader: 27 | type: DataLoader 28 | dataset: 29 | type: VOCDetection 30 | root: ./dataset/voc/ 31 | ann_file: test.txt 32 | label_file: label_list.txt 33 | transforms: 34 | type: Compose 35 | ops: ~ 36 | shuffle: False 37 | num_workers: 4 38 | drop_last: False 39 | collate_fn: 40 | type: BatchImageCollateFunction 41 | -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_l_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_l_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./outputs/deim_hgnetv2_l_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 13 | lr: 0.000025 14 | - 15 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 17 | 18 | lr: 0.0005 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.000125 21 | 22 | # Increase to search for the optimal ema 23 | epoches: 58 # 72 + 2n 24 | 25 | ## Our LR-Scheduler 26 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 27 | no_aug_epoch: 8 28 | 29 | train_dataloader: 30 | dataset: 31 | transforms: 32 | policy: 33 | epoch: [4, 29, 50] # list 34 | 35 | collate_fn: 36 | mixup_epochs: [4, 29] 37 | stop_epoch: 50 -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_m_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_m_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./outputs/deim_hgnetv2_m_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*bn).*$' 13 | lr: 0.00004 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 17 | 18 | lr: 0.0004 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | 23 | # Increase to search for the optimal ema 24 | epoches: 102 # 120 + 4n 25 | 26 | ## Our LR-Scheduler 27 | flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 28 | no_aug_epoch: 12 29 | 30 | ## Our DataAug 31 | train_dataloader: 32 | dataset: 33 | transforms: 34 | policy: 35 | epoch: [4, 49, 90] # list 36 | 37 | collate_fn: 38 | mixup_epochs: [4, 49] 39 | stop_epoch: 90 -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_n_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_n_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./deim_outputs/deim_hgnetv2_n_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 13 | lr: 0.0004 14 | - 15 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 16 | lr: 0.0004 17 | weight_decay: 0. 18 | - 19 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 20 | weight_decay: 0. 
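# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): the backbone
# LR scales sharply with model size across these DEIM recipes: this N config
# trains the backbone at 0.0004 (half of its 0.0008 base LR below), the M
# config uses 0.00004 against a 0.0004 base LR, and the L and X configs drop
# to 0.000025 and 0.000005 against 0.0005, preserving the large pretrained
# backbones while letting the tiny ones adapt freely.
# ---------------------------------------------------------------------------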
21 | 22 | lr: 0.0008 23 | betas: [0.9, 0.999] 24 | weight_decay: 0.0001 25 | 26 | # Increase to search for the optimal ema 27 | epoches: 160 # 148 + 12 28 | 29 | ## Our LR-Scheduler 30 | flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 31 | no_aug_epoch: 12 32 | lr_gamma: 1.0 33 | 34 | ## Our DataAug 35 | train_dataloader: 36 | dataset: 37 | transforms: 38 | policy: 39 | epoch: [4, 78, 148] # list 40 | 41 | collate_fn: 42 | mixup_epochs: [4, 78] 43 | stop_epoch: 148 44 | base_size_repeat: ~ -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_s_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_s_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./outputs/deim_hgnetv2_s_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*bn).*$' 13 | lr: 0.0002 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' # except bias 16 | weight_decay: 0. 17 | 18 | lr: 0.0004 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | 23 | # Increase to search for the optimal ema 24 | epoches: 132 # 120 + 4n 25 | 26 | ## Our LR-Scheduler 27 | flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 28 | no_aug_epoch: 12 29 | 30 | ## Our DataAug 31 | train_dataloader: 32 | dataset: 33 | transforms: 34 | policy: 35 | epoch: [4, 64, 120] # list 36 | 37 | collate_fn: 38 | mixup_epochs: [4, 64] 39 | stop_epoch: 120 -------------------------------------------------------------------------------- /configs/deim_dfine/deim_hgnetv2_x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_x_coco.yml', 3 | '../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./outputs/deim_hgnetv2_x_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 13 | lr: 0.000005 14 | - 15 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 17 | 18 | lr: 0.0005 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.000125 21 | 22 | # Increase to search for the optimal ema 23 | epoches: 58 # 72 + 2n 24 | 25 | ## Our LR-Scheduler 26 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 27 | no_aug_epoch: 8 28 | 29 | train_dataloader: 30 | dataset: 31 | transforms: 32 | policy: 33 | epoch: [4, 29, 50] # list 34 | 35 | collate_fn: 36 | mixup_epochs: [4, 29] 37 | stop_epoch: 50 -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_l_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/dfine_hgnetv2_l_coco 10 | 11 | 12 | HGNetv2: 13 | name: 'B4' 14 | return_idx: [1, 2, 3] 15 | freeze_stem_only: True 16 | freeze_at: 0 17 | freeze_norm: True 18 | 19 | optimizer: 20 | type: AdamW 21 | params: 22 | - 23 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 24 | lr: 0.0000125 25 | - 26 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 27 | weight_decay: 0. 
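# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): every file in
# configs/deim_dfine/ is composed via `__include__`: the listed files appear
# to be loaded in order and deep-merged, after which the including file's
# own keys override the result. For example:
#   __include__: [
#     './dfine_hgnetv2_l_coco.yml',   # base D-FINE-L recipe, loaded first
#     '../base/deim.yml',             # DEIM training recipe merged on top
#   ]
# leaves deim_hgnetv2_l_coco.yml overriding only the optimizer, the epoch
# counts, and the augmentation schedule. The config loader lives in
# engine/core/ (yaml_config.py, yaml_utils.py).
# ---------------------------------------------------------------------------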
28 | 29 | lr: 0.00025 30 | betas: [0.9, 0.999] 31 | weight_decay: 0.000125 32 | 33 | 34 | # Increase to search for the optimal ema 35 | epoches: 80 # 72 + 2n 36 | train_dataloader: 37 | dataset: 38 | transforms: 39 | policy: 40 | epoch: 72 41 | collate_fn: 42 | stop_epoch: 72 43 | ema_restart_decay: 0.9999 44 | base_size_repeat: 4 45 | -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_m_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./output/dfine_hgnetv2_m_coco 10 | 11 | 12 | DEIM: 13 | backbone: HGNetv2 14 | 15 | HGNetv2: 16 | name: 'B2' 17 | return_idx: [1, 2, 3] 18 | freeze_at: -1 19 | freeze_norm: False 20 | use_lab: True 21 | 22 | DFINETransformer: 23 | num_layers: 4 # 5 6 24 | eval_idx: -1 # -2 -3 25 | 26 | HybridEncoder: 27 | in_channels: [384, 768, 1536] 28 | hidden_dim: 256 29 | depth_mult: 0.67 30 | 31 | optimizer: 32 | type: AdamW 33 | params: 34 | - 35 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 36 | lr: 0.00002 37 | - 38 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 39 | lr: 0.00002 40 | weight_decay: 0. 41 | - 42 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 43 | weight_decay: 0. 44 | 45 | lr: 0.0002 46 | betas: [0.9, 0.999] 47 | weight_decay: 0.0001 48 | 49 | 50 | # Increase to search for the optimal ema 51 | epoches: 132 # 120 + 4n 52 | train_dataloader: 53 | dataset: 54 | transforms: 55 | policy: 56 | epoch: 120 57 | collate_fn: 58 | stop_epoch: 120 59 | ema_restart_decay: 0.9999 60 | base_size_repeat: 6 61 | -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_n_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./output/dfine_hgnetv2_n_coco 10 | 11 | 12 | DEIM: 13 | backbone: HGNetv2 14 | 15 | HGNetv2: 16 | name: 'B0' 17 | return_idx: [2, 3] 18 | freeze_at: -1 19 | freeze_norm: False 20 | use_lab: True 21 | 22 | 23 | HybridEncoder: 24 | in_channels: [512, 1024] 25 | feat_strides: [16, 32] 26 | 27 | # intra 28 | hidden_dim: 128 29 | use_encoder_idx: [1] 30 | dim_feedforward: 512 31 | 32 | # cross 33 | expansion: 0.34 34 | depth_mult: 0.5 35 | 36 | 37 | DFINETransformer: 38 | feat_channels: [128, 128] 39 | feat_strides: [16, 32] 40 | hidden_dim: 128 41 | dim_feedforward: 512 42 | num_levels: 2 43 | 44 | num_layers: 3 45 | eval_idx: -1 46 | 47 | num_points: [6, 6] 48 | 49 | optimizer: 50 | type: AdamW 51 | params: 52 | - 53 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 54 | lr: 0.0004 55 | - 56 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 57 | lr: 0.0004 58 | weight_decay: 0. 59 | - 60 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 61 | weight_decay: 0. 
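# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): this N
# variant trims the feature pyramid to two levels: the backbone returns only
# strides 16/32 (`return_idx: [2, 3]`), encoder and decoder run with
# `num_levels: 2` and `hidden_dim: 128`, and multi-scale batching is turned
# off (`base_size_repeat: ~`, i.e. YAML null, further below). Batch sizes
# are raised to 128 (train) and 256 (val), which the tiny model's memory
# footprint permits.
# ---------------------------------------------------------------------------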
62 | 63 | lr: 0.0008 64 | betas: [0.9, 0.999] 65 | weight_decay: 0.0001 66 | 67 | 68 | # Increase to search for the optimal ema 69 | epoches: 160 # 148 + 4n 70 | train_dataloader: 71 | total_batch_size: 128 72 | dataset: 73 | transforms: 74 | policy: 75 | epoch: 148 76 | collate_fn: 77 | stop_epoch: 148 78 | ema_restart_decay: 0.9999 79 | base_size_repeat: ~ 80 | 81 | val_dataloader: 82 | total_batch_size: 256 83 | -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_s_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./output/dfine_hgnetv2_s_coco 10 | 11 | 12 | DEIM: 13 | backbone: HGNetv2 14 | 15 | HGNetv2: 16 | name: 'B0' 17 | return_idx: [1, 2, 3] 18 | freeze_at: -1 19 | freeze_norm: False 20 | use_lab: True 21 | 22 | DFINETransformer: 23 | num_layers: 3 # 4 5 6 24 | eval_idx: -1 # -2 -3 -4 25 | 26 | HybridEncoder: 27 | in_channels: [256, 512, 1024] 28 | hidden_dim: 256 29 | depth_mult: 0.34 30 | expansion: 0.5 31 | 32 | optimizer: 33 | type: AdamW 34 | params: 35 | - 36 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 37 | lr: 0.0001 38 | - 39 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 40 | lr: 0.0001 41 | weight_decay: 0. 42 | - 43 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 44 | weight_decay: 0. 45 | 46 | lr: 0.0002 47 | betas: [0.9, 0.999] 48 | weight_decay: 0.0001 49 | 50 | 51 | # Increase to search for the optimal ema 52 | epoches: 132 # 120 + 4n 53 | train_dataloader: 54 | dataset: 55 | transforms: 56 | policy: 57 | epoch: 120 58 | collate_fn: 59 | stop_epoch: 120 60 | ema_restart_decay: 0.9999 61 | base_size_repeat: 20 62 | -------------------------------------------------------------------------------- /configs/deim_dfine/dfine_hgnetv2_x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./output/dfine_hgnetv2_x_coco 10 | 11 | 12 | DEIM: 13 | backbone: HGNetv2 14 | 15 | HGNetv2: 16 | name: 'B5' 17 | return_idx: [1, 2, 3] 18 | freeze_stem_only: True 19 | freeze_at: 0 20 | freeze_norm: True 21 | 22 | HybridEncoder: 23 | # intra 24 | hidden_dim: 384 25 | dim_feedforward: 2048 26 | 27 | DFINETransformer: 28 | feat_channels: [384, 384, 384] 29 | reg_scale: 8 30 | 31 | optimizer: 32 | type: AdamW 33 | params: 34 | - 35 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 36 | lr: 0.0000025 37 | - 38 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 39 | weight_decay: 0. 
40 | 41 | lr: 0.00025 42 | betas: [0.9, 0.999] 43 | weight_decay: 0.000125 44 | 45 | 46 | # Increase to search for the optimal ema 47 | epoches: 80 # 72 + 2n 48 | train_dataloader: 49 | dataset: 50 | transforms: 51 | policy: 52 | epoch: 72 53 | collate_fn: 54 | stop_epoch: 72 55 | ema_restart_decay: 0.9998 56 | base_size_repeat: 3 57 | -------------------------------------------------------------------------------- /configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './dfine_hgnetv2_x_obj2coco.yml', 3 | '../../base/deim.yml' 4 | ] 5 | 6 | output_dir: ./deim_outputs/deim_hgnetv2_x_obj2coco_24e 7 | 8 | HGNetv2: 9 | freeze_at: 0 # 0 default 10 | freeze_norm: True # True default 11 | 12 | # Activation 13 | DFINETransformer: 14 | activation: relu 15 | mlp_act: relu 16 | 17 | optimizer: 18 | type: AdamW 19 | params: 20 | - 21 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 22 | lr: 0.0000025 23 | - 24 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 25 | weight_decay: 0. 26 | 27 | lr: 0.00025 28 | betas: [0.9, 0.999] 29 | weight_decay: 0.000125 30 | 31 | # Increase to search for the optimal ema 32 | epoches: 24 # 72 + 2n 33 | 34 | ## Our LR-Scheduler 35 | lrsheduler: flatcosine 36 | lr_gamma: 1 37 | warmup_iter: 0 # 0 38 | flat_epoch: 12000 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 39 | no_aug_epoch: 4 40 | 41 | ## Our DataAug 42 | train_dataloader: 43 | dataset: 44 | transforms: 45 | policy: 46 | epoch: [2, 12, 20] # list 47 | 48 | collate_fn: 49 | mixup_epochs: [2, 12] 50 | stop_epoch: 20 -------------------------------------------------------------------------------- /configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../../dataset/coco_detection.yml', 3 | '../../runtime.yml', 4 | '../../base/dataloader.yml', 5 | '../../base/optimizer.yml', 6 | '../../base/dfine_hgnetv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/dfine_hgnetv2_x_obj2coco 10 | 11 | HGNetv2: 12 | name: 'B5' 13 | return_idx: [1, 2, 3] 14 | freeze_stem_only: True 15 | freeze_at: 0 16 | freeze_norm: True 17 | 18 | HybridEncoder: 19 | # intra 20 | hidden_dim: 384 21 | dim_feedforward: 2048 22 | 23 | DFINETransformer: 24 | feat_channels: [384, 384, 384] 25 | reg_scale: 8 26 | 27 | optimizer: 28 | type: AdamW 29 | params: 30 | - 31 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 32 | lr: 0.0000025 33 | - 34 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 35 | weight_decay: 0. 36 | 37 | lr: 0.00025 38 | betas: [0.9, 0.999] 39 | weight_decay: 0.000125 40 | 41 | 42 | epoches: 36 # Early stop 43 | train_dataloader: 44 | dataset: 45 | transforms: 46 | policy: 47 | epoch: 30 48 | collate_fn: 49 | stop_epoch: 30 50 | ema_restart_decay: 0.9999 51 | base_size_repeat: 3 52 | 53 | ema: 54 | warmups: 0 55 | 56 | lr_warmup_scheduler: 57 | warmup_duration: 0 58 | -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r101vd_6x_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./outputs/deim_rtdetrv2_r101vd_60e_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm).*$' 13 | lr: 0.000002 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 
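# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): the
# object365/ recipes above are fine-tuning configs; `obj2coco` appears to
# mean starting from an Objects365-pretrained checkpoint and adapting it to
# COCO for 24-36 epochs. That is why they zero out every warmup
# (`warmup_iter: 0`, ema `warmups: 0`, `warmup_duration: 0`) and keep the
# backbone LR at a very small 0.0000025.
# ---------------------------------------------------------------------------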
17 | 18 | lr: 0.0002 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | 23 | # change part 24 | epoches: 60 25 | flat_epoch: 34 # 4 + 60 / 2 26 | no_aug_epoch: 2 27 | 28 | train_dataloader: 29 | dataset: 30 | transforms: 31 | policy: 32 | epoch: [4, 34, 58] # list 33 | 34 | collate_fn: 35 | mixup_epochs: [4, 34] 36 | stop_epoch: 58 37 | -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r18vd_120e_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./output/deim_rtdetrv2_r18vd_120e_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*(?:norm|bn)).*$' 13 | weight_decay: 0. 14 | 15 | lr: 0.0002 16 | betas: [0.9, 0.999] 17 | weight_decay: 0.0001 18 | 19 | # change part 20 | epoches: 120 21 | flat_epoch: 64 # 4 + 120 / 2 22 | no_aug_epoch: 3 23 | 24 | train_dataloader: 25 | dataset: 26 | transforms: 27 | policy: 28 | epoch: [4, 64, 117] # list 29 | 30 | collate_fn: 31 | mixup_epochs: [4, 64] 32 | stop_epoch: 117 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r34vd_120e_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./outputs/deim_rtdetrv2_r34vd_120e_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm).*$' 13 | lr: 0.0001 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 17 | 18 | lr: 0.0002 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | 23 | # change part 24 | epoches: 120 25 | flat_epoch: 64 26 | no_aug_epoch: 3 27 | 28 | train_dataloader: 29 | dataset: 30 | transforms: 31 | policy: 32 | epoch: [4, 64, 117] # list 33 | 34 | collate_fn: 35 | mixup_epochs: [4, 64] 36 | stop_epoch: 117 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r50vd_6x_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./outputs/deim_rtdetrv2_r50vd_60e_coco 7 | 8 | optimizer: 9 | type: AdamW 10 | params: 11 | - 12 | params: '^(?=.*backbone)(?!.*norm).*$' 13 | lr: 0.00002 14 | - 15 | params: '^(?=.*(?:norm|bn)).*$' 16 | weight_decay: 0. 
17 | 18 | lr: 0.0002 19 | betas: [0.9, 0.999] 20 | weight_decay: 0.0001 21 | 22 | # change part 23 | epoches: 60 24 | flat_epoch: 34 # 4 + 60 / 2 25 | no_aug_epoch: 2 26 | 27 | train_dataloader: 28 | dataset: 29 | transforms: 30 | policy: 31 | epoch: [4, 34, 58] # list 32 | 33 | collate_fn: 34 | mixup_epochs: [4, 34] 35 | stop_epoch: 58 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | './rtdetrv2_r50vd_m_7x_coco.yml', 3 | '../base/rt_deim.yml', 4 | ] 5 | 6 | output_dir: ./outputs/deim_rtdetrv2_r50vd_m_60e_coco 7 | 8 | RTDETRTransformerv2: 9 | eval_idx: 2 # use the 3rd decoder layer for eval 10 | num_layers: 3 11 | 12 | optimizer: 13 | type: AdamW 14 | params: 15 | - 16 | params: '^(?=.*backbone)(?!.*norm).*$' 17 | lr: 0.00002 18 | - 19 | params: '^(?=.*(?:norm|bn)).*$' 20 | weight_decay: 0. 21 | 22 | lr: 0.0002 23 | betas: [0.9, 0.999] 24 | weight_decay: 0.0001 25 | 26 | # change part 27 | epoches: 60 28 | flat_epoch: 34 # 4 + 60 / 2 29 | no_aug_epoch: 2 30 | 31 | train_dataloader: 32 | dataset: 33 | transforms: 34 | policy: 35 | epoch: [4, 34, 58] # list 36 | 37 | collate_fn: 38 | mixup_epochs: [4, 34] 39 | stop_epoch: 58 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/rtdetrv2_r101vd_6x_coco 11 | 12 | 13 | PResNet: 14 | depth: 101 15 | 16 | 17 | HybridEncoder: 18 | # intra 19 | hidden_dim: 384 20 | dim_feedforward: 2048 21 | 22 | 23 | RTDETRTransformerv2: 24 | feat_channels: [384, 384, 384] 25 | 26 | 27 | optimizer: 28 | type: AdamW 29 | params: 30 | - 31 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 32 | lr: 0.000001 33 | - 34 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # only encoder + decoder norm 35 | weight_decay: 0. 36 | 37 | lr: 0.0001 38 | betas: [0.9, 0.999] 39 | weight_decay: 0.0001 40 | 41 | -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | 10 | output_dir: ./output/rtdetrv2_r18vd_120e_coco 11 | 12 | 13 | PResNet: 14 | depth: 18 15 | freeze_at: -1 16 | freeze_norm: False 17 | pretrained: True 18 | 19 | HybridEncoder: 20 | in_channels: [128, 256, 512] 21 | hidden_dim: 256 22 | expansion: 0.5 23 | 24 | RTDETRTransformerv2: 25 | num_layers: 3 26 | 27 | 28 | epoches: 120 29 | 30 | optimizer: 31 | type: AdamW 32 | params: 33 | - 34 | params: '^(?=.*(?:norm|bn)).*$' 35 | weight_decay: 0.
36 | 37 | 38 | train_dataloader: 39 | dataset: 40 | transforms: 41 | policy: 42 | epoch: 117 43 | collate_fn: 44 | scales: ~ -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/rtdetrv2_r34vd_120e_coco 11 | 12 | 13 | PResNet: 14 | depth: 34 15 | freeze_at: -1 16 | freeze_norm: False 17 | pretrained: True 18 | 19 | 20 | HybridEncoder: 21 | in_channels: [128, 256, 512] 22 | hidden_dim: 256 23 | expansion: 0.5 24 | 25 | 26 | RTDETRTransformerv2: 27 | num_layers: 4 28 | 29 | 30 | epoches: 120 31 | 32 | optimizer: 33 | type: AdamW 34 | params: 35 | - 36 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 37 | lr: 0.00005 38 | - 39 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 40 | lr: 0.00005 41 | weight_decay: 0. 42 | - 43 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 44 | weight_decay: 0. 45 | 46 | lr: 0.0001 47 | betas: [0.9, 0.999] 48 | weight_decay: 0.0001 49 | 50 | 51 | train_dataloader: 52 | dataset: 53 | transforms: 54 | policy: 55 | epoch: 117 56 | collate_fn: 57 | stop_epoch: 117 58 | -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/rtdetrv2_r50vd_6x_coco 11 | 12 | 13 | optimizer: 14 | type: AdamW 15 | params: 16 | - 17 | params: '^(?=.*backbone)(?!.*norm).*$' 18 | lr: 0.00001 19 | - 20 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 21 | weight_decay: 0. 22 | 23 | lr: 0.0001 24 | betas: [0.9, 0.999] 25 | weight_decay: 0.0001 -------------------------------------------------------------------------------- /configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/rt_optimizer.yml', 6 | '../base/rtdetrv2_r50vd.yml', 7 | ] 8 | 9 | output_dir: ./outputs/rtdetrv2_r50vd_m_6x_coco 10 | 11 | 12 | HybridEncoder: 13 | expansion: 0.5 14 | 15 | 16 | RTDETRTransformerv2: 17 | eval_idx: 2 # use the 3rd decoder layer for eval 18 | 19 | 20 | epoches: 84 21 | 22 | optimizer: 23 | type: AdamW 24 | params: 25 | - 26 | params: '^(?=.*backbone)(?!.*norm).*$' 27 | lr: 0.00001 28 | - 29 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 30 | weight_decay: 0.
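# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): `eval_idx`
# selects which decoder layer's predictions are used at inference.
# `eval_idx: 2` in this file reads the 3rd of the 6 decoder layers, so the
# deeper layers serve only as auxiliary supervision during training and can
# be dropped at deploy time for the faster "m" model; `eval_idx: -1`
# (used elsewhere) takes the last layer.
# ---------------------------------------------------------------------------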
31 | 32 | lr: 0.0001 33 | betas: [0.9, 0.999] 34 | weight_decay: 0.0001 35 | 36 | 37 | train_dataloader: 38 | dataset: 39 | transforms: 40 | policy: 41 | epoch: 81 42 | collate_fn: 43 | stop_epoch: 81 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_dinov3_l_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/deimv2_dinov3_l_coco 11 | 12 | DEIM: 13 | backbone: DINOv3STAs 14 | 15 | DINOv3STAs: 16 | name: dinov3_vits16 17 | weights_path: ./ckpts/dinov3_vits16_pretrain_lvd1689m-08c60483.pth 18 | interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32] 19 | finetune: True 20 | conv_inplane: 32 21 | hidden_dim: 224 22 | 23 | HybridEncoder: 24 | in_channels: [224, 224, 224] 25 | hidden_dim: 224 26 | dim_feedforward: 896 27 | 28 | DEIMTransformer: 29 | feat_channels: [224, 224, 224] 30 | hidden_dim: 224 31 | num_layers: 4 32 | eval_idx: -1 33 | dim_feedforward: 1792 34 | 35 | ## DEIM LR-Scheduler 36 | epoches: 68 # 72 + 2n # Increase to search for the optimal ema 37 | 38 | lrsheduler: flatcosine 39 | lr_gamma: 0.5 40 | warmup_iter: 2000 41 | flat_epoch: 34 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 42 | no_aug_epoch: 8 43 | 44 | ## Optimizer 45 | optimizer: 46 | type: AdamW 47 | params: 48 | - 49 | # except norm/bn/bias in self.dinov3 50 | params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' 51 | lr: 0.0000125 52 | - 53 | # including norm/bn/bias in self.dinov3 54 | params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' 55 | lr: 0.0000125 56 | weight_decay: 0. 57 | - 58 | # including norm/bn/bias except for the self.dinov3 59 | params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 60 | weight_decay: 0. 
61 | 62 | lr: 0.0005 63 | betas: [0.9, 0.999] 64 | weight_decay: 0.000125 65 | 66 | 67 | ## Dense O2O: Mosaic + Mixup + CopyBlend 68 | train_dataloader: 69 | dataset: 70 | transforms: 71 | ops: 72 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 73 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 74 | - {type: RandomPhotometricDistort, p: 0.5} 75 | - {type: RandomZoomOut, fill: 0} 76 | - {type: RandomIoUCrop, p: 0.8} 77 | - {type: SanitizeBoundingBoxes, min_size: 1} 78 | - {type: RandomHorizontalFlip} 79 | - {type: Resize, size: [640, 640], } 80 | - {type: SanitizeBoundingBoxes, min_size: 1} 81 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 82 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 83 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 84 | policy: 85 | epoch: [4, 34, 60] # list 86 | 87 | collate_fn: 88 | mixup_epochs: [4, 34] 89 | stop_epoch: 60 90 | copyblend_epochs: [4, 60] 91 | base_size_repeat: 3 92 | 93 | val_dataloader: 94 | dataset: 95 | transforms: 96 | ops: 97 | - {type: Resize, size: [640, 640], } 98 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 99 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 100 | 101 | ## DEIM Loss 102 | DEIMCriterion: 103 | matcher: 104 | matcher_change_epoch: 50 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_dinov3_m_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_dinov3_m_coco 10 | 11 | DEIM: 12 | backbone: DINOv3STAs 13 | 14 | DINOv3STAs: 15 | name: vit_tinyplus 16 | embed_dim: 256 17 | weights_path: ./ckpts/vittplus_distill.pt 18 | interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32] 19 | num_heads: 4 20 | 21 | HybridEncoder: 22 | in_channels: [256, 256, 256] 23 | depth_mult: 1 24 | expansion: 0.67 25 | hidden_dim: 256 26 | dim_feedforward: 512 27 | 28 | 29 | DEIMTransformer: 30 | feat_channels: [256, 256, 256] 31 | hidden_dim: 256 32 | dim_feedforward: 512 33 | num_layers: 4 # 4 5 6 34 | eval_idx: -1 # -2 -3 -4 35 | 36 | optimizer: 37 | type: AdamW 38 | 39 | params: 40 | - 41 | # except norm/bn/bias in self.dinov3 42 | params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' 43 | lr: 0.000025 44 | - 45 | # including norm/bn/bias in self.dinov3 46 | params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' 47 | lr: 0.000025 48 | weight_decay: 0. 49 | - 50 | # including norm/bn/bias except for the self.dinov3 51 | params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 52 | weight_decay: 0. 
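# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): unlike the
# HGNetv2 recipes, every DINOv3-backbone config adds a `Normalize` op with
# ImageNet mean/std to both the train and val pipelines, presumably because
# the ViT weights were pretrained on normalized inputs. Any custom inference
# script for these models must apply the same normalization.
# ---------------------------------------------------------------------------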
53 | 54 | lr: 0.0005 55 | betas: [0.9, 0.999] 56 | weight_decay: 0.0001 57 | 58 | epoches: 102 # 120 + 4n 59 | 60 | ## Our LR-Scheduler 61 | flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 62 | no_aug_epoch: 12 63 | 64 | 65 | ## Our DataAug 66 | train_dataloader: 67 | dataset: 68 | transforms: 69 | ops: 70 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 71 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 72 | - {type: RandomPhotometricDistort, p: 0.5} 73 | - {type: RandomZoomOut, fill: 0} 74 | - {type: RandomIoUCrop, p: 0.8} 75 | - {type: SanitizeBoundingBoxes, min_size: 1} 76 | - {type: RandomHorizontalFlip} 77 | - {type: Resize, size: [640, 640], } 78 | - {type: SanitizeBoundingBoxes, min_size: 1} 79 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 80 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 81 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 82 | policy: 83 | epoch: [4, 49, 90] # list 84 | 85 | collate_fn: 86 | mixup_prob: 0.5 87 | ema_restart_decay: 0.9999 88 | base_size_repeat: 6 89 | mixup_epochs: [4, 49] 90 | stop_epoch: 90 91 | copyblend_epochs: [4, 90] 92 | 93 | 94 | val_dataloader: 95 | dataset: 96 | transforms: 97 | ops: 98 | - {type: Resize, size: [640, 640], } 99 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 100 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 101 | 102 | DEIMCriterion: 103 | matcher: 104 | # new matcher 105 | change_matcher: True 106 | iou_order_alpha: 4.0 107 | matcher_change_epoch: 80 108 | -------------------------------------------------------------------------------- /configs/deimv2/deimv2_dinov3_s_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_dinov3_s_coco 10 | 11 | DEIM: 12 | backbone: DINOv3STAs 13 | 14 | DINOv3STAs: 15 | name: vit_tiny 16 | embed_dim: 192 17 | weights_path: ./ckpts/vitt_distill.pt 18 | interaction_indexes: [3, 7, 11] # only need the [1/8, 1/16, 1/32] 19 | num_heads: 3 20 | 21 | HybridEncoder: 22 | in_channels: [192, 192, 192] 23 | depth_mult: 0.67 24 | expansion: 0.34 25 | hidden_dim: 192 26 | dim_feedforward: 512 27 | 28 | DEIMTransformer: 29 | feat_channels: [192, 192, 192] 30 | hidden_dim: 192 31 | dim_feedforward: 512 32 | num_layers: 4 # 4 5 6 33 | eval_idx: -1 # -2 -3 -4 34 | 35 | 36 | ## Optimizer 37 | optimizer: 38 | type: AdamW 39 | 40 | params: 41 | - 42 | # except norm/bn/bias in self.dinov3 43 | params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' 44 | lr: 0.000025 45 | - 46 | # including all norm/bn/bias in self.dinov3 47 | params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' 48 | lr: 0.000025 49 | weight_decay: 0. 50 | - 51 | # including all norm/bn/bias except for the self.dinov3 52 | params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 53 | weight_decay: 0. 
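# ---------------------------------------------------------------------------
# NOTE (editor's annotation, not part of the original config): a plain ViT
# is single-scale, so `interaction_indexes` picks three transformer blocks
# (here [3, 7, 11] of the 12-block vit_tiny) whose outputs are adapted into
# the 1/8, 1/16 and 1/32 feature maps the HybridEncoder expects, per the
# inline comment above; the adapter code is in
# engine/backbone/dinov3_adapter.py.
# ---------------------------------------------------------------------------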
54 | 55 | lr: 0.0005 56 | betas: [0.9, 0.999] 57 | weight_decay: 0.0001 58 | 59 | # Increase to search for the optimal ema 60 | epoches: 132 # 120 + 4n 61 | 62 | ## Our LR-Scheduler 63 | flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 64 | no_aug_epoch: 12 65 | 66 | ## Our DataAug 67 | train_dataloader: 68 | dataset: 69 | transforms: 70 | ops: 71 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 72 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 73 | - {type: RandomPhotometricDistort, p: 0.5} 74 | - {type: RandomZoomOut, fill: 0} 75 | - {type: RandomIoUCrop, p: 0.8} 76 | - {type: SanitizeBoundingBoxes, min_size: 1} 77 | - {type: RandomHorizontalFlip} 78 | - {type: Resize, size: [640, 640], } 79 | - {type: SanitizeBoundingBoxes, min_size: 1} 80 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 81 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 82 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 83 | policy: 84 | epoch: [4, 64, 120] # list 85 | 86 | collate_fn: 87 | base_size: 640 88 | mixup_prob: 0.5 89 | ema_restart_decay: 0.9999 90 | base_size_repeat: 20 91 | mixup_epochs: [4, 64] 92 | stop_epoch: 120 93 | copyblend_epochs: [4, 120] 94 | 95 | val_dataloader: 96 | dataset: 97 | transforms: 98 | ops: 99 | - {type: Resize, size: [640, 640], } 100 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 101 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 102 | 103 | DEIMCriterion: 104 | matcher: 105 | # change matcher 106 | change_matcher: True 107 | iou_order_alpha: 4.0 108 | matcher_change_epoch: 100 109 | -------------------------------------------------------------------------------- /configs/deimv2/deimv2_dinov3_x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | 10 | output_dir: ./outputs/deimv2_dinov3_x_coco 11 | 12 | DEIM: 13 | backbone: DINOv3STAs 14 | 15 | DINOv3STAs: 16 | name: dinov3_vits16plus 17 | weights_path: ./ckpts/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth 18 | interaction_indexes: [5,8,11] # only need the [1/8, 1/16, 1/32] 19 | finetune: True 20 | conv_inplane: 64 21 | hidden_dim: 256 22 | 23 | HybridEncoder: 24 | in_channels: [256, 256, 256] 25 | # intra 26 | hidden_dim: 256 27 | dim_feedforward: 1024 28 | 29 | # cross 30 | expansion: 1.25 31 | depth_mult: 1.37 32 | 33 | DEIMTransformer: 34 | num_layers: 6 35 | eval_idx: -1 36 | feat_channels: [256, 256, 256] 37 | # reg_scale: 8 38 | hidden_dim: 256 39 | dim_feedforward: 2048 40 | 41 | optimizer: 42 | type: AdamW 43 | params: 44 | - 45 | # except norm/bn/bias in self.dinov3 46 | params: '^(?=.*.dinov3)(?!.*(?:norm|bn|bias)).*$' 47 | lr: 0.00001 48 | - 49 | # including norm/bn/bias in self.dinov3 50 | params: '^(?=.*.dinov3)(?=.*(?:norm|bn|bias)).*$' 51 | lr: 0.00001 52 | weight_decay: 0. 53 | - 54 | # including norm/bn/bias except for the self.dinov3 55 | params: '^(?=.*(?:sta|encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 56 | weight_decay: 0. 
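# In the Dense O2O block below, `policy.epoch: [4, 29, 50]` stages the strong
# augmentations over training: off during the first 4 warmup epochs, on
# through the middle of the run, and disabled from epoch 50 onward (mirroring
# `mixup_epochs: [4, 29]` and `stop_epoch: 50`). A hedged sketch of the window
# test only -- the real policy lives in the dataloader/collate code:
#
#   def in_aug_window(epoch, start=4, stop=50):
#       return start <= epoch < stop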
57 | 58 | lr: 0.0005 59 | betas: [0.9, 0.999] 60 | weight_decay: 0.000125 61 | 62 | ## Dense O2O: Mosaic + Mixup + CopyBlend 63 | train_dataloader: 64 | dataset: 65 | transforms: 66 | ops: 67 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 68 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 69 | - {type: RandomPhotometricDistort, p: 0.5} 70 | - {type: RandomZoomOut, fill: 0} 71 | - {type: RandomIoUCrop, p: 0.8} 72 | - {type: SanitizeBoundingBoxes, min_size: 1} 73 | - {type: RandomHorizontalFlip} 74 | - {type: Resize, size: [640, 640], } 75 | - {type: SanitizeBoundingBoxes, min_size: 1} 76 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 77 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} 78 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 79 | policy: 80 | epoch: [4, 29, 50] # list 81 | 82 | collate_fn: 83 | mixup_epochs: [4, 29] 84 | stop_epoch: 50 85 | copyblend_epochs: [4, 50] 86 | base_size_repeat: 3 87 | 88 | val_dataloader: 89 | dataset: 90 | transforms: 91 | ops: 92 | - {type: Resize, size: [640, 640], } 93 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 94 | - {type: Normalize, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]} -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_atto_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_atto_coco 10 | 11 | DEIM: 12 | encoder: LiteEncoder 13 | 14 | HGNetv2: 15 | name: 'Atto' 16 | return_idx: [2] 17 | freeze_at: -1 18 | freeze_norm: False 19 | use_lab: True 20 | 21 | LiteEncoder: 22 | in_channels: [256] 23 | feat_strides: [16] 24 | # intra 25 | hidden_dim: 64 26 | 27 | # cross 28 | expansion: 0.34 29 | depth_mult: 0.5 30 | act: 'silu' 31 | 32 | 33 | DEIMTransformer: 34 | feat_channels: [64, 64] 35 | feat_strides: [16, 32] 36 | hidden_dim: 64 37 | num_levels: 2 38 | num_points: [4, 2] 39 | 40 | num_layers: 3 41 | eval_idx: -1 42 | num_queries: 100 43 | 44 | # FFN 45 | dim_feedforward: 160 46 | 47 | # New options for DEIMv2 48 | share_bbox_head: True 49 | use_gateway: False 50 | 51 | # Increase to search for the optimal ema 52 | epoches: 500 # 468 + 32 53 | 54 | ## Our LR-Scheduler 55 | warmup_iter: 4000 56 | flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 57 | no_aug_epoch: 32 58 | lr_gamma: 0.5 59 | 60 | optimizer: 61 | type: AdamW 62 | params: 63 | - params: '^(?=.*backbone)(?!.*norm|bn).*$' 64 | lr: 0.001 65 | - params: '^(?=.*backbone)(?=.*norm|bn).*$' 66 | lr: 0.001 67 | weight_decay: 0. 68 | - params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' # except bias 69 | weight_decay: 0. 
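# Sizing note for the pipeline below: Mosaic's `output_size: 160` is half of
# the final `Resize: [320, 320]`, so a 2x2 grid of 160-px mosaic tiles fills
# the 320-px training canvas exactly (4 * 160^2 = 320^2). The same 2x ratio
# holds in the other configs (320 -> 640 for the 640-px models), which is why
# this reading -- `output_size` as the per-tile size -- seems likely; it is an
# inference from the configs, not a documented contract.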
70 | 71 | lr: 0.002 72 | betas: [0.9, 0.999] 73 | weight_decay: 0.0001 74 | 75 | eval_spatial_size: [320, 320] 76 | train_dataloader: 77 | total_batch_size: 128 78 | dataset: 79 | transforms: 80 | ops: 81 | - {type: Mosaic, output_size: 160, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 82 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 83 | - {type: RandomPhotometricDistort, p: 0.5} 84 | - {type: RandomZoomOut, fill: 0} 85 | - {type: RandomIoUCrop, p: 0.8} 86 | - {type: SanitizeBoundingBoxes, min_size: 12} 87 | - {type: RandomHorizontalFlip} 88 | - {type: Resize, size: [320, 320], } 89 | - {type: SanitizeBoundingBoxes, min_size: 12} 90 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 91 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 92 | policy: 93 | epoch: [4, 250, 400] # list 94 | mosaic_prob: 0.3 95 | 96 | collate_fn: 97 | mixup_prob: 0.0 98 | mixup_epochs: [40000, 15000] 99 | copyblend_prob: 0.0 100 | copyblend_epochs: [40000, 15000] 101 | 102 | stop_epoch: 468 # 468 + 32 103 | ema_restart_decay: 0.9999 104 | base_size: 320 105 | base_size_repeat: ~ 106 | 107 | val_dataloader: 108 | total_batch_size: 256 109 | dataset: 110 | transforms: 111 | ops: 112 | - {type: Resize, size: [320, 320], } 113 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 114 | shuffle: False 115 | num_workers: 16 116 | 117 | 118 | DEIMCriterion: 119 | losses: ['mal', 'boxes'] # , 'local' 120 | use_uni_set: False 121 | 122 | matcher: 123 | matcher_change_epoch: 450 # FIX This -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_femto_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_femto_coco 10 | 11 | DEIM: 12 | encoder: LiteEncoder 13 | 14 | HGNetv2: 15 | name: 'Femto' 16 | return_idx: [2] 17 | freeze_at: -1 18 | freeze_norm: False 19 | use_lab: True 20 | 21 | LiteEncoder: 22 | in_channels: [512] 23 | feat_strides: [16] 24 | 25 | # intra 26 | hidden_dim: 96 27 | 28 | # cross 29 | expansion: 0.34 30 | depth_mult: 0.5 31 | act: 'silu' 32 | 33 | 34 | DEIMTransformer: 35 | feat_channels: [96, 96] 36 | feat_strides: [16, 32] 37 | hidden_dim: 96 38 | num_levels: 2 39 | num_points: [4, 2] 40 | 41 | num_layers: 3 42 | eval_idx: -1 43 | num_queries: 150 44 | 45 | # FFN 46 | dim_feedforward: 256 47 | 48 | # New options for DEIMv2 49 | share_bbox_head: True 50 | use_gateway: False 51 | 52 | # Increase to search for the optimal ema 53 | epoches: 500 # 468 + 32 54 | 55 | ## Our LR-Scheduler 56 | warmup_iter: 4000 57 | flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 58 | no_aug_epoch: 32 59 | lr_gamma: 0.5 60 | 61 | optimizer: 62 | type: AdamW 63 | params: 64 | - 65 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 66 | lr: 0.0008 67 | - 68 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 69 | lr: 0.0008 70 | weight_decay: 0. 71 | - # not opt 72 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 73 | weight_decay: 0. 
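# In the `collate_fn` further below, MixUp and CopyBlend are switched off two
# ways at once: their probabilities are 0.0 and their epoch windows
# ([40000, 15000]) sit far beyond the 500-epoch run, so neither op can ever
# fire. The smallest variants lean on Mosaic alone (mosaic_prob: 0.5).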
74 | 75 | lr: 0.0016 76 | betas: [0.9, 0.999] 77 | weight_decay: 0.0001 78 | 79 | eval_spatial_size: [416, 416] 80 | train_dataloader: 81 | total_batch_size: 128 82 | dataset: 83 | transforms: 84 | ops: 85 | - {type: Mosaic, output_size: 208, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 86 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 87 | - {type: RandomPhotometricDistort, p: 0.5} 88 | - {type: RandomZoomOut, fill: 0} 89 | - {type: RandomIoUCrop, p: 0.8} 90 | - {type: SanitizeBoundingBoxes, min_size: 10} 91 | - {type: RandomHorizontalFlip} 92 | - {type: Resize, size: [416, 416], } 93 | - {type: SanitizeBoundingBoxes, min_size: 10} 94 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 95 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 96 | policy: 97 | epoch: [4, 250, 400] # list 98 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 99 | mosaic_prob: 0.5 100 | 101 | collate_fn: 102 | mixup_prob: 0.0 103 | mixup_epochs: [40000, 15000] 104 | copyblend_prob: 0.0 105 | copyblend_epochs: [40000, 15000] 106 | 107 | stop_epoch: 468 # 468 + 32 108 | ema_restart_decay: 0.9999 109 | base_size: 416 110 | base_size_repeat: ~ 111 | 112 | val_dataloader: 113 | total_batch_size: 256 114 | dataset: 115 | transforms: 116 | ops: 117 | - {type: Resize, size: [416, 416], } 118 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 119 | shuffle: False 120 | num_workers: 16 121 | 122 | 123 | DEIMCriterion: 124 | losses: ['mal', 'boxes'] # , 'local' 125 | use_uni_set: False 126 | 127 | matcher: 128 | matcher_change_epoch: 450 # FIX This -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_l_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_l_coco 10 | 11 | 12 | optimizer: 13 | type: AdamW 14 | params: 15 | - 16 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 17 | lr: 0.000025 18 | - 19 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 20 | weight_decay: 0. 21 | 22 | lr: 0.0005 23 | betas: [0.9, 0.999] 24 | weight_decay: 0.000125 25 | -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_m_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_m_coco 10 | 11 | HGNetv2: 12 | name: 'B2' 13 | return_idx: [1, 2, 3] 14 | freeze_at: -1 15 | freeze_norm: False 16 | use_lab: True 17 | 18 | HybridEncoder: 19 | in_channels: [384, 768, 1536] 20 | hidden_dim: 256 21 | depth_mult: 0.67 22 | 23 | DEIMTransformer: 24 | num_layers: 4 # 5 6 25 | eval_idx: -1 # -2 -3 26 | 27 | optimizer: 28 | type: AdamW 29 | params: 30 | - 31 | params: '^(?=.*backbone)(?!.*bn).*$' 32 | lr: 0.00004 33 | - 34 | params: '^(?=.*(?:norm|bn)).*$' 35 | weight_decay: 0. 
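# CopyBlend, configured via `collate_fn` below (copyblend_prob,
# area_threshold, num_objects, with_expand, expand_ratios), pastes objects
# between images in a batch. A rough sketch under stated assumptions -- the
# real implementation lives in the collate code, and every name here is
# hypothetical:
#
#   import random
#   def copy_blend(dst_img, src_img, src_boxes, num_objects=3,
#                  area_threshold=100, expand_ratios=(0.1, 0.25)):
#       k = min(num_objects, len(src_boxes))
#       for x1, y1, x2, y2 in random.sample(list(src_boxes), k):
#           if (x2 - x1) * (y2 - y1) < area_threshold:
#               continue  # drop tiny source objects
#           r = random.uniform(*expand_ratios)  # with_expand: pad the crop
#           dx, dy = r * (x2 - x1), r * (y2 - y1)
#           patch = src_img.crop((x1 - dx, y1 - dy, x2 + dx, y2 + dy))
#           dst_img.paste(patch, (int(x1 - dx), int(y1 - dy)))
#       return dst_img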
36 | 37 | lr: 0.0004 38 | betas: [0.9, 0.999] 39 | weight_decay: 0.0001 40 | 41 | # Increase to search for the optimal ema 42 | epoches: 102 # 120 + 4n 43 | 44 | ## Our LR-Scheduler 45 | flat_epoch: 49 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 46 | no_aug_epoch: 12 47 | 48 | ## Our DataAug 49 | train_dataloader: 50 | dataset: 51 | transforms: 52 | policy: 53 | epoch: [4, 49, 90] # list 54 | 55 | collate_fn: 56 | ema_restart_decay: 0.9999 57 | base_size_repeat: 6 58 | mixup_epochs: [4, 49] 59 | stop_epoch: 90 60 | copyblend_prob: 0.5 61 | copyblend_epochs: [4, 90] 62 | area_threshold: 100 63 | num_objects: 3 64 | with_expand: True 65 | expand_ratios: [0.1, 0.25] 66 | 67 | DEIMCriterion: 68 | matcher: 69 | # new matcher 70 | change_matcher: True 71 | iou_order_alpha: 4.0 72 | matcher_change_epoch: 80 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_n_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_n_coco 10 | 11 | HGNetv2: 12 | name: 'B0' 13 | return_idx: [2, 3] 14 | freeze_at: -1 15 | freeze_norm: False 16 | use_lab: True 17 | 18 | HybridEncoder: 19 | in_channels: [512, 1024] 20 | feat_strides: [16, 32] 21 | 22 | # intra 23 | hidden_dim: 128 24 | use_encoder_idx: [1] 25 | dim_feedforward: 512 26 | 27 | # cross 28 | expansion: 0.34 29 | depth_mult: 0.5 30 | 31 | version: 'dfine' 32 | 33 | DEIMTransformer: 34 | feat_channels: [128, 128] 35 | feat_strides: [16, 32] 36 | hidden_dim: 128 37 | num_levels: 2 38 | num_points: [6, 6] 39 | 40 | num_layers: 3 41 | eval_idx: -1 42 | 43 | # FFN 44 | dim_feedforward: 512 45 | 46 | optimizer: 47 | type: AdamW 48 | params: 49 | - 50 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 51 | lr: 0.0004 52 | - 53 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 54 | lr: 0.0004 55 | weight_decay: 0. 56 | - 57 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 58 | weight_decay: 0. 
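# Architecture note (a reading of the values above, not a measured claim):
# this N variant runs a two-level pyramid -- feat_strides [16, 32],
# num_levels 2 -- dropping the stride-8 map that the larger models keep, and
# samples num_points [6, 6] deformable points per level. That trades some
# small-object recall for a substantial cut in encoder/decoder compute.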
59 | 60 | lr: 0.0008 61 | betas: [0.9, 0.999] 62 | weight_decay: 0.0001 63 | 64 | # Increase to search for the optimal ema 65 | epoches: 160 # 148 + 12 66 | 67 | ## Our LR-Scheduler 68 | flat_epoch: 7800 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 69 | no_aug_epoch: 12 70 | lr_gamma: 1.0 71 | 72 | ## Our DataAug 73 | train_dataloader: 74 | dataset: 75 | transforms: 76 | policy: 77 | epoch: [4, 78, 148] # list 78 | 79 | collate_fn: 80 | ema_restart_decay: 0.9999 81 | base_size_repeat: ~ 82 | mixup_epochs: [4, 78] 83 | stop_epoch: 148 84 | copyblend_prob: 0.4 85 | copyblend_epochs: [4, 78] # CP half 86 | area_threshold: 100 87 | num_objects: 3 88 | with_expand: True 89 | expand_ratios: [0.1, 0.25] 90 | 91 | DEIMCriterion: 92 | matcher: 93 | # new matcher 94 | change_matcher: True 95 | iou_order_alpha: 4.0 96 | matcher_change_epoch: 136 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_pico_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml', 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_pico_coco 10 | 11 | DEIM: 12 | encoder: LiteEncoder 13 | decoder: DEIMTransformer 14 | 15 | HGNetv2: 16 | name: 'Pico' 17 | return_idx: [2] 18 | freeze_at: -1 19 | freeze_norm: False 20 | use_lab: True 21 | 22 | LiteEncoder: 23 | in_channels: [512] 24 | feat_strides: [16] 25 | 26 | # intra 27 | hidden_dim: 112 28 | 29 | # cross 30 | expansion: 0.34 31 | depth_mult: 0.5 32 | act: 'silu' 33 | 34 | 35 | DEIMTransformer: 36 | feat_channels: [112, 112] 37 | feat_strides: [16, 32] 38 | hidden_dim: 112 39 | num_levels: 2 40 | num_points: [4, 2] 41 | 42 | num_layers: 3 43 | eval_idx: -1 44 | num_queries: 200 45 | 46 | # FFN 47 | dim_feedforward: 320 48 | 49 | # New options for DEIMv2 50 | share_bbox_head: True 51 | use_gateway: False 52 | 53 | # Increase to search for the optimal ema 54 | epoches: 500 # 468 + 32 55 | 56 | ## Our LR-Scheduler 57 | warmup_iter: 4000 58 | flat_epoch: 250 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 59 | no_aug_epoch: 32 60 | lr_gamma: 0.5 61 | 62 | optimizer: 63 | type: AdamW 64 | params: 65 | - 66 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 67 | lr: 0.0008 68 | - 69 | params: '^(?=.*backbone)(?=.*norm|bn).*$' 70 | lr: 0.0008 71 | weight_decay: 0. 72 | - # not opt 73 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' 74 | weight_decay: 0. 
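# Across the tiny variants, `SanitizeBoundingBoxes.min_size` shrinks as the
# training canvas grows: 12 px at 320 (atto), 10 px at 416 (femto), 8 px at
# 640 (pico, below). Relative to image size that is 3.8% -> 2.4% -> 1.3%, so
# the higher-resolution models keep proportionally smaller boxes after
# augmentation (an inference from the three configs, not a documented rule).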
75 | 76 | lr: 0.0016 77 | betas: [0.9, 0.999] 78 | weight_decay: 0.0001 79 | 80 | eval_spatial_size: [640, 640] 81 | train_dataloader: 82 | total_batch_size: 128 83 | dataset: 84 | transforms: 85 | ops: 86 | - {type: Mosaic, output_size: 320, rotation_range: 10, translation_range: [0.1, 0.1], scaling_range: [0.5, 1.5], 87 | probability: 1.0, fill_value: 0, use_cache: True, max_cached_images: 50, random_pop: True} 88 | - {type: RandomPhotometricDistort, p: 0.5} 89 | - {type: RandomZoomOut, fill: 0} 90 | - {type: RandomIoUCrop, p: 0.8} 91 | - {type: SanitizeBoundingBoxes, min_size: 8} 92 | - {type: RandomHorizontalFlip} 93 | - {type: Resize, size: [640, 640], } 94 | - {type: SanitizeBoundingBoxes, min_size: 8} 95 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 96 | - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} 97 | policy: 98 | epoch: [4, 250, 400] # list 99 | ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] 100 | mosaic_prob: 0.5 101 | 102 | collate_fn: 103 | mixup_prob: 0.0 104 | mixup_epochs: [40000, 15000] 105 | copyblend_prob: 0.0 106 | copyblend_epochs: [40000, 15000] 107 | stop_epoch: 468 # 468 + 32 108 | ema_restart_decay: 0.9999 109 | base_size: 640 110 | base_size_repeat: ~ 111 | 112 | val_dataloader: 113 | total_batch_size: 256 114 | dataset: 115 | transforms: 116 | ops: 117 | - {type: Resize, size: [640, 640], } 118 | - {type: ConvertPILImage, dtype: 'float32', scale: True} 119 | shuffle: False 120 | num_workers: 16 121 | 122 | 123 | DEIMCriterion: 124 | losses: ['mal', 'boxes'] # , 'local' 125 | use_uni_set: False 126 | 127 | matcher: 128 | matcher_change_epoch: 450 # FIX This -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_s_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_s_coco 10 | 11 | HGNetv2: 12 | name: 'B0' 13 | return_idx: [1, 2, 3] 14 | freeze_at: -1 15 | freeze_norm: False 16 | use_lab: True 17 | 18 | HybridEncoder: 19 | in_channels: [256, 512, 1024] 20 | hidden_dim: 256 21 | depth_mult: 0.34 22 | expansion: 0.5 23 | 24 | version: 'dfine' 25 | 26 | DEIMTransformer: 27 | num_layers: 3 # 4 5 6 28 | eval_idx: -1 # -2 -3 -4 29 | 30 | optimizer: 31 | type: AdamW 32 | params: 33 | - 34 | params: '^(?=.*backbone)(?!.*bn).*$' 35 | lr: 0.0002 36 | - 37 | params: '^(?=.*(?:norm|bn)).*$' # except bias 38 | weight_decay: 0. 
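# The DEIMCriterion block below swaps the matcher late in training
# (`change_matcher: True`, `matcher_change_epoch: 100`): standard Hungarian
# matching first, then an IoU-ordered variant weighted by
# `iou_order_alpha: 4.0`. A hedged sketch of the epoch gate only; the cost
# terms themselves live in engine/deim/matcher.py:
#
#   def pick_matcher(epoch, change_matcher=True, change_epoch=100):
#       return 'iou_ordered' if change_matcher and epoch >= change_epoch else 'hungarian'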
39 | 40 | lr: 0.0004 41 | betas: [0.9, 0.999] 42 | weight_decay: 0.0001 43 | 44 | # Increase to search for the optimal ema 45 | epoches: 132 # 120 + 4n 46 | 47 | ## Our LR-Scheduler 48 | flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 49 | no_aug_epoch: 12 50 | 51 | ## Our DataAug 52 | train_dataloader: 53 | dataset: 54 | transforms: 55 | policy: 56 | epoch: [4, 64, 120] # list 57 | 58 | collate_fn: 59 | ema_restart_decay: 0.9999 60 | base_size_repeat: 20 61 | mixup_epochs: [4, 64] 62 | stop_epoch: 120 63 | copyblend_prob: 0.5 64 | # copyblend_epochs: [4, 64] # from v11 to v12: copy-paste continues only half epochs 65 | copyblend_epochs: [4, 120] 66 | area_threshold: 100 67 | num_objects: 3 68 | with_expand: True 69 | expand_ratios: [0.1, 0.25] 70 | 71 | DEIMCriterion: 72 | matcher: 73 | # new matcher 74 | change_matcher: True 75 | iou_order_alpha: 4.0 76 | matcher_change_epoch: 100 -------------------------------------------------------------------------------- /configs/deimv2/deimv2_hgnetv2_x_coco.yml: -------------------------------------------------------------------------------- 1 | __include__: [ 2 | '../dataset/coco_detection.yml', 3 | '../runtime.yml', 4 | '../base/dataloader.yml', 5 | '../base/optimizer.yml', 6 | '../base/deimv2.yml' 7 | ] 8 | 9 | output_dir: ./outputs/deimv2_hgnetv2_x_coco 10 | 11 | 12 | HGNetv2: 13 | name: 'B5' 14 | return_idx: [1, 2, 3] 15 | freeze_stem_only: True 16 | freeze_at: 0 17 | freeze_norm: True 18 | 19 | HybridEncoder: 20 | # intra 21 | hidden_dim: 384 22 | dim_feedforward: 2048 23 | 24 | DEIMTransformer: 25 | feat_channels: [384, 384, 384] # [256, 256, 256] 26 | reg_scale: 8 # 4 27 | 28 | # FFN 29 | dim_feedforward: 2048 30 | 31 | optimizer: 32 | type: AdamW 33 | params: 34 | - 35 | params: '^(?=.*backbone)(?!.*norm|bn).*$' 36 | lr: 0.000005 37 | - 38 | params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' 39 | weight_decay: 0. 40 | 41 | lr: 0.0005 42 | betas: [0.9, 0.999] 43 | weight_decay: 0.000125 44 | 45 | # Increase to search for the optimal ema 46 | epoches: 58 # 72 + 2n 47 | 48 | ## Our LR-Scheduler 49 | flat_epoch: 29 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 50 | no_aug_epoch: 8 51 | 52 | train_dataloader: 53 | dataset: 54 | transforms: 55 | policy: 56 | epoch: [4, 29, 50] # list 57 | 58 | collate_fn: 59 | ema_restart_decay: 0.9998 60 | base_size_repeat: 3 61 | -------------------------------------------------------------------------------- /configs/runtime.yml: -------------------------------------------------------------------------------- 1 | print_freq: 500 2 | output_dir: './logs' 3 | checkpoint_freq: 12 4 | 5 | 6 | sync_bn: True 7 | find_unused_parameters: True 8 | 9 | 10 | use_amp: False 11 | scaler: 12 | type: GradScaler 13 | enabled: True 14 | 15 | 16 | use_ema: False 17 | ema: 18 | type: ModelEMA 19 | decay: 0.9999 20 | warmups: 1000 21 | -------------------------------------------------------------------------------- /engine/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 3 | """ 4 | 5 | # for register purpose 6 | from . import optim 7 | from . import data 8 | from . 
import deim 9 | 10 | from .backbone import * 11 | 12 | from .backbone import ( 13 | get_activation, 14 | FrozenBatchNorm2d, 15 | freeze_batch_norm2d, 16 | ) -------------------------------------------------------------------------------- /engine/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | from .common import ( 7 | get_activation, 8 | FrozenBatchNorm2d, 9 | freeze_batch_norm2d, 10 | ) 11 | from .presnet import PResNet 12 | from .test_resnet import MResNet 13 | 14 | from .timm_model import TimmModel 15 | from .torchvision_model import TorchVisionModel 16 | 17 | from .csp_resnet import CSPResNet 18 | from .csp_darknet import CSPDarkNet, CSPPAN 19 | 20 | from .hgnetv2 import HGNetv2 21 | 22 | from .dinov3_adapter import * 23 | -------------------------------------------------------------------------------- /engine/backbone/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class ConvNormLayer(nn.Module): 11 | def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None): 12 | super().__init__() 13 | self.conv = nn.Conv2d( 14 | ch_in, 15 | ch_out, 16 | kernel_size, 17 | stride, 18 | padding=(kernel_size-1)//2 if padding is None else padding, 19 | bias=bias) 20 | self.norm = nn.BatchNorm2d(ch_out) 21 | self.act = nn.Identity() if act is None else get_activation(act) 22 | 23 | def forward(self, x): 24 | return self.act(self.norm(self.conv(x))) 25 | 26 | 27 | class FrozenBatchNorm2d(nn.Module): 28 | """copy and modified from https://github.com/facebookresearch/detr/blob/master/models/backbone.py 29 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 30 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 31 | without which any other models than torchvision.models.resnet[18,34,50,101] 32 | produce nans. 
33 | """ 34 | def __init__(self, num_features, eps=1e-5): 35 | super(FrozenBatchNorm2d, self).__init__() 36 | n = num_features 37 | self.register_buffer("weight", torch.ones(n)) 38 | self.register_buffer("bias", torch.zeros(n)) 39 | self.register_buffer("running_mean", torch.zeros(n)) 40 | self.register_buffer("running_var", torch.ones(n)) 41 | self.eps = eps 42 | self.num_features = n 43 | 44 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 45 | missing_keys, unexpected_keys, error_msgs): 46 | num_batches_tracked_key = prefix + 'num_batches_tracked' 47 | if num_batches_tracked_key in state_dict: 48 | del state_dict[num_batches_tracked_key] 49 | 50 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 51 | state_dict, prefix, local_metadata, strict, 52 | missing_keys, unexpected_keys, error_msgs) 53 | 54 | def forward(self, x): 55 | # move reshapes to the beginning 56 | # to make it fuser-friendly 57 | w = self.weight.reshape(1, -1, 1, 1) 58 | b = self.bias.reshape(1, -1, 1, 1) 59 | rv = self.running_var.reshape(1, -1, 1, 1) 60 | rm = self.running_mean.reshape(1, -1, 1, 1) 61 | scale = w * (rv + self.eps).rsqrt() 62 | bias = b - rm * scale 63 | return x * scale + bias 64 | 65 | def extra_repr(self): 66 | return ( 67 | "{num_features}, eps={eps}".format(**self.__dict__) 68 | ) 69 | 70 | def freeze_batch_norm2d(module: nn.Module) -> nn.Module: 71 | if isinstance(module, nn.BatchNorm2d): 72 | module = FrozenBatchNorm2d(module.num_features) 73 | else: 74 | for name, child in module.named_children(): 75 | _child = freeze_batch_norm2d(child) 76 | if _child is not child: 77 | setattr(module, name, _child) 78 | return module 79 | 80 | 81 | def get_activation(act: str, inplace: bool=True): 82 | """get activation 83 | """ 84 | if act is None: 85 | return nn.Identity() 86 | 87 | elif isinstance(act, nn.Module): 88 | return act 89 | 90 | act = act.lower() 91 | 92 | if act == 'silu' or act == 'swish': 93 | m = nn.SiLU() 94 | 95 | elif act == 'relu': 96 | m = nn.ReLU() 97 | 98 | elif act == 'leaky_relu': 99 | m = nn.LeakyReLU() 100 | 101 | elif act == 'silu': 102 | m = nn.SiLU() 103 | 104 | elif act == 'gelu': 105 | m = nn.GELU() 106 | 107 | elif act == 'hardsigmoid': 108 | m = nn.Hardsigmoid() 109 | 110 | else: 111 | raise RuntimeError('') 112 | 113 | if hasattr(m, 'inplace'): 114 | m.inplace = inplace 115 | 116 | return m 117 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/__init__.py: -------------------------------------------------------------------------------- 1 | from .vision_transformer import DinoVisionTransformer -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 
5 | 6 | from .attention import CausalSelfAttention, LinearKMaskedBias, SelfAttention 7 | from .block import CausalSelfAttentionBlock, SelfAttentionBlock 8 | from .ffn_layers import Mlp, SwiGLUFFN 9 | from .fp8_linear import convert_linears_to_fp8 10 | from .layer_scale import LayerScale 11 | from .patch_embed import PatchEmbed 12 | from .rms_norm import RMSNorm 13 | from .rope_position_encoding import RopePositionEmbedding 14 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn.init import trunc_normal_ 9 | 10 | 11 | class DINOHead(nn.Module): 12 | def __init__( 13 | self, 14 | in_dim, 15 | out_dim, 16 | use_bn=False, 17 | nlayers=3, 18 | hidden_dim=2048, 19 | bottleneck_dim=256, 20 | mlp_bias=True, 21 | ): 22 | super().__init__() 23 | nlayers = max(nlayers, 1) 24 | self.mlp = _build_mlp( 25 | nlayers, 26 | in_dim, 27 | bottleneck_dim, 28 | hidden_dim=hidden_dim, 29 | use_bn=use_bn, 30 | bias=mlp_bias, 31 | ) 32 | self.last_layer = nn.Linear(bottleneck_dim, out_dim, bias=False) 33 | 34 | def init_weights(self) -> None: 35 | self.apply(self._init_weights) 36 | 37 | def _init_weights(self, m): 38 | if isinstance(m, nn.Linear): 39 | trunc_normal_(m.weight, std=0.02) 40 | if m.bias is not None: 41 | nn.init.constant_(m.bias, 0) 42 | 43 | def forward(self, x, no_last_layer=False, only_last_layer=False): 44 | if not only_last_layer: 45 | x = self.mlp(x) 46 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 47 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 48 | if not no_last_layer: 49 | x = self.last_layer(x) 50 | return x 51 | 52 | 53 | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): 54 | if nlayers == 1: 55 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 56 | else: 57 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 58 | if use_bn: 59 | layers.append(nn.BatchNorm1d(hidden_dim)) 60 | layers.append(nn.GELU()) 61 | for _ in range(nlayers - 2): 62 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 63 | if use_bn: 64 | layers.append(nn.BatchNorm1d(hidden_dim)) 65 | layers.append(nn.GELU()) 66 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 67 | return nn.Sequential(*layers) 68 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/ffn_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 
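# Annotation (not part of the original file): the SwiGLUFFN below computes
# w3(silu(w1(x)) * w2(x)). Since the gate w2 doubles the parameters per hidden
# unit, the hidden width is scaled to ~2/3 of the requested size and rounded
# up to a multiple of `align_to`:
#
#   d = int(hidden_features * 2 / 3)
#   swiglu_hidden_features = d + (-d % align_to)
#   # e.g. hidden_features=1024, align_to=8: d = 682, 682 + 6 = 688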
5 | 6 | from typing import Callable, List, Optional 7 | 8 | import torch.nn.functional as F 9 | from torch import Tensor, nn 10 | 11 | from ..utils import cat_keep_shapes, uncat_with_shapes 12 | 13 | 14 | class ListForwardMixin(object): 15 | def forward(self, x: Tensor): 16 | raise NotImplementedError 17 | 18 | def forward_list(self, x_list: List[Tensor]) -> List[Tensor]: 19 | x_flat, shapes, num_tokens = cat_keep_shapes(x_list) 20 | x_flat = self.forward(x_flat) 21 | return uncat_with_shapes(x_flat, shapes, num_tokens) 22 | 23 | 24 | class Mlp(nn.Module, ListForwardMixin): 25 | def __init__( 26 | self, 27 | in_features: int, 28 | hidden_features: Optional[int] = None, 29 | out_features: Optional[int] = None, 30 | act_layer: Callable[..., nn.Module] = nn.GELU, 31 | drop: float = 0.0, 32 | bias: bool = True, 33 | device=None, 34 | ) -> None: 35 | super().__init__() 36 | out_features = out_features or in_features 37 | hidden_features = hidden_features or in_features 38 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, device=device) 39 | self.act = act_layer() 40 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, device=device) 41 | self.drop = nn.Dropout(drop) 42 | 43 | def forward(self, x: Tensor) -> Tensor: 44 | x = self.fc1(x) 45 | x = self.act(x) 46 | x = self.drop(x) 47 | x = self.fc2(x) 48 | x = self.drop(x) 49 | return x 50 | 51 | 52 | class SwiGLUFFN(nn.Module, ListForwardMixin): 53 | def __init__( 54 | self, 55 | in_features: int, 56 | hidden_features: Optional[int] = None, 57 | out_features: Optional[int] = None, 58 | act_layer: Optional[Callable[..., nn.Module]] = None, 59 | drop: float = 0.0, 60 | bias: bool = True, 61 | align_to: int = 8, 62 | device=None, 63 | ) -> None: 64 | super().__init__() 65 | out_features = out_features or in_features 66 | hidden_features = hidden_features or in_features 67 | d = int(hidden_features * 2 / 3) 68 | swiglu_hidden_features = d + (-d % align_to) 69 | self.w1 = nn.Linear(in_features, swiglu_hidden_features, bias=bias, device=device) 70 | self.w2 = nn.Linear(in_features, swiglu_hidden_features, bias=bias, device=device) 71 | self.w3 = nn.Linear(swiglu_hidden_features, out_features, bias=bias, device=device) 72 | 73 | def forward(self, x: Tensor) -> Tensor: 74 | x1 = self.w1(x) 75 | x2 = self.w2(x) 76 | hidden = F.silu(x1) * x2 77 | return self.w3(hidden) 78 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/fp8_linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import re 7 | 8 | import torch 9 | 10 | from ..layers.attention import LinearKMaskedBias 11 | from ..utils import named_replace 12 | 13 | # avoid division by zero when calculating scale 14 | EPS = 1e-12 15 | 16 | 17 | def scale(t, amax_t): 18 | max_v = torch.finfo(torch.float8_e4m3fn).max 19 | scale_t = torch.clamp(amax_t.float(), min=EPS) / max_v 20 | t_fp8 = (t / scale_t).to(torch.float8_e4m3fn) 21 | return t_fp8, scale_t 22 | 23 | 24 | def matmul(first, amax_first, second_t, amax_second_t, bias): 25 | first_fp8, scale_first = scale(first, amax_first) 26 | second_t_fp8, scale_second_t = scale(second_t, amax_second_t) 27 | # PyTorch's row-wise scaled matmul kernel is based on CUTLASS and is quite 28 | # slow. 
Hence we fall back to an "unscaled" matmul, which uses cuBLAS, and 29 | # apply the scale manually afterwards. 30 | output = torch._scaled_mm( 31 | first_fp8, 32 | second_t_fp8.t(), 33 | scale_a=scale_first.new_ones((1, 1)), 34 | scale_b=scale_second_t.t().new_ones((1, 1)), 35 | bias=None, 36 | out_dtype=torch.bfloat16, 37 | use_fast_accum=False, 38 | ) 39 | output = (output * scale_first * scale_second_t.t()).to(torch.bfloat16) 40 | if bias is not None: 41 | output = output + bias 42 | return output 43 | 44 | 45 | @torch.compiler.allow_in_graph 46 | class Fp8LinearFn(torch.autograd.Function): 47 | @staticmethod 48 | def forward(ctx, a, b_t, bias): 49 | amax_a = a.abs().amax(dim=-1, keepdim=True) 50 | amax_b_t = b_t.abs().amax(dim=-1, keepdim=True) 51 | out = matmul(a, amax_a, b_t, amax_b_t, bias) 52 | 53 | ctx.a_requires_grad = a.requires_grad 54 | ctx.b_requires_grad = b_t.requires_grad 55 | ctx.bias_requires_grad = bias.requires_grad if bias is not None else False 56 | 57 | ctx.save_for_backward(a, b_t, amax_b_t.max()) 58 | 59 | return out 60 | 61 | @staticmethod 62 | def backward(ctx, grad_out): 63 | a, b_t, amax_b = ctx.saved_tensors 64 | 65 | if ctx.a_requires_grad: 66 | b = b_t.t().contiguous() 67 | amax_grad_out = grad_out.abs().amax(dim=-1, keepdim=True) 68 | amax_b = amax_b.repeat(b.shape[0], 1) 69 | grad_a = matmul(grad_out, amax_grad_out, b, amax_b, None) 70 | else: 71 | grad_a = None 72 | if ctx.b_requires_grad: 73 | grad_b = grad_out.t() @ a 74 | else: 75 | grad_b = None 76 | if ctx.bias_requires_grad: 77 | grad_bias = grad_out.sum(dim=0) 78 | else: 79 | grad_bias = None 80 | 81 | return grad_a, grad_b, grad_bias 82 | 83 | 84 | class Fp8Linear(torch.nn.Linear): 85 | def forward(self, input: torch.Tensor) -> torch.Tensor: 86 | out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, self.bias) 87 | out = out.unflatten(0, input.shape[:-1]) 88 | return out 89 | 90 | 91 | class Fp8LinearKMaskedBias(LinearKMaskedBias): 92 | def forward(self, input: torch.Tensor) -> torch.Tensor: 93 | masked_bias = self.bias * self.bias_mask if self.bias is not None else None 94 | out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, masked_bias) 95 | out = out.unflatten(0, input.shape[:-1]) 96 | return out 97 | 98 | 99 | def convert_linears_to_fp8(root_module: torch.nn.Module, *, filter: str) -> torch.nn.Module: 100 | filter_re = re.compile(filter) 101 | total_count = 0 102 | 103 | def replace(module: torch.nn.Module, name: str) -> torch.nn.Module: 104 | nonlocal total_count 105 | if not isinstance(module, torch.nn.Linear) or not filter_re.search(name): 106 | return module 107 | if type(module) == torch.nn.Linear: 108 | new_cls = Fp8Linear 109 | elif type(module) == LinearKMaskedBias: 110 | new_cls = Fp8LinearKMaskedBias 111 | else: 112 | assert False, str(type(module)) 113 | if module.in_features % 64 != 0 or module.out_features % 64 != 0: 114 | # This is not a strict requirement, but H100 TensorCores for fp8 115 | # operate on tiles of 64 elements anyways, and Inductor sometimes 116 | # pads inner dims to become multiples of 64. Also, if one day we 117 | # switch back to cuBLAS, it artificially requires dims to be 118 | # multiples of 16. 
119 | raise RuntimeError( 120 | "fp8 requires all dimensions to be multiples of 64 " "(consider using ffn_layer=swiglu64 or higher)" 121 | ) 122 | new_module = new_cls( 123 | in_features=module.in_features, 124 | out_features=module.out_features, 125 | bias=module.bias is not None, 126 | dtype=module.weight.dtype, 127 | device=module.weight.device, 128 | ) 129 | new_module.weight = module.weight 130 | new_module.bias = module.bias 131 | total_count += 1 132 | return new_module 133 | 134 | out = named_replace(replace, root_module) 135 | assert total_count > 0, "fp8: no layer found to convert" 136 | # Force re-compile everything 137 | torch._dynamo.reset_code_caches() 138 | from torch._inductor.cudagraph_trees import reset_cudagraph_trees 139 | 140 | reset_cudagraph_trees() 141 | return out 142 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | from typing import Union 7 | 8 | import torch 9 | from torch import Tensor, nn 10 | 11 | 12 | class LayerScale(nn.Module): 13 | def __init__( 14 | self, 15 | dim: int, 16 | init_values: Union[float, Tensor] = 1e-5, 17 | inplace: bool = False, 18 | device=None, 19 | ) -> None: 20 | super().__init__() 21 | self.inplace = inplace 22 | self.gamma = nn.Parameter(torch.empty(dim, device=device)) 23 | self.init_values = init_values 24 | 25 | def reset_parameters(self): 26 | nn.init.constant_(self.gamma, self.init_values) 27 | 28 | def forward(self, x: Tensor) -> Tensor: 29 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 30 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import math 7 | from typing import Callable, Tuple, Union 8 | 9 | from torch import Tensor, nn 10 | 11 | 12 | def make_2tuple(x): 13 | if isinstance(x, tuple): 14 | assert len(x) == 2 15 | return x 16 | 17 | assert isinstance(x, int) 18 | return (x, x) 19 | 20 | 21 | class PatchEmbed(nn.Module): 22 | """ 23 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 24 | 25 | Args: 26 | img_size: Image size. 27 | patch_size: Patch token size. 28 | in_chans: Number of input image channels. 29 | embed_dim: Number of linear projection output channels. 30 | norm_layer: Normalization layer. 
31 | """ 32 | 33 | def __init__( 34 | self, 35 | img_size: Union[int, Tuple[int, int]] = 224, 36 | patch_size: Union[int, Tuple[int, int]] = 16, 37 | in_chans: int = 3, 38 | embed_dim: int = 768, 39 | norm_layer: Callable | None = None, 40 | flatten_embedding: bool = True, 41 | ) -> None: 42 | super().__init__() 43 | 44 | image_HW = make_2tuple(img_size) 45 | patch_HW = make_2tuple(patch_size) 46 | patch_grid_size = ( 47 | image_HW[0] // patch_HW[0], 48 | image_HW[1] // patch_HW[1], 49 | ) 50 | 51 | self.img_size = image_HW 52 | self.patch_size = patch_HW 53 | self.patches_resolution = patch_grid_size 54 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 55 | 56 | self.in_chans = in_chans 57 | self.embed_dim = embed_dim 58 | 59 | self.flatten_embedding = flatten_embedding 60 | 61 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 62 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 63 | 64 | def forward(self, x: Tensor) -> Tensor: 65 | _, _, H, W = x.shape 66 | # patch_H, patch_W = self.patch_size 67 | # assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 68 | # assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 69 | 70 | x = self.proj(x) # B C H W 71 | H, W = x.size(2), x.size(3) 72 | x = x.flatten(2).transpose(1, 2) # B HW C 73 | x = self.norm(x) 74 | if not self.flatten_embedding: 75 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 76 | return x 77 | 78 | def flops(self) -> float: 79 | Ho, Wo = self.patches_resolution 80 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 81 | if self.norm is not None: 82 | flops += Ho * Wo * self.embed_dim 83 | return flops 84 | 85 | def reset_parameters(self): 86 | k = 1 / (self.in_chans * (self.patch_size[0] ** 2)) 87 | nn.init.uniform_(self.proj.weight, -math.sqrt(k), math.sqrt(k)) 88 | if self.proj.bias is not None: 89 | nn.init.uniform_(self.proj.bias, -math.sqrt(k), math.sqrt(k)) 90 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import torch 7 | from torch import Tensor, nn 8 | 9 | 10 | class RMSNorm(nn.Module): 11 | def __init__(self, dim: int, eps: float = 1e-5): 12 | super().__init__() 13 | self.weight = nn.Parameter(torch.ones(dim)) 14 | self.eps = eps 15 | 16 | def reset_parameters(self) -> None: 17 | nn.init.constant_(self.weight, 1) 18 | 19 | def _norm(self, x: Tensor) -> Tensor: 20 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 21 | 22 | def forward(self, x: Tensor) -> Tensor: 23 | output = self._norm(x.float()).type_as(x) 24 | return output * self.weight 25 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/layers/sparse_linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 
5 | 6 | import logging 7 | from typing import Callable 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import xformers.ops as xops 13 | 14 | from ..utils import named_apply, named_replace 15 | 16 | logger = logging.getLogger("dinov3") 17 | 18 | 19 | class LinearW24(torch.nn.Linear): 20 | ALGO = "largest_abs_values_greedy" 21 | 22 | def __init__(self, *args, **kwargs) -> None: 23 | super().__init__(*args, **kwargs) 24 | self.sparsity_enabled = False 25 | 26 | def forward(self, input: torch.Tensor) -> torch.Tensor: 27 | if not self.sparsity_enabled: 28 | return super().forward(input) 29 | 30 | input_shape = input.shape 31 | input = input.flatten(end_dim=-2) 32 | dim0 = input.shape[0] 33 | if dim0 % 8 != 0: 34 | # NOTE: This should be torch-compiled away 35 | input = F.pad(input, [0, 0, 0, -dim0 % 8]) 36 | w_sparse = xops.sparsify24( 37 | self.weight, 38 | algo=self.ALGO, 39 | gradient="ste", 40 | backend="cusparselt", 41 | ) 42 | return F.linear(input, w_sparse, self.bias,)[ 43 | :dim0 44 | ].unflatten(dim=0, sizes=input_shape[:-1]) 45 | 46 | 47 | def replace_linears_with_sparse_linear(root_module: nn.Module, *, filter_fn: Callable[[str], bool]) -> nn.Module: 48 | total_count = 0 49 | 50 | def replace(module: nn.Module, name: str) -> nn.Module: 51 | nonlocal total_count 52 | if not isinstance(module, nn.Linear) or not filter_fn(name): 53 | return module 54 | assert type(module) == nn.Linear, "Subtypes not supported" 55 | new_module = LinearW24( 56 | in_features=module.in_features, 57 | out_features=module.out_features, 58 | bias=module.bias is not None, 59 | dtype=module.weight.dtype, 60 | device=module.weight.device, 61 | ) 62 | new_module.weight = module.weight 63 | new_module.bias = module.bias 64 | total_count += 1 65 | return new_module 66 | 67 | out = named_replace(replace, root_module) 68 | assert total_count > 0, "2:4 sparsity: no layer found to sparsify" 69 | return out 70 | 71 | 72 | def update_24sparsity(root_module: nn.Module, enabled: bool) -> int: 73 | num_modified = 0 74 | 75 | def maybe_apply_sparsity(module: nn.Module, name: str) -> nn.Module: 76 | nonlocal num_modified 77 | if not isinstance(module, LinearW24): 78 | return module 79 | num_modified += 1 80 | module.sparsity_enabled = enabled 81 | logger.info(f"- {'' if module.sparsity_enabled else 'de'}sparsifying {name}") 82 | return module 83 | 84 | named_apply(maybe_apply_sparsity, root_module) 85 | # Force re-compile everything 86 | torch._dynamo.reset_code_caches() 87 | from torch._inductor.cudagraph_trees import reset_cudagraph_trees 88 | 89 | reset_cudagraph_trees() 90 | return num_modified 91 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | from .dtype import as_torch_dtype 7 | from .utils import ( 8 | cat_keep_shapes, 9 | count_parameters, 10 | fix_random_seeds, 11 | get_conda_env, 12 | get_sha, 13 | named_apply, 14 | named_replace, 15 | uncat_with_shapes, 16 | ) 17 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 5 | 6 | import os 7 | from enum import Enum 8 | from pathlib import Path 9 | from typing import Any, Dict, Optional 10 | 11 | 12 | class ClusterType(Enum): 13 | CW = "cw" 14 | 15 | 16 | def _guess_cluster_type() -> ClusterType: 17 | return ClusterType.CW 18 | 19 | 20 | def get_cluster_type( 21 | cluster_type: Optional[ClusterType] = None, 22 | ) -> Optional[ClusterType]: 23 | if cluster_type is None: 24 | return _guess_cluster_type() 25 | 26 | return cluster_type 27 | 28 | 29 | def get_slurm_account(cluster_type: Optional[ClusterType] = None) -> Optional[str]: 30 | cluster_type = get_cluster_type(cluster_type) 31 | if cluster_type is None: 32 | return None 33 | return { 34 | ClusterType.CW: "fair_amaia_cw_explore", 35 | }[cluster_type] 36 | 37 | 38 | def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: 39 | cluster_type = get_cluster_type(cluster_type) 40 | if cluster_type is None: 41 | return None 42 | 43 | CHECKPOINT_DIRNAMES = { 44 | ClusterType.CW: "", 45 | } 46 | return Path("/") / CHECKPOINT_DIRNAMES[cluster_type] 47 | 48 | 49 | def get_user_checkpoint_path( 50 | cluster_type: Optional[ClusterType] = None, 51 | ) -> Optional[Path]: 52 | checkpoint_path = get_checkpoint_path(cluster_type) 53 | if checkpoint_path is None: 54 | return None 55 | 56 | username = os.environ.get("USER") 57 | assert username is not None 58 | return checkpoint_path / username 59 | 60 | 61 | def get_slurm_qos(cluster_type: Optional[ClusterType] = None) -> Optional[str]: 62 | cluster_type = get_cluster_type(cluster_type) 63 | if cluster_type is None: 64 | return None 65 | 66 | return { 67 | ClusterType.CW: "explore", 68 | }.get(cluster_type) 69 | 70 | 71 | def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]: 72 | cluster_type = get_cluster_type(cluster_type) 73 | if cluster_type is None: 74 | return None 75 | 76 | SLURM_PARTITIONS = { 77 | ClusterType.CW: "learn", 78 | } 79 | return SLURM_PARTITIONS[cluster_type] 80 | 81 | 82 | def get_slurm_executor_parameters( 83 | nodes: int, 84 | num_gpus_per_node: int, 85 | cluster_type: Optional[ClusterType] = None, 86 | **kwargs, 87 | ) -> Dict[str, Any]: 88 | # create default parameters 89 | params = { 90 | "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html 91 | "gpus_per_node": num_gpus_per_node, 92 | "tasks_per_node": num_gpus_per_node, # one task per GPU 93 | "cpus_per_task": 10, 94 | "nodes": nodes, 95 | "slurm_partition": get_slurm_partition(cluster_type), 96 | } 97 | # apply cluster-specific adjustments 98 | cluster_type = get_cluster_type(cluster_type) 99 | if cluster_type == ClusterType.CW: 100 | params["cpus_per_task"] = 16 101 | # set additional parameters / apply overrides 102 | params.update(kwargs) 103 | return params 104 | -------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/custom_callable.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This software may be used and distributed in accordance with 4 | # the terms of the DINOv3 License Agreement. 
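# Usage sketch (annotation; the path and symbol below are hypothetical):
#
#   fn = load_custom_callable("plugins/my_backbone.py", "build_model")
#   model = fn(num_classes=80)
#
# The helper resolves the file, temporarily prepends its directory to
# sys.path, imports the module by filename (reloading if a module of the same
# name was already imported from elsewhere), and returns the named attribute
# -- see the implementation just below.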
5 | 
6 | import contextlib
7 | import importlib
8 | import inspect
9 | import os
10 | import sys
11 | from pathlib import Path
12 | 
13 | 
14 | @contextlib.contextmanager
15 | def _load_modules_from_dir(dir_: str):
16 |     sys.path.insert(0, dir_)
17 |     yield
18 |     sys.path.pop(0)
19 | 
20 | 
21 | def load_custom_callable(module_path: str | Path, callable_name: str):
22 |     module_full_path = os.path.realpath(module_path)
23 |     assert os.path.exists(module_full_path), f"module {module_full_path} does not exist"
24 |     module_dir, module_filename = os.path.split(module_full_path)
25 |     module_name, _ = os.path.splitext(module_filename)
26 | 
27 |     with _load_modules_from_dir(module_dir):
28 |         module = importlib.import_module(module_name)
29 |         if inspect.getfile(module) != module_full_path:
30 |             importlib.reload(module)
31 |         callable_ = getattr(module, callable_name)
32 | 
33 |     return callable_
34 | 
35 | 
36 | @contextlib.contextmanager
37 | def change_working_dir_and_pythonpath(new_dir):
38 |     old_dir = Path.cwd()
39 |     new_dir = Path(new_dir).expanduser().resolve().as_posix()
40 |     old_pythonpath = sys.path.copy()
41 |     sys.path.insert(0, new_dir)
42 |     os.chdir(new_dir)
43 |     try:
44 |         yield
45 |     finally:
46 |         os.chdir(old_dir)
47 |         sys.path = old_pythonpath
48 | 
-------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/dtype.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This software may be used and distributed in accordance with
4 | # the terms of the DINOv3 License Agreement.
5 | 
6 | from typing import Dict, Union
7 | 
8 | import numpy as np
9 | import torch
10 | 
11 | TypeSpec = Union[str, np.dtype, torch.dtype]
12 | 
13 | 
14 | _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
15 |     np.dtype("bool"): torch.bool,
16 |     np.dtype("uint8"): torch.uint8,
17 |     np.dtype("int8"): torch.int8,
18 |     np.dtype("int16"): torch.int16,
19 |     np.dtype("int32"): torch.int32,
20 |     np.dtype("int64"): torch.int64,
21 |     np.dtype("float16"): torch.float16,
22 |     np.dtype("float32"): torch.float32,
23 |     np.dtype("float64"): torch.float64,
24 |     np.dtype("complex64"): torch.complex64,
25 |     np.dtype("complex128"): torch.complex128,
26 | }
27 | 
28 | 
29 | def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
30 |     if isinstance(dtype, torch.dtype):
31 |         return dtype
32 |     if isinstance(dtype, str):
33 |         dtype = np.dtype(dtype)
34 |     assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}"
35 |     return _NUMPY_TO_TORCH_DTYPE[dtype]
36 | 
-------------------------------------------------------------------------------- /engine/backbone/dinov3/utils/utils.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This software may be used and distributed in accordance with
4 | # the terms of the DINOv3 License Agreement.
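# Annotation: cat_keep_shapes / uncat_with_shapes below form a lossless round
# trip -- a list of token tensors is flattened into one (N_total, C) tensor
# plus bookkeeping, then split and reshaped back. For example:
#
#   import torch
#   a, b = torch.rand(2, 5, 64), torch.rand(1, 9, 64)
#   flat, shapes, counts = cat_keep_shapes([a, b])    # flat: (19, 64)
#   a2, b2 = uncat_with_shapes(flat, shapes, counts)  # (2, 5, 64), (1, 9, 64)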
5 | 6 | import logging 7 | import os 8 | import random 9 | import subprocess 10 | from typing import Callable, List, Optional, Tuple 11 | 12 | import numpy as np 13 | import torch 14 | from torch import Tensor, nn 15 | 16 | logger = logging.getLogger("dinov3") 17 | 18 | 19 | def cat_keep_shapes(x_list: List[Tensor]) -> Tuple[Tensor, List[Tuple[int]], List[int]]: 20 | shapes = [x.shape for x in x_list] 21 | num_tokens = [x.select(dim=-1, index=0).numel() for x in x_list] 22 | flattened = torch.cat([x.flatten(0, -2) for x in x_list]) 23 | return flattened, shapes, num_tokens 24 | 25 | 26 | def uncat_with_shapes(flattened: Tensor, shapes: List[Tuple[int]], num_tokens: List[int]) -> List[Tensor]: 27 | outputs_splitted = torch.split_with_sizes(flattened, num_tokens, dim=0) 28 | shapes_adjusted = [shape[:-1] + torch.Size([flattened.shape[-1]]) for shape in shapes] 29 | outputs_reshaped = [o.reshape(shape) for o, shape in zip(outputs_splitted, shapes_adjusted)] 30 | return outputs_reshaped 31 | 32 | 33 | def named_replace( 34 | fn: Callable, 35 | module: nn.Module, 36 | name: str = "", 37 | depth_first: bool = True, 38 | include_root: bool = False, 39 | ) -> nn.Module: 40 | if not depth_first and include_root: 41 | module = fn(module=module, name=name) 42 | for child_name_o, child_module in list(module.named_children()): 43 | child_name = ".".join((name, child_name_o)) if name else child_name_o 44 | new_child = named_replace( 45 | fn=fn, 46 | module=child_module, 47 | name=child_name, 48 | depth_first=depth_first, 49 | include_root=True, 50 | ) 51 | setattr(module, child_name_o, new_child) 52 | 53 | if depth_first and include_root: 54 | module = fn(module=module, name=name) 55 | return module 56 | 57 | 58 | def named_apply( 59 | fn: Callable, 60 | module: nn.Module, 61 | name: str = "", 62 | depth_first: bool = True, 63 | include_root: bool = False, 64 | ) -> nn.Module: 65 | if not depth_first and include_root: 66 | fn(module=module, name=name) 67 | for child_name, child_module in module.named_children(): 68 | child_name = ".".join((name, child_name)) if name else child_name 69 | named_apply( 70 | fn=fn, 71 | module=child_module, 72 | name=child_name, 73 | depth_first=depth_first, 74 | include_root=True, 75 | ) 76 | if depth_first and include_root: 77 | fn(module=module, name=name) 78 | return module 79 | 80 | 81 | def fix_random_seeds(seed: int = 31): 82 | """ 83 | Fix random seeds. 
84 | """ 85 | torch.manual_seed(seed) 86 | torch.cuda.manual_seed_all(seed) 87 | np.random.seed(seed) 88 | random.seed(seed) 89 | 90 | 91 | def get_sha() -> str: 92 | cwd = os.path.dirname(os.path.abspath(__file__)) 93 | 94 | def _run(command): 95 | return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() 96 | 97 | sha = "N/A" 98 | diff = "clean" 99 | branch = "N/A" 100 | try: 101 | sha = _run(["git", "rev-parse", "HEAD"]) 102 | subprocess.check_output(["git", "diff"], cwd=cwd) 103 | diff = _run(["git", "diff-index", "HEAD"]) 104 | diff = "has uncommited changes" if diff else "clean" 105 | branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) 106 | except Exception: 107 | pass 108 | message = f"sha: {sha}, status: {diff}, branch: {branch}" 109 | return message 110 | 111 | 112 | def get_conda_env() -> Tuple[Optional[str], Optional[str]]: 113 | conda_env_name = os.environ.get("CONDA_DEFAULT_ENV") 114 | conda_env_path = os.environ.get("CONDA_PREFIX") 115 | return conda_env_name, conda_env_path 116 | 117 | 118 | def count_parameters(module: nn.Module) -> int: 119 | c = 0 120 | for m in module.parameters(): 121 | c += m.nelement() 122 | return c 123 | 124 | 125 | def has_batchnorms(model: nn.Module) -> bool: 126 | bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm) 127 | for _, module in model.named_modules(): 128 | if isinstance(module, bn_types): 129 | return True 130 | return False 131 | -------------------------------------------------------------------------------- /engine/backbone/test_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from collections import OrderedDict 6 | 7 | 8 | from ..core import register 9 | 10 | 11 | class BasicBlock(nn.Module): 12 | expansion = 1 13 | 14 | def __init__(self, in_planes, planes, stride=1): 15 | super(BasicBlock, self).__init__() 16 | 17 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 18 | self.bn1 = nn.BatchNorm2d(planes) 19 | 20 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,stride=1, padding=1, bias=False) 21 | self.bn2 = nn.BatchNorm2d(planes) 22 | 23 | self.shortcut = nn.Sequential() 24 | if stride != 1 or in_planes != self.expansion*planes: 25 | self.shortcut = nn.Sequential( 26 | nn.Conv2d(in_planes, self.expansion*planes,kernel_size=1, stride=stride, bias=False), 27 | nn.BatchNorm2d(self.expansion*planes) 28 | ) 29 | def forward(self, x): 30 | out = F.relu(self.bn1(self.conv1(x))) 31 | out = self.bn2(self.conv2(out)) 32 | out += self.shortcut(x) 33 | out = F.relu(out) 34 | return out 35 | 36 | 37 | 38 | class _ResNet(nn.Module): 39 | def __init__(self, block, num_blocks, num_classes=10): 40 | super().__init__() 41 | self.in_planes = 64 42 | 43 | self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 44 | self.bn1 = nn.BatchNorm2d(64) 45 | 46 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 47 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 48 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 49 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 50 | 51 | self.linear = nn.Linear(512 * block.expansion, num_classes) 52 | 53 | def _make_layer(self, block, planes, num_blocks, stride): 54 | strides = [stride] + [1]*(num_blocks-1) 55 | layers = [] 56 | for stride in strides: 57 | layers.append(block(self.in_planes, 
planes, stride)) 58 | self.in_planes = planes * block.expansion 59 | return nn.Sequential(*layers) 60 | 61 | def forward(self, x): 62 | out = F.relu(self.bn1(self.conv1(x))) 63 | out = self.layer1(out) 64 | out = self.layer2(out) 65 | out = self.layer3(out) 66 | out = self.layer4(out) 67 | out = F.avg_pool2d(out, 4) 68 | out = out.view(out.size(0), -1) 69 | out = self.linear(out) 70 | return out 71 | 72 | 73 | @register() 74 | class MResNet(nn.Module): 75 | def __init__(self, num_classes=10, num_blocks=[2, 2, 2, 2]) -> None: 76 | super().__init__() 77 | self.model = _ResNet(BasicBlock, num_blocks, num_classes) 78 | 79 | def forward(self, x): 80 | return self.model(x) 81 | -------------------------------------------------------------------------------- /engine/backbone/timm_model.py: -------------------------------------------------------------------------------- 1 | """Copyright(c) 2023 lyuwenyu. All Rights Reserved. 2 | 3 | https://towardsdatascience.com/getting-started-with-pytorch-image-models-timm-a-practitioners-guide-4e77b4bf9055#0583 4 | """ 5 | import torch 6 | from torchvision.models.feature_extraction import get_graph_node_names, create_feature_extractor 7 | 8 | from .utils import IntermediateLayerGetter 9 | from ..core import register 10 | 11 | 12 | @register() 13 | class TimmModel(torch.nn.Module): 14 | def __init__(self, \ 15 | name, 16 | return_layers, 17 | pretrained=False, 18 | exportable=True, 19 | features_only=True, 20 | **kwargs) -> None: 21 | 22 | super().__init__() 23 | 24 | import timm 25 | model = timm.create_model( 26 | name, 27 | pretrained=pretrained, 28 | exportable=exportable, 29 | features_only=features_only, 30 | **kwargs 31 | ) 32 | # nodes, _ = get_graph_node_names(model) 33 | # print(nodes) 34 | # features = {'': ''} 35 | # model = create_feature_extractor(model, return_nodes=features) 36 | 37 | assert set(return_layers).issubset(model.feature_info.module_name()), \ 38 | f'return_layers should be a subset of {model.feature_info.module_name()}' 39 | 40 | # self.model = model 41 | self.model = IntermediateLayerGetter(model, return_layers) 42 | 43 | return_idx = [model.feature_info.module_name().index(name) for name in return_layers] 44 | self.strides = [model.feature_info.reduction()[i] for i in return_idx] 45 | self.channels = [model.feature_info.channels()[i] for i in return_idx] 46 | self.return_idx = return_idx 47 | self.return_layers = return_layers 48 | 49 | def forward(self, x: torch.Tensor): 50 | outputs = self.model(x) 51 | # outputs = [outputs[i] for i in self.return_idx] 52 | return outputs 53 | 54 | 55 | if __name__ == '__main__': 56 | 57 | model = TimmModel(name='resnet34', return_layers=['layer2', 'layer3']) 58 | data = torch.rand(1, 3, 640, 640) 59 | outputs = model(data) 60 | 61 | for output in outputs: 62 | print(output.shape) 63 | 64 | """ 65 | model: 66 | type: TimmModel 67 | name: resnet34 68 | return_layers: ['layer2', 'layer4'] 69 | """ 70 | -------------------------------------------------------------------------------- /engine/backbone/torchvision_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | import torch 7 | import torchvision 8 | 9 | from ..core import register 10 | from .utils import IntermediateLayerGetter 11 | 12 | __all__ = ['TorchVisionModel'] 13 | 14 | @register() 15 | class TorchVisionModel(torch.nn.Module): 16 | def __init__(self, name, return_layers, weights=None, **kwargs) -> None: 17 | super().__init__() 18 | 19 | if weights is not None: 20 | weights = getattr(torchvision.models.get_model_weights(name), weights) 21 | 22 | model = torchvision.models.get_model(name, weights=weights, **kwargs) 23 | 24 | if hasattr(model, 'features'): 25 | model = IntermediateLayerGetter(model.features, return_layers) 26 | else: 27 | model = IntermediateLayerGetter(model, return_layers) 28 | 29 | self.model = model 30 | 31 | def forward(self, x): 32 | return self.model(x) 33 | 34 | 35 | # TorchVisionModel('swin_t', return_layers=['5', '7']) 36 | # TorchVisionModel('resnet34', return_layers=['layer2','layer3', 'layer4']) 37 | 38 | # TorchVisionModel: 39 | # name: swin_t 40 | # return_layers: ['5', '7'] 41 | # weights: DEFAULT 42 | 43 | 44 | # model: 45 | # type: TorchVisionModel 46 | # name: resnet34 47 | # return_layers: ['layer2','layer3', 'layer4'] 48 | # weights: DEFAULT 49 | -------------------------------------------------------------------------------- /engine/backbone/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | https://github.com/pytorch/vision/blob/main/torchvision/models/_utils.py 3 | 4 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 5 | """ 6 | 7 | from collections import OrderedDict 8 | from typing import Dict, List 9 | 10 | 11 | import torch.nn as nn 12 | 13 | 14 | class IntermediateLayerGetter(nn.ModuleDict): 15 | """ 16 | Module wrapper that returns intermediate layers from a model 17 | 18 | It has a strong assumption that the modules have been registered 19 | into the model in the same order as they are used. 20 | This means that one should **not** reuse the same nn.Module 21 | twice in the forward if you want this to work. 22 | 23 | Additionally, it is only able to query submodules that are directly 24 | assigned to the model. So if `model` is passed, `model.feature1` can 25 | be returned, but not `model.feature1.layer2`. 26 | """ 27 | 28 | _version = 3 29 | 30 | def __init__(self, model: nn.Module, return_layers: List[str]) -> None: 31 | if not set(return_layers).issubset([name for name, _ in model.named_children()]): 32 | raise ValueError("return_layers are not present in model. {}"\ 33 | .format([name for name, _ in model.named_children()])) 34 | orig_return_layers = return_layers 35 | return_layers = {str(k): str(k) for k in return_layers} 36 | layers = OrderedDict() 37 | for name, module in model.named_children(): 38 | layers[name] = module 39 | if name in return_layers: 40 | del return_layers[name] 41 | if not return_layers: 42 | break 43 | 44 | super().__init__(layers) 45 | self.return_layers = orig_return_layers 46 | 47 | def forward(self, x): 48 | outputs = [] 49 | for name, module in self.items(): 50 | x = module(x) 51 | if name in self.return_layers: 52 | outputs.append(x) 53 | 54 | return outputs 55 | -------------------------------------------------------------------------------- /engine/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | from .workspace import GLOBAL_CONFIG, register, create 7 | from .yaml_utils import * 8 | from ._config import BaseConfig 9 | from .yaml_config import YAMLConfig 10 | -------------------------------------------------------------------------------- /engine/core/yaml_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import os 7 | import copy 8 | import yaml 9 | from typing import Any, Dict, Optional, List 10 | 11 | from .workspace import GLOBAL_CONFIG 12 | 13 | __all__ = [ 14 | 'load_config', 15 | 'merge_config', 16 | 'merge_dict', 17 | 'parse_cli', 18 | ] 19 | 20 | 21 | INCLUDE_KEY = '__include__' 22 | 23 | 24 | def load_config(file_path, cfg=dict()): 25 | """load config 26 | """ 27 | _, ext = os.path.splitext(file_path) 28 | assert ext in ['.yml', '.yaml'], "only support yaml files" 29 | 30 | with open(file_path) as f: 31 | file_cfg = yaml.load(f, Loader=yaml.Loader) 32 | if file_cfg is None: 33 | return {} 34 | 35 | if INCLUDE_KEY in file_cfg: 36 | base_yamls = list(file_cfg[INCLUDE_KEY]) 37 | for base_yaml in base_yamls: 38 | if base_yaml.startswith('~'): 39 | base_yaml = os.path.expanduser(base_yaml) 40 | 41 | if not base_yaml.startswith('/'): 42 | base_yaml = os.path.join(os.path.dirname(file_path), base_yaml) 43 | 44 | with open(base_yaml) as f: 45 | base_cfg = load_config(base_yaml, cfg) 46 | merge_dict(cfg, base_cfg) 47 | 48 | return merge_dict(cfg, file_cfg) 49 | 50 | 51 | def merge_dict(dct, another_dct, inplace=True) -> Dict: 52 | """merge another_dct into dct 53 | """ 54 | def _merge(dct, another) -> Dict: 55 | for k in another: 56 | if (k in dct and isinstance(dct[k], dict) and isinstance(another[k], dict)): 57 | _merge(dct[k], another[k]) 58 | else: 59 | dct[k] = another[k] 60 | 61 | return dct 62 | 63 | if not inplace: 64 | dct = copy.deepcopy(dct) 65 | 66 | return _merge(dct, another_dct) 67 | 68 | 69 | def dictify(s: str, v: Any) -> Dict: 70 | if '.' 
75 | 
76 | def parse_cli(nargs: List[str]) -> Dict:
77 | """
78 | parse command-line arguments
79 | convert `a.c=3 b=10` to `{'a': {'c': 3}, 'b': 10}`
80 | """
81 | cfg = {}
82 | if nargs is None or len(nargs) == 0:
83 | return cfg
84 | 
85 | for s in nargs:
86 | s = s.strip()
87 | k, v = s.split('=', 1)
88 | d = dictify(k, yaml.load(v, Loader=yaml.Loader))
89 | cfg = merge_dict(cfg, d)
90 | 
91 | return cfg
92 | 
93 | 
94 | 
95 | def merge_config(cfg, another_cfg=GLOBAL_CONFIG, inplace: bool=False, overwrite: bool=False):
96 | """
97 | Merge another_cfg into cfg, return the merged config
98 | 
99 | Example:
100 | 
101 | cfg1 = load_config('./dfine_r18vd_6x_coco.yml')
102 | cfg1 = merge_config(cfg1, inplace=True)
103 | 
104 | cfg2 = load_config('./dfine_r50vd_6x_coco.yml')
105 | cfg2 = merge_config(cfg2, inplace=True)
106 | 
107 | model1 = create(cfg1['model'], cfg1)
108 | model2 = create(cfg2['model'], cfg2)
109 | """
110 | def _merge(dct, another):
111 | for k in another:
112 | if k not in dct:
113 | dct[k] = another[k]
114 | 
115 | elif isinstance(dct[k], dict) and isinstance(another[k], dict):
116 | _merge(dct[k], another[k])
117 | 
118 | elif overwrite:
119 | dct[k] = another[k]
120 | 
121 | return cfg
122 | 
123 | if not inplace:
124 | cfg = copy.deepcopy(cfg)
125 | 
126 | return _merge(cfg, another_cfg)
127 | 
--------------------------------------------------------------------------------
/engine/data/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4 | """
5 | 
6 | from .dataset import *
7 | from .transforms import *
8 | from .dataloader import *
9 | 
10 | from ._misc import convert_to_tv_tensor
11 | 
12 | 
13 | 
14 | 
15 | # def set_epoch(self, epoch) -> None:
16 | #     self.epoch = epoch
17 | # def _set_epoch_func(datasets):
18 | #     """Add `set_epoch` for datasets
19 | #     """
20 | #     from ..core import register
21 | #     for ds in datasets:
22 | #         register(ds)(set_epoch)
23 | # _set_epoch_func([CIFAR10, VOCDetection, CocoDetection])
--------------------------------------------------------------------------------
/engine/data/_misc.py:
--------------------------------------------------------------------------------
1 | """
2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4 | """ 5 | 6 | import importlib.metadata 7 | from torch import Tensor 8 | 9 | if '0.15.2' in importlib.metadata.version('torchvision'): 10 | import torchvision 11 | torchvision.disable_beta_transforms_warning() 12 | 13 | from torchvision.datapoints import BoundingBox as BoundingBoxes 14 | from torchvision.datapoints import BoundingBoxFormat, Mask, Image, Video 15 | from torchvision.transforms.v2 import SanitizeBoundingBox as SanitizeBoundingBoxes 16 | _boxes_keys = ['format', 'spatial_size'] 17 | 18 | elif '0.17' > importlib.metadata.version('torchvision') >= '0.16': 19 | import torchvision 20 | torchvision.disable_beta_transforms_warning() 21 | 22 | from torchvision.transforms.v2 import SanitizeBoundingBoxes 23 | from torchvision.tv_tensors import ( 24 | BoundingBoxes, BoundingBoxFormat, Mask, Image, Video) 25 | _boxes_keys = ['format', 'canvas_size'] 26 | 27 | elif importlib.metadata.version('torchvision') >= '0.17': 28 | import torchvision 29 | from torchvision.transforms.v2 import SanitizeBoundingBoxes 30 | from torchvision.tv_tensors import ( 31 | BoundingBoxes, BoundingBoxFormat, Mask, Image, Video) 32 | _boxes_keys = ['format', 'canvas_size'] 33 | 34 | else: 35 | raise RuntimeError('Please make sure torchvision version >= 0.15.2') 36 | 37 | 38 | 39 | def convert_to_tv_tensor(tensor: Tensor, key: str, box_format='xyxy', spatial_size=None) -> Tensor: 40 | """ 41 | Args: 42 | tensor (Tensor): input tensor 43 | key (str): transform to key 44 | 45 | Return: 46 | Dict[str, TV_Tensor] 47 | """ 48 | assert key in ('boxes', 'masks', ), "Only support 'boxes' and 'masks'" 49 | 50 | if key == 'boxes': 51 | box_format = getattr(BoundingBoxFormat, box_format.upper()) 52 | _kwargs = dict(zip(_boxes_keys, [box_format, spatial_size])) 53 | return BoundingBoxes(tensor, **_kwargs) 54 | 55 | if key == 'masks': 56 | return Mask(tensor) 57 | -------------------------------------------------------------------------------- /engine/data/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | # from ._dataset import DetDataset 7 | from .coco_dataset import CocoDetection 8 | from .coco_dataset import ( 9 | mscoco_category2name, 10 | mscoco_category2label, 11 | mscoco_label2category, 12 | ) 13 | from .coco_eval import CocoEvaluator 14 | from .coco_utils import get_coco_api_from_dataset 15 | from .voc_detection import VOCDetection 16 | from .voc_eval import VOCEvaluator 17 | -------------------------------------------------------------------------------- /engine/data/dataset/_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | import torch 7 | import torch.utils.data as data 8 | 9 | class DetDataset(data.Dataset): 10 | def __getitem__(self, index): 11 | img, target = self.load_item(index) 12 | if self.transforms is not None: 13 | img, target, _ = self.transforms(img, target, self) 14 | return img, target 15 | 16 | def load_item(self, index): 17 | raise NotImplementedError("Please implement this function to return item before `transforms`.") 18 | 19 | def set_epoch(self, epoch) -> None: 20 | self._epoch = epoch 21 | 22 | @property 23 | def epoch(self): 24 | return self._epoch if hasattr(self, '_epoch') else -1 25 | -------------------------------------------------------------------------------- /engine/data/dataset/voc_detection.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | from sympy import im 7 | import torch 8 | import torchvision 9 | import torchvision.transforms.functional as TVF 10 | 11 | import os 12 | from PIL import Image 13 | from typing import Optional, Callable 14 | 15 | try: 16 | from defusedxml.ElementTree import parse as ET_parse 17 | except ImportError: 18 | from xml.etree.ElementTree import parse as ET_parse 19 | 20 | from ._dataset import DetDataset 21 | from .._misc import convert_to_tv_tensor 22 | from ...core import register 23 | 24 | @register() 25 | class VOCDetection(torchvision.datasets.VOCDetection, DetDataset): 26 | __inject__ = ['transforms', ] 27 | 28 | def __init__(self, root: str, ann_file: str = "trainval.txt", label_file: str = "label_list.txt", transforms: Optional[Callable] = None): 29 | 30 | with open(os.path.join(root, ann_file), 'r') as f: 31 | lines = [x.strip() for x in f.readlines()] 32 | lines = [x.split(' ') for x in lines] 33 | 34 | self.images = [os.path.join(root, lin[0]) for lin in lines] 35 | self.targets = [os.path.join(root, lin[1]) for lin in lines] 36 | assert len(self.images) == len(self.targets) 37 | 38 | with open(os.path.join(root + label_file), 'r') as f: 39 | labels = f.readlines() 40 | labels = [lab.strip() for lab in labels] 41 | 42 | self.transforms = transforms 43 | self.labels_map = {lab: i for i, lab in enumerate(labels)} 44 | 45 | def __getitem__(self, index: int): 46 | image, target = self.load_item(index) 47 | if self.transforms is not None: 48 | image, target, _ = self.transforms(image, target, self) 49 | # target["orig_size"] = torch.tensor(TVF.get_image_size(image)) 50 | return image, target 51 | 52 | def load_item(self, index: int): 53 | image = Image.open(self.images[index]).convert("RGB") 54 | target = self.parse_voc_xml(ET_parse(self.annotations[index]).getroot()) 55 | 56 | output = {} 57 | output["image_id"] = torch.tensor([index]) 58 | for k in ['area', 'boxes', 'labels', 'iscrowd']: 59 | output[k] = [] 60 | 61 | for blob in target['annotation']['object']: 62 | box = [float(v) for v in blob['bndbox'].values()] 63 | output["boxes"].append(box) 64 | output["labels"].append(blob['name']) 65 | output["area"].append((box[2] - box[0]) * (box[3] - box[1])) 66 | output["iscrowd"].append(0) 67 | 68 | w, h = image.size 69 | boxes = torch.tensor(output["boxes"]) if len(output["boxes"]) > 0 else torch.zeros(0, 4) 70 | output['boxes'] = convert_to_tv_tensor(boxes, 'boxes', box_format='xyxy', spatial_size=[h, w]) 71 | output['labels'] = torch.tensor([self.labels_map[lab] for lab in output["labels"]]) 72 | output['area'] = torch.tensor(output['area']) 73 | 
output["iscrowd"] = torch.tensor(output["iscrowd"]) 74 | output["orig_size"] = torch.tensor([w, h]) 75 | 76 | return image, output 77 | -------------------------------------------------------------------------------- /engine/data/dataset/voc_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torchvision 8 | 9 | 10 | class VOCEvaluator(object): 11 | def __init__(self) -> None: 12 | pass 13 | -------------------------------------------------------------------------------- /engine/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | 7 | from ._transforms import ( 8 | EmptyTransform, 9 | RandomPhotometricDistort, 10 | RandomZoomOut, 11 | RandomIoUCrop, 12 | RandomHorizontalFlip, 13 | Resize, 14 | PadToSize, 15 | SanitizeBoundingBoxes, 16 | RandomCrop, 17 | Normalize, 18 | ConvertBoxes, 19 | ConvertPILImage, 20 | ) 21 | from .container import Compose 22 | from .mosaic import Mosaic -------------------------------------------------------------------------------- /engine/data/transforms/_transforms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | import torchvision 10 | import torchvision.transforms.v2 as T 11 | import torchvision.transforms.v2.functional as F 12 | 13 | import PIL 14 | import PIL.Image 15 | 16 | from typing import Any, Dict, List, Optional 17 | 18 | from .._misc import convert_to_tv_tensor, _boxes_keys 19 | from .._misc import Image, Video, Mask, BoundingBoxes 20 | from .._misc import SanitizeBoundingBoxes 21 | 22 | from ...core import register 23 | torchvision.disable_beta_transforms_warning() 24 | 25 | 26 | RandomPhotometricDistort = register()(T.RandomPhotometricDistort) 27 | RandomZoomOut = register()(T.RandomZoomOut) 28 | RandomHorizontalFlip = register()(T.RandomHorizontalFlip) 29 | Resize = register()(T.Resize) 30 | # ToImageTensor = register()(T.ToImageTensor) 31 | # ConvertDtype = register()(T.ConvertDtype) 32 | # PILToTensor = register()(T.PILToTensor) 33 | SanitizeBoundingBoxes = register(name='SanitizeBoundingBoxes')(SanitizeBoundingBoxes) 34 | RandomCrop = register()(T.RandomCrop) 35 | Normalize = register()(T.Normalize) 36 | 37 | 38 | @register() 39 | class EmptyTransform(T.Transform): 40 | def __init__(self, ) -> None: 41 | super().__init__() 42 | 43 | def forward(self, *inputs): 44 | inputs = inputs if len(inputs) > 1 else inputs[0] 45 | return inputs 46 | 47 | 48 | @register() 49 | class PadToSize(T.Pad): 50 | _transformed_types = ( 51 | PIL.Image.Image, 52 | Image, 53 | Video, 54 | Mask, 55 | BoundingBoxes, 56 | ) 57 | def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: 58 | sp = F.get_spatial_size(flat_inputs[0]) 59 | h, w = self.size[1] - sp[0], self.size[0] - sp[1] 60 | self.padding = [0, 0, w, h] 61 | return dict(padding=self.padding) 62 | 63 | def __init__(self, size, fill=0, padding_mode='constant') -> None: 64 | if isinstance(size, int): 65 | size = (size, size) 66 | self.size = size 67 | super().__init__(0, fill, padding_mode) 68 | 
69 | def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: 70 | fill = self._fill[type(inpt)] 71 | padding = params['padding'] 72 | return F.pad(inpt, padding=padding, fill=fill, padding_mode=self.padding_mode) # type: ignore[arg-type] 73 | 74 | def __call__(self, *inputs: Any) -> Any: 75 | outputs = super().forward(*inputs) 76 | if len(outputs) > 1 and isinstance(outputs[1], dict): 77 | outputs[1]['padding'] = torch.tensor(self.padding) 78 | return outputs 79 | 80 | 81 | @register() 82 | class RandomIoUCrop(T.RandomIoUCrop): 83 | def __init__(self, min_scale: float = 0.3, max_scale: float = 1, min_aspect_ratio: float = 0.5, max_aspect_ratio: float = 2, sampler_options: Optional[List[float]] = None, trials: int = 40, p: float = 1.0): 84 | super().__init__(min_scale, max_scale, min_aspect_ratio, max_aspect_ratio, sampler_options, trials) 85 | self.p = p 86 | 87 | def __call__(self, *inputs: Any) -> Any: 88 | if torch.rand(1) >= self.p: 89 | return inputs if len(inputs) > 1 else inputs[0] 90 | 91 | return super().forward(*inputs) 92 | 93 | 94 | @register() 95 | class ConvertBoxes(T.Transform): 96 | _transformed_types = ( 97 | BoundingBoxes, 98 | ) 99 | def __init__(self, fmt='', normalize=False) -> None: 100 | super().__init__() 101 | self.fmt = fmt 102 | self.normalize = normalize 103 | 104 | def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: 105 | spatial_size = getattr(inpt, _boxes_keys[1]) 106 | if self.fmt: 107 | in_fmt = inpt.format.value.lower() 108 | inpt = torchvision.ops.box_convert(inpt, in_fmt=in_fmt, out_fmt=self.fmt.lower()) 109 | inpt = convert_to_tv_tensor(inpt, key='boxes', box_format=self.fmt.upper(), spatial_size=spatial_size) 110 | 111 | if self.normalize: 112 | inpt = inpt / torch.tensor(spatial_size[::-1]).tile(2)[None] 113 | 114 | return inpt 115 | 116 | 117 | @register() 118 | class ConvertPILImage(T.Transform): 119 | _transformed_types = ( 120 | PIL.Image.Image, 121 | ) 122 | def __init__(self, dtype='float32', scale=True) -> None: 123 | super().__init__() 124 | self.dtype = dtype 125 | self.scale = scale 126 | 127 | def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: 128 | inpt = F.pil_to_tensor(inpt) 129 | if self.dtype == 'float32': 130 | inpt = inpt.float() 131 | 132 | if self.scale: 133 | inpt = inpt / 255. 134 | 135 | inpt = Image(inpt) 136 | 137 | return inpt 138 | -------------------------------------------------------------------------------- /engine/data/transforms/container.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIM: DETR with Improved Matching for Fast Convergence 3 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from D-FINE (https://github.com/Peterande/D-FINE) 6 | Copyright (c) 2024 D-FINE authors. All Rights Reserved. 
7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | import torchvision 13 | import torchvision.transforms.v2 as T 14 | 15 | from typing import Any, Dict, List, Optional 16 | 17 | from ._transforms import EmptyTransform 18 | from ...core import register, GLOBAL_CONFIG 19 | torchvision.disable_beta_transforms_warning() 20 | import random 21 | 22 | 23 | @register() 24 | class Compose(T.Compose): 25 | def __init__(self, ops, policy=None, mosaic_prob=-0.1) -> None: 26 | transforms = [] 27 | if ops is not None: 28 | for op in ops: 29 | if isinstance(op, dict): 30 | name = op.pop('type') 31 | transform = getattr(GLOBAL_CONFIG[name]['_pymodule'], GLOBAL_CONFIG[name]['_name'])(**op) 32 | transforms.append(transform) 33 | op['type'] = name 34 | print(" ### Transform @{} ### ".format(type(transform).__name__)) 35 | 36 | elif isinstance(op, nn.Module): 37 | transforms.append(op) 38 | 39 | else: 40 | raise ValueError('') 41 | else: 42 | transforms =[EmptyTransform(), ] 43 | 44 | super().__init__(transforms=transforms) 45 | 46 | self.mosaic_prob = mosaic_prob 47 | if policy is None: 48 | policy = {'name': 'default'} 49 | else: 50 | if self.mosaic_prob > 0: 51 | print(" ### Mosaic with Prob.@{} and ZoomOut/IoUCrop existed ### ".format(self.mosaic_prob)) 52 | print(" ### ImgTransforms Epochs: {} ### ".format(policy['epoch'])) 53 | print(' ### Policy_ops@{} ###'.format(policy['ops'])) 54 | self.global_samples = 0 55 | self.policy = policy 56 | 57 | def forward(self, *inputs: Any) -> Any: 58 | return self.get_forward(self.policy['name'])(*inputs) 59 | 60 | def get_forward(self, name): 61 | forwards = { 62 | 'default': self.default_forward, 63 | 'stop_epoch': self.stop_epoch_forward, 64 | 'stop_sample': self.stop_sample_forward, 65 | } 66 | return forwards[name] 67 | 68 | def default_forward(self, *inputs: Any) -> Any: 69 | sample = inputs if len(inputs) > 1 else inputs[0] 70 | for transform in self.transforms: 71 | sample = transform(sample) 72 | return sample 73 | 74 | def stop_epoch_forward(self, *inputs: Any): 75 | sample = inputs if len(inputs) > 1 else inputs[0] 76 | dataset = sample[-1] 77 | cur_epoch = dataset.epoch 78 | policy_ops = self.policy['ops'] 79 | policy_epoch = self.policy['epoch'] 80 | 81 | if isinstance(policy_epoch, list) and len(policy_epoch) == 3: # 4-stages 82 | if policy_epoch[0] <= cur_epoch < policy_epoch[1]: 83 | with_mosaic = random.random() <= self.mosaic_prob # Probility for Mosaic 84 | else: 85 | with_mosaic = False 86 | for transform in self.transforms: 87 | if (type(transform).__name__ in policy_ops and cur_epoch < policy_epoch[0]): # first stage: NoAug 88 | pass 89 | elif (type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch[-1]): # last stage: NoAug 90 | pass 91 | else: 92 | # Using Mosaic for [policy_epoch[0], policy_epoch[1]] with probability 93 | if (type(transform).__name__ == 'Mosaic' and not with_mosaic): 94 | pass 95 | # Mosaic and Zoomout/IoUCrop can not be co-existed in the same sample 96 | elif (type(transform).__name__ == 'RandomZoomOut' or type(transform).__name__ == 'RandomIoUCrop') and with_mosaic: 97 | pass 98 | else: 99 | sample = transform(sample) 100 | else: # the default data scheduler 101 | for transform in self.transforms: 102 | if type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch: 103 | pass 104 | else: 105 | sample = transform(sample) 106 | 107 | return sample 108 | 109 | 110 | def stop_sample_forward(self, *inputs: Any): 111 | sample = inputs if len(inputs) > 1 else inputs[0] 112 | dataset = 
sample[-1] 113 | 114 | cur_epoch = dataset.epoch 115 | policy_ops = self.policy['ops'] 116 | policy_sample = self.policy['sample'] 117 | 118 | for transform in self.transforms: 119 | if type(transform).__name__ in policy_ops and self.global_samples >= policy_sample: 120 | pass 121 | else: 122 | sample = transform(sample) 123 | 124 | self.global_samples += 1 125 | 126 | return sample 127 | -------------------------------------------------------------------------------- /engine/deim/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIM: DETR with Improved Matching for Fast Convergence 3 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 6 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 7 | """ 8 | 9 | 10 | from .deim import DEIM 11 | 12 | from .matcher import HungarianMatcher 13 | 14 | from .hybrid_encoder import HybridEncoder 15 | from .lite_encoder import LiteEncoder 16 | 17 | 18 | from .dfine_decoder import DFINETransformer 19 | from .rtdetrv2_decoder import RTDETRTransformerv2 20 | 21 | from .postprocessor import PostProcessor 22 | from .deim_criterion import DEIMCriterion 23 | from .deim_decoder import DEIMTransformer -------------------------------------------------------------------------------- /engine/deim/box_ops.py: -------------------------------------------------------------------------------- 1 | """ 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | https://github.com/facebookresearch/detr/blob/main/util/box_ops.py 4 | """ 5 | 6 | import torch 7 | from torch import Tensor 8 | from torchvision.ops.boxes import box_area 9 | 10 | 11 | def box_cxcywh_to_xyxy(x): 12 | x_c, y_c, w, h = x.unbind(-1) 13 | b = [(x_c - 0.5 * w.clamp(min=0.0)), (y_c - 0.5 * h.clamp(min=0.0)), 14 | (x_c + 0.5 * w.clamp(min=0.0)), (y_c + 0.5 * h.clamp(min=0.0))] 15 | return torch.stack(b, dim=-1) 16 | 17 | 18 | def box_xyxy_to_cxcywh(x: Tensor) -> Tensor: 19 | x0, y0, x1, y1 = x.unbind(-1) 20 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 21 | (x1 - x0), (y1 - y0)] 22 | return torch.stack(b, dim=-1) 23 | 24 | 25 | # modified from torchvision to also return the union 26 | def box_iou(boxes1: Tensor, boxes2: Tensor): 27 | area1 = box_area(boxes1) 28 | area2 = box_area(boxes2) 29 | 30 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 31 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 32 | 33 | wh = (rb - lt).clamp(min=0) # [N,M,2] 34 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 35 | 36 | union = area1[:, None] + area2 - inter 37 | 38 | iou = inter / union 39 | return iou, union 40 | 41 | 42 | def generalized_box_iou(boxes1, boxes2): 43 | """ 44 | Generalized IoU from https://giou.stanford.edu/ 45 | 46 | The boxes should be in [x0, y0, x1, y1] format 47 | 48 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 49 | and M = len(boxes2) 50 | """ 51 | # degenerate boxes gives inf / nan results 52 | # so do an early check 53 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 54 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 55 | iou, union = box_iou(boxes1, boxes2) 56 | 57 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 58 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 59 | 60 | wh = (rb - lt).clamp(min=0) # [N,M,2] 61 | area = wh[:, :, 0] * wh[:, :, 1] 62 | 63 | return iou - (area - union) / area 64 | 65 | 66 | def masks_to_boxes(masks): 67 
| """Compute the bounding boxes around the provided masks 68 | 69 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 70 | 71 | Returns a [N, 4] tensors, with the boxes in xyxy format 72 | """ 73 | if masks.numel() == 0: 74 | return torch.zeros((0, 4), device=masks.device) 75 | 76 | h, w = masks.shape[-2:] 77 | 78 | y = torch.arange(0, h, dtype=torch.float) 79 | x = torch.arange(0, w, dtype=torch.float) 80 | y, x = torch.meshgrid(y, x) 81 | 82 | x_mask = (masks * x.unsqueeze(0)) 83 | x_max = x_mask.flatten(1).max(-1)[0] 84 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 85 | 86 | y_mask = (masks * y.unsqueeze(0)) 87 | y_max = y_mask.flatten(1).max(-1)[0] 88 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 89 | 90 | return torch.stack([x_min, y_min, x_max, y_max], 1) -------------------------------------------------------------------------------- /engine/deim/deim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 3 | """ 4 | 5 | import torch.nn as nn 6 | from ..core import register 7 | 8 | 9 | __all__ = ['DEIM', ] 10 | 11 | 12 | @register() 13 | class DEIM(nn.Module): 14 | __inject__ = ['backbone', 'encoder', 'decoder', ] 15 | 16 | def __init__(self, \ 17 | backbone: nn.Module, 18 | encoder: nn.Module, 19 | decoder: nn.Module, 20 | ): 21 | super().__init__() 22 | self.backbone = backbone 23 | self.decoder = decoder 24 | self.encoder = encoder 25 | 26 | def forward(self, x, targets=None): 27 | x = self.backbone(x) 28 | x = self.encoder(x) 29 | x = self.decoder(x, targets) 30 | 31 | return x 32 | 33 | def deploy(self, ): 34 | self.eval() 35 | for m in self.modules(): 36 | if hasattr(m, 'convert_to_deploy'): 37 | m.convert_to_deploy() 38 | return self 39 | -------------------------------------------------------------------------------- /engine/deim/deim_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from .utils import get_activation, bias_init_with_prob 7 | 8 | 9 | class RMSNorm(nn.Module): 10 | def __init__(self, dim: int, eps: float = 1e-6): 11 | super().__init__() 12 | self.dim = dim 13 | self.eps = eps 14 | self.scale = nn.Parameter(torch.ones(dim)) 15 | 16 | def _norm(self, x): 17 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 18 | 19 | def forward(self, x): 20 | output = self._norm(x.float()).type_as(x) 21 | output = output * self.scale 22 | return output 23 | 24 | def extra_repr(self) -> str: 25 | return f'dim={self.dim}, eps={self.eps}' 26 | 27 | # default 3-layer MLP 28 | class MLP(nn.Module): 29 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers=3, act='relu'): 30 | super().__init__() 31 | self.num_layers = num_layers 32 | h = [hidden_dim] * (num_layers - 1) 33 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 34 | self.act = get_activation(act) 35 | 36 | def forward(self, x): 37 | for i, layer in enumerate(self.layers): 38 | x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x) 39 | return x 40 | 41 | # Taken from: https://github.com/facebookresearch/dinov2/blob/main/dinov2/layers/swiglu_ffn.py#L14-L34 42 | class SwiGLUFFN(nn.Module): 43 | def __init__( 44 | self, 45 | in_features: int, 46 | hidden_features: int, 47 | 
out_features: int, 48 | bias: bool = True, 49 | ) -> None: 50 | super().__init__() 51 | out_features = out_features or in_features 52 | hidden_features = hidden_features or in_features 53 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 54 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 55 | self._reset_parameters() 56 | 57 | def _reset_parameters(self): 58 | init.xavier_uniform_(self.w12.weight) 59 | init.constant_(self.w12.bias, 0) 60 | init.xavier_uniform_(self.w3.weight) 61 | init.constant_(self.w3.bias, 0) 62 | 63 | def forward(self, x): 64 | x12 = self.w12(x) 65 | x1, x2 = x12.chunk(2, dim=-1) 66 | hidden = F.silu(x1) * x2 67 | return self.w3(hidden) 68 | 69 | 70 | class Gate(nn.Module): 71 | def __init__(self, d_model, use_rmsnorm=False): 72 | super(Gate, self).__init__() 73 | self.gate = nn.Linear(2 * d_model, 2 * d_model) 74 | bias = bias_init_with_prob(0.5) 75 | init.constant_(self.gate.bias, bias) 76 | init.constant_(self.gate.weight, 0) 77 | self.norm = RMSNorm(d_model) if use_rmsnorm else nn.LayerNorm(d_model) 78 | 79 | def forward(self, x1, x2): 80 | gate_input = torch.cat([x1, x2], dim=-1) 81 | gates = torch.sigmoid(self.gate(gate_input)) 82 | gate1, gate2 = gates.chunk(2, dim=-1) 83 | return self.norm(gate1 * x1 + gate2 * x2) -------------------------------------------------------------------------------- /engine/deim/denoising.py: -------------------------------------------------------------------------------- 1 | """Copyright(c) 2023 lyuwenyu. All Rights Reserved. 2 | Modifications Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 3 | """ 4 | 5 | import torch 6 | 7 | from .utils import inverse_sigmoid 8 | from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh 9 | 10 | 11 | 12 | def get_contrastive_denoising_training_group(targets, 13 | num_classes, 14 | num_queries, 15 | class_embed, 16 | num_denoising=100, 17 | label_noise_ratio=0.5, 18 | box_noise_scale=1.0,): 19 | """cnd""" 20 | if num_denoising <= 0: 21 | return None, None, None, None 22 | 23 | num_gts = [len(t['labels']) for t in targets] 24 | device = targets[0]['labels'].device 25 | 26 | max_gt_num = max(num_gts) 27 | if max_gt_num == 0: 28 | return None, None, None, None 29 | 30 | num_group = num_denoising // max_gt_num 31 | num_group = 1 if num_group == 0 else num_group 32 | # pad gt to max_num of a batch 33 | bs = len(num_gts) 34 | 35 | input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device) 36 | input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device) 37 | pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device) 38 | 39 | for i in range(bs): 40 | num_gt = num_gts[i] 41 | if num_gt > 0: 42 | input_query_class[i, :num_gt] = targets[i]['labels'] 43 | input_query_bbox[i, :num_gt] = targets[i]['boxes'] 44 | pad_gt_mask[i, :num_gt] = 1 45 | # each group has positive and negative queries. 
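# layout after the tiles below: [pos_0 .. pos_{max_gt_num-1}, neg_0 .. neg_{max_gt_num-1}]
# repeated num_group times along dim 1; positives receive small box jitter while
# negatives get boxes shifted by 1-2x their half size (the rand_part + 1.0 branch
# under box_noise_scale).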
46 | input_query_class = input_query_class.tile([1, 2 * num_group]) 47 | input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) 48 | pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) 49 | # positive and negative mask 50 | negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device) 51 | negative_gt_mask[:, max_gt_num:] = 1 52 | negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) 53 | positive_gt_mask = 1 - negative_gt_mask 54 | # contrastive denoising training positive index 55 | positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask 56 | dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] 57 | dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts]) 58 | # total denoising queries 59 | num_denoising = int(max_gt_num * 2 * num_group) 60 | 61 | if label_noise_ratio > 0: 62 | mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5) 63 | # randomly put a new one here 64 | new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype) 65 | input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class) 66 | 67 | if box_noise_scale > 0: 68 | known_bbox = box_cxcywh_to_xyxy(input_query_bbox) 69 | diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale 70 | rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 71 | rand_part = torch.rand_like(input_query_bbox) 72 | rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask) 73 | known_bbox += (rand_sign * rand_part * diff) 74 | known_bbox = torch.clip(known_bbox, min=0.0, max=1.0) 75 | input_query_bbox = box_xyxy_to_cxcywh(known_bbox) 76 | input_query_bbox[input_query_bbox < 0] *= -1 77 | input_query_bbox_unact = inverse_sigmoid(input_query_bbox) 78 | 79 | input_query_logits = class_embed(input_query_class) 80 | 81 | tgt_size = num_denoising + num_queries 82 | attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device) 83 | # match query cannot see the reconstruction 84 | attn_mask[num_denoising:, :num_denoising] = True 85 | 86 | # reconstruct cannot see each other 87 | for i in range(num_group): 88 | if i == 0: 89 | attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True 90 | if i == num_group - 1: 91 | attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True 92 | else: 93 | attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True 94 | attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True 95 | 96 | dn_meta = { 97 | "dn_positive_idx": dn_positive_idx, 98 | "dn_num_group": num_group, 99 | "dn_num_split": [num_denoising, num_queries] 100 | } 101 | 102 | # print(input_query_class.shape) # torch.Size([4, 196, 256]) 103 | # print(input_query_bbox.shape) # torch.Size([4, 196, 4]) 104 | # print(attn_mask.shape) # torch.Size([496, 496]) 105 | 106 | return input_query_logits, input_query_bbox_unact, attn_mask, dn_meta 107 | -------------------------------------------------------------------------------- /engine/deim/lite_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIM: DETR with Improved Matching for Fast Convergence 3 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 
4 | --------------------------------------------------------------------------------- 5 | Modified from D-FINE (https://github.com/Peterande/D-FINE/) 6 | Copyright (c) 2024 D-FINE Authors. All Rights Reserved. 7 | """ 8 | 9 | import copy 10 | from collections import OrderedDict 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from functools import partial 16 | 17 | from .utils import get_activation 18 | 19 | from ..core import register 20 | from .hybrid_encoder import ConvNormLayer_fuse 21 | from .hybrid_encoder import RepNCSPELAN4 22 | 23 | __all__ = ['LiteEncoder'] 24 | 25 | 26 | # Copy from https://github.com/meituan/YOLOv6/blob/main/yolov6/layers/common.py#L695 27 | class GAP_Fusion(nn.Module): 28 | '''BiFusion Block in PAN''' 29 | def __init__(self, in_channels, out_channels, act=None): 30 | super().__init__() 31 | self.cv = ConvNormLayer_fuse(out_channels, out_channels, 1, 1, act=act) 32 | 33 | def forward(self, x): 34 | # global average pooling 35 | gap = F.adaptive_avg_pool2d(x, 1) 36 | x = x + gap 37 | return self.cv(x) 38 | 39 | # Two-scale encoder 40 | @register() 41 | class LiteEncoder(nn.Module): 42 | __share__ = ['eval_spatial_size', ] 43 | 44 | def __init__(self, 45 | in_channels=[512], 46 | feat_strides=[16], 47 | hidden_dim=256, 48 | expansion=1.0, 49 | depth_mult=1.0, 50 | act='silu', 51 | eval_spatial_size=None, 52 | csp_type='csp2', 53 | ): 54 | super().__init__() 55 | self.in_channels = in_channels 56 | self.feat_strides = feat_strides 57 | self.hidden_dim = hidden_dim 58 | self.eval_spatial_size = eval_spatial_size 59 | self.out_channels = [hidden_dim for _ in range(len(in_channels))] 60 | self.out_strides = feat_strides 61 | 62 | # channel projection: unify the channel dimension of the input features 63 | self.input_proj = nn.ModuleList() 64 | for in_channel in in_channels: 65 | proj = nn.Sequential(OrderedDict([ 66 | ('conv', nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False)), 67 | ('norm', nn.BatchNorm2d(hidden_dim)) 68 | ])) 69 | 70 | self.input_proj.append(proj) 71 | 72 | # get the small-scale feature 73 | down_sample = nn.Sequential( # avg pooling 74 | nn.AvgPool2d(kernel_size=3, stride=2, padding=1), 75 | nn.Conv2d(hidden_dim, hidden_dim, 1, 1, bias=False), 76 | nn.BatchNorm2d(hidden_dim), 77 | get_activation(act) 78 | ) 79 | self.down_sample1 = copy.deepcopy(down_sample) 80 | self.down_sample2 = copy.deepcopy(down_sample) 81 | 82 | # Bi-Fusion 83 | self.bi_fusion = GAP_Fusion(hidden_dim, hidden_dim, act=act) 84 | 85 | # fuse block 86 | c1, c2, c3, c4, num_blocks = hidden_dim, hidden_dim, hidden_dim*2, round(expansion * hidden_dim // 2), round(3 * depth_mult) 87 | fuse_block = RepNCSPELAN4(c1=c1, c2=c2, c3=c3, c4=c4, n=num_blocks, act=act, csp_type=csp_type) 88 | self.fpn_block = copy.deepcopy(fuse_block) 89 | self.pan_block = copy.deepcopy(fuse_block) 90 | 91 | def forward(self, feats): 92 | assert len(feats) == len(self.in_channels) 93 | proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] 94 | proj_feats.append(self.down_sample1(proj_feats[-1])) # get the small-scale feature 95 | 96 | # fuse the global feature and the small-scale feature 97 | proj_feats[-1] = self.bi_fusion(proj_feats[-1]) 98 | 99 | outs = [] 100 | # fpn 101 | fuse_feat = proj_feats[0] + F.interpolate(proj_feats[1], scale_factor=2., mode='nearest') 102 | outs.append(self.fpn_block(fuse_feat)) 103 | 104 | fuse_feat = proj_feats[1] + self.down_sample2(outs[-1]) 105 | outs.append(self.pan_block(fuse_feat)) 106 | 107 | 
return outs
--------------------------------------------------------------------------------
/engine/deim/postprocessor.py:
--------------------------------------------------------------------------------
1 | """
2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4 | """
5 | 
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | 
10 | import torchvision
11 | 
12 | from ..core import register
13 | 
14 | 
15 | __all__ = ['PostProcessor']
16 | 
17 | 
18 | def mod(a, b):
19 | out = a - a // b * b  # equivalent to a % b, written with explicit ops to keep exported graphs simple
20 | return out
21 | 
22 | 
23 | @register()
24 | class PostProcessor(nn.Module):
25 | __share__ = [
26 | 'num_classes',
27 | 'use_focal_loss',
28 | 'num_top_queries',
29 | 'remap_mscoco_category'
30 | ]
31 | 
32 | def __init__(
33 | self,
34 | num_classes=80,
35 | use_focal_loss=True,
36 | num_top_queries=300,
37 | remap_mscoco_category=False
38 | ) -> None:
39 | super().__init__()
40 | self.use_focal_loss = use_focal_loss
41 | self.num_top_queries = num_top_queries
42 | self.num_classes = int(num_classes)
43 | self.remap_mscoco_category = remap_mscoco_category
44 | self.deploy_mode = False
45 | 
46 | def extra_repr(self) -> str:
47 | return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}'
48 | 
49 | # def forward(self, outputs, orig_target_sizes):
50 | def forward(self, outputs, orig_target_sizes: torch.Tensor):
51 | logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
52 | # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
53 | 
54 | bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
55 | bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
56 | 
57 | if self.use_focal_loss:
58 | scores = F.sigmoid(logits)
59 | scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
60 | # labels = index % self.num_classes
61 | labels = mod(index, self.num_classes)  # class id within the flattened (query, class) index
62 | index = index // self.num_classes  # query id within the flattened (query, class) index
63 | boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]))
64 | 
65 | else:
66 | scores = F.softmax(logits, dim=-1)[:, :, :-1]
67 | scores, labels = scores.max(dim=-1)
68 | boxes = bbox_pred  # keep this branch consistent with the focal branch (rescaled xyxy boxes)
69 | if scores.shape[1] > self.num_top_queries:
70 | scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
71 | labels = torch.gather(labels, dim=1, index=index)
72 | boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
73 | 
74 | if self.deploy_mode:
75 | return labels, boxes, scores
76 | 
77 | if self.remap_mscoco_category:
78 | from ..data.dataset import mscoco_label2category
79 | labels = torch.tensor([mscoco_label2category[int(x.item())] for x in labels.flatten()])\
80 | .to(boxes.device).reshape(labels.shape)
81 | 
82 | results = []
83 | for lab, box, sco in zip(labels, boxes, scores):
84 | result = dict(labels=lab, boxes=box, scores=sco)
85 | results.append(result)
86 | 
87 | return results
88 | 
89 | 
90 | def deploy(self, ):
91 | self.eval()
92 | self.deploy_mode = True
93 | return self
94 | 
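A small, self-contained illustration of the flattened top-k decoding used in the focal branch above; the tensor sizes here are made up:

import torch

num_queries, num_classes = 5, 80
scores = torch.rand(1, num_queries, num_classes)

top_scores, index = torch.topk(scores.flatten(1), k=3, dim=-1)
labels = index % num_classes      # class id, what mod(index, num_classes) computes
query_idx = index // num_classes  # which query each top score came from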
4 | """ 5 | 6 | from .logger import * 7 | from .visualizer import * 8 | from .dist_utils import setup_seed, setup_print 9 | from .profiler_utils import stats 10 | -------------------------------------------------------------------------------- /engine/misc/box_ops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torchvision 8 | from torch import Tensor 9 | from typing import List, Tuple 10 | 11 | 12 | def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: 13 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 14 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 15 | return torchvision.ops.generalized_box_iou(boxes1, boxes2) 16 | 17 | 18 | # elementwise 19 | def elementwise_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: 20 | """ 21 | Args: 22 | boxes1, [N, 4] 23 | boxes2, [N, 4] 24 | Returns: 25 | iou, [N, ] 26 | union, [N, ] 27 | """ 28 | area1 = torchvision.ops.box_area(boxes1) # [N, ] 29 | area2 = torchvision.ops.box_area(boxes2) # [N, ] 30 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N, 2] 31 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2] 32 | wh = (rb - lt).clamp(min=0) # [N, 2] 33 | inter = wh[:, 0] * wh[:, 1] # [N, ] 34 | union = area1 + area2 - inter 35 | iou = inter / union 36 | return iou, union 37 | 38 | 39 | def elementwise_generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: 40 | """ 41 | Args: 42 | boxes1, [N, 4] with [x1, y1, x2, y2] 43 | boxes2, [N, 4] with [x1, y1, x2, y2] 44 | Returns: 45 | giou, [N, ] 46 | """ 47 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 48 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 49 | iou, union = elementwise_box_iou(boxes1, boxes2) 50 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) # [N, 2] 51 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2] 52 | wh = (rb - lt).clamp(min=0) # [N, 2] 53 | area = wh[:, 0] * wh[:, 1] 54 | return iou - (area - union) / area 55 | 56 | 57 | def check_point_inside_box(points: Tensor, boxes: Tensor, eps=1e-9) -> Tensor: 58 | """ 59 | Args: 60 | points, [K, 2], (x, y) 61 | boxes, [N, 4], (x1, y1, y2, y2) 62 | Returns: 63 | Tensor (bool), [K, N] 64 | """ 65 | x, y = [p.unsqueeze(-1) for p in points.unbind(-1)] 66 | x1, y1, x2, y2 = [x.unsqueeze(0) for x in boxes.unbind(-1)] 67 | 68 | l = x - x1 69 | t = y - y1 70 | r = x2 - x 71 | b = y2 - y 72 | 73 | ltrb = torch.stack([l, t, r, b], dim=-1) 74 | mask = ltrb.min(dim=-1).values > eps 75 | 76 | return mask 77 | 78 | 79 | def point_box_distance(points: Tensor, boxes: Tensor) -> Tensor: 80 | """ 81 | Args: 82 | boxes, [N, 4], (x1, y1, x2, y2) 83 | points, [N, 2], (x, y) 84 | Returns: 85 | Tensor (N, 4), (l, t, r, b) 86 | """ 87 | x1y1, x2y2 = torch.split(boxes, 2, dim=-1) 88 | lt = points - x1y1 89 | rb = x2y2 - points 90 | return torch.concat([lt, rb], dim=-1) 91 | 92 | 93 | def point_distance_box(points: Tensor, distances: Tensor) -> Tensor: 94 | """ 95 | Args: 96 | points (Tensor), [N, 2], (x, y) 97 | distances (Tensor), [N, 4], (l, t, r, b) 98 | Returns: 99 | boxes (Tensor), (N, 4), (x1, y1, x2, y2) 100 | """ 101 | lt, rb = torch.split(distances, 2, dim=-1) 102 | x1y1 = -lt + points 103 | x2y2 = rb + points 104 | boxes = torch.concat([x1y1, x2y2], dim=-1) 105 | return boxes 106 | -------------------------------------------------------------------------------- /engine/misc/lazy_loader.py: 
--------------------------------------------------------------------------------
/engine/misc/lazy_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/util/lazy_loader.py
3 | """
4 | 
5 | 
6 | import types
7 | import importlib
8 | 
9 | class LazyLoader(types.ModuleType):
10 | """Lazily import a module, mainly to avoid pulling in large dependencies.
11 | 
12 | `paddle` and `ffmpeg` are examples of modules that are large and not always
13 | needed, and this allows them to only be loaded when they are used.
14 | """
15 | 
16 | # The lint error here is incorrect.
17 | def __init__(self, local_name, parent_module_globals, name, warning=None):
18 | self._local_name = local_name
19 | self._parent_module_globals = parent_module_globals
20 | self._warning = warning
21 | 
22 | # These members allow doctest to correctly process this module member without
23 | # triggering self._load(). self._load() mutates parent_module_globals and
24 | # triggers a 'dict mutated during iteration' error from doctest.py.
25 | # - for from_module()
26 | self.__module__ = name.rsplit(".", 1)[0]
27 | # - for is_routine()
28 | self.__wrapped__ = None
29 | 
30 | super(LazyLoader, self).__init__(name)
31 | 
32 | def _load(self):
33 | """Load the module and insert it into the parent's globals."""
34 | # Import the target module and insert it into the parent's namespace
35 | module = importlib.import_module(self.__name__)
36 | self._parent_module_globals[self._local_name] = module
37 | 
38 | # Emit a warning if one was specified
39 | if self._warning:
40 | # logging.warning(self._warning)
41 | # Make sure to only warn once.
42 | self._warning = None
43 | 
44 | # Update this object's dict so that if someone keeps a reference to the
45 | # LazyLoader, lookups are efficient (__getattr__ is only called on lookups
46 | # that fail).
47 | self.__dict__.update(module.__dict__)
48 | 
49 | return module
50 | 
51 | def __getattr__(self, item):
52 | module = self._load()
53 | return getattr(module, item)
54 | 
55 | def __repr__(self):
56 | # Careful not to trigger _load, since repr may be called in very
57 | # sensitive places.
58 | return f"<LazyLoader {self._local_name} for module '{self.__name__}'>"
59 | 
60 | def __dir__(self):
61 | module = self._load()
62 | return dir(module)
63 | 
64 | 
65 | # import paddle.nn as nn
66 | # nn = LazyLoader("nn", globals(), "paddle.nn")
67 | 
68 | # class M(nn.Layer):
69 | #     def __init__(self) -> None:
70 | #         super().__init__()
71 | 
--------------------------------------------------------------------------------
/engine/misc/profiler_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3 | """ 4 | 5 | import copy 6 | from calflops import calculate_flops 7 | from typing import Tuple 8 | 9 | def stats( 10 | cfg, 11 | input_shape: Tuple=(1, 3, 640, 640), ) -> Tuple[int, dict]: 12 | 13 | base_size = cfg.train_dataloader.collate_fn.base_size 14 | input_shape = (1, 3, base_size, base_size) 15 | 16 | model_for_info = copy.deepcopy(cfg.model).deploy() 17 | 18 | flops, macs, _ = calculate_flops(model=model_for_info, 19 | input_shape=input_shape, 20 | output_as_string=True, 21 | output_precision=4, 22 | print_detailed=False) 23 | params = sum(p.numel() for p in model_for_info.parameters()) 24 | del model_for_info 25 | 26 | return params, {"Model FLOPs:%s MACs:%s Params:%s" %(flops, macs, params)} 27 | -------------------------------------------------------------------------------- /engine/misc/visualizer.py: -------------------------------------------------------------------------------- 1 | """" 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import PIL 7 | import torch 8 | import torch.utils.data 9 | import torchvision 10 | torchvision.disable_beta_transforms_warning() 11 | 12 | __all__ = ['show_sample'] 13 | 14 | def show_sample(sample): 15 | """for coco dataset/dataloader 16 | """ 17 | import matplotlib.pyplot as plt 18 | from torchvision.transforms.v2 import functional as F 19 | from torchvision.utils import draw_bounding_boxes 20 | 21 | image, target = sample 22 | if isinstance(image, PIL.Image.Image): 23 | image = F.to_image_tensor(image) 24 | 25 | image = F.convert_dtype(image, torch.uint8) 26 | annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3) 27 | 28 | fig, ax = plt.subplots() 29 | ax.imshow(annotated_image.permute(1, 2, 0).numpy()) 30 | ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) 31 | fig.tight_layout() 32 | fig.show() 33 | plt.show() 34 | -------------------------------------------------------------------------------- /engine/optim/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | from .ema import * 7 | from .optim import * 8 | from .amp import * 9 | from .warmup import * 10 | -------------------------------------------------------------------------------- /engine/optim/amp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | 7 | import torch.cuda.amp as amp 8 | 9 | from ..core import register 10 | 11 | 12 | __all__ = ['GradScaler'] 13 | 14 | GradScaler = register()(amp.grad_scaler.GradScaler) 15 | -------------------------------------------------------------------------------- /engine/optim/ema.py: -------------------------------------------------------------------------------- 1 | """ 2 | D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement 3 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 6 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
7 | """ 8 | 9 | 10 | import torch 11 | import torch.nn as nn 12 | 13 | import math 14 | from copy import deepcopy 15 | 16 | from ..core import register 17 | from ..misc import dist_utils 18 | 19 | __all__ = ['ModelEMA'] 20 | 21 | 22 | @register() 23 | class ModelEMA(object): 24 | """ 25 | Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models 26 | Keep a moving average of everything in the model state_dict (parameters and buffers). 27 | This is intended to allow functionality like 28 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 29 | A smoothed version of the weights is necessary for some training schemes to perform well. 30 | This class is sensitive where it is initialized in the sequence of model init, 31 | GPU assignment and distributed training wrappers. 32 | """ 33 | def __init__(self, model: nn.Module, decay: float=0.9999, warmups: int=1000, start: int=0): 34 | super().__init__() 35 | 36 | self.module = deepcopy(dist_utils.de_parallel(model)).eval() 37 | # if next(model.parameters()).device.type != 'cpu': 38 | # self.module.half() # FP16 EMA 39 | 40 | self.decay = decay 41 | self.warmups = warmups 42 | self.before_start = 0 43 | self.start = start 44 | self.updates = 0 # number of EMA updates 45 | if warmups == 0: 46 | self.decay_fn = lambda x: decay 47 | else: 48 | self.decay_fn = lambda x: decay * (1 - math.exp(-x / warmups)) # decay exponential ramp (to help early epochs) 49 | 50 | for p in self.module.parameters(): 51 | p.requires_grad_(False) 52 | 53 | 54 | def update(self, model: nn.Module): 55 | if self.before_start < self.start: 56 | self.before_start += 1 57 | return 58 | # Update EMA parameters 59 | with torch.no_grad(): 60 | self.updates += 1 61 | d = self.decay_fn(self.updates) 62 | msd = dist_utils.de_parallel(model).state_dict() 63 | for k, v in self.module.state_dict().items(): 64 | if v.dtype.is_floating_point: 65 | v *= d 66 | v += (1 - d) * msd[k].detach() 67 | 68 | def to(self, *args, **kwargs): 69 | self.module = self.module.to(*args, **kwargs) 70 | return self 71 | 72 | def state_dict(self, ): 73 | return dict(module=self.module.state_dict(), updates=self.updates) 74 | 75 | def load_state_dict(self, state, strict=True): 76 | self.module.load_state_dict(state['module'], strict=strict) 77 | if 'updates' in state: 78 | self.updates = state['updates'] 79 | 80 | def forwad(self, ): 81 | raise RuntimeError('ema...') 82 | 83 | def extra_repr(self) -> str: 84 | return f'decay={self.decay}, warmups={self.warmups}' 85 | 86 | 87 | 88 | class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel): 89 | """Maintains moving averages of model parameters using an exponential decay. 90 | ``ema_avg = decay * avg_model_param + (1 - decay) * model_param`` 91 | `torch.optim.swa_utils.AveragedModel `_ 92 | is used to compute the EMA. 
93 | """ 94 | def __init__(self, model, decay, device="cpu", use_buffers=True): 95 | 96 | self.decay_fn = lambda x: decay * (1 - math.exp(-x / 2000)) 97 | 98 | def ema_avg(avg_model_param, model_param, num_averaged): 99 | decay = self.decay_fn(num_averaged) 100 | return decay * avg_model_param + (1 - decay) * model_param 101 | 102 | super().__init__(model, device, ema_avg, use_buffers=use_buffers) 103 | -------------------------------------------------------------------------------- /engine/optim/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIM: DETR with Improved Matching for Fast Convergence 3 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 4 | """ 5 | 6 | import math 7 | from functools import partial 8 | 9 | 10 | def flat_cosine_schedule(total_iter, warmup_iter, flat_iter, no_aug_iter, current_iter, init_lr, min_lr): 11 | """ 12 | Computes the learning rate using a warm-up, flat, and cosine decay schedule. 13 | 14 | Args: 15 | total_iter (int): Total number of iterations. 16 | warmup_iter (int): Number of iterations for warm-up phase. 17 | flat_iter (int): Number of iterations for flat phase. 18 | no_aug_iter (int): Number of iterations for no-augmentation phase. 19 | current_iter (int): Current iteration. 20 | init_lr (float): Initial learning rate. 21 | min_lr (float): Minimum learning rate. 22 | 23 | Returns: 24 | float: Calculated learning rate. 25 | """ 26 | if current_iter <= warmup_iter: 27 | return init_lr * (current_iter / float(warmup_iter)) ** 2 28 | elif warmup_iter < current_iter <= flat_iter: 29 | return init_lr 30 | elif current_iter >= total_iter - no_aug_iter: 31 | return min_lr 32 | else: 33 | cosine_decay = 0.5 * (1 + math.cos(math.pi * (current_iter - flat_iter) / 34 | (total_iter - flat_iter - no_aug_iter))) 35 | return min_lr + (init_lr - min_lr) * cosine_decay 36 | 37 | 38 | class FlatCosineLRScheduler: 39 | """ 40 | Learning rate scheduler with warm-up, optional flat phase, and cosine decay following RTMDet. 41 | 42 | Args: 43 | optimizer (torch.optim.Optimizer): Optimizer instance. 44 | lr_gamma (float): Scaling factor for the minimum learning rate. 45 | iter_per_epoch (int): Number of iterations per epoch. 46 | total_epochs (int): Total number of training epochs. 47 | warmup_epochs (int): Number of warm-up epochs. 48 | flat_epochs (int): Number of flat epochs (for flat-cosine scheduler). 49 | no_aug_epochs (int): Number of no-augmentation epochs. 50 | """ 51 | def __init__(self, optimizer, lr_gamma, iter_per_epoch, total_epochs, 52 | warmup_iter, flat_epochs, no_aug_epochs, scheduler_type="cosine"): 53 | self.base_lrs = [group["initial_lr"] for group in optimizer.param_groups] 54 | self.min_lrs = [base_lr * lr_gamma for base_lr in self.base_lrs] 55 | 56 | total_iter = int(iter_per_epoch * total_epochs) 57 | no_aug_iter = int(iter_per_epoch * no_aug_epochs) 58 | flat_iter = int(iter_per_epoch * flat_epochs) 59 | 60 | print(self.base_lrs, self.min_lrs, total_iter, warmup_iter, flat_iter, no_aug_iter) 61 | self.lr_func = partial(flat_cosine_schedule, total_iter, warmup_iter, flat_iter, no_aug_iter) 62 | 63 | def step(self, current_iter, optimizer): 64 | """ 65 | Updates the learning rate of the optimizer at the current iteration. 66 | 67 | Args: 68 | current_iter (int): Current iteration. 69 | optimizer (torch.optim.Optimizer): Optimizer instance. 
70 | """ 71 | for i, group in enumerate(optimizer.param_groups): 72 | group["lr"] = self.lr_func(current_iter, self.base_lrs[i], self.min_lrs[i]) 73 | return optimizer 74 | -------------------------------------------------------------------------------- /engine/optim/optim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | 7 | import torch.optim as optim 8 | import torch.optim.lr_scheduler as lr_scheduler 9 | 10 | from ..core import register 11 | 12 | 13 | __all__ = ['AdamW', 'SGD', 'Adam', 'MultiStepLR', 'CosineAnnealingLR', 'OneCycleLR', 'LambdaLR'] 14 | 15 | 16 | 17 | SGD = register()(optim.SGD) 18 | Adam = register()(optim.Adam) 19 | AdamW = register()(optim.AdamW) 20 | 21 | 22 | MultiStepLR = register()(lr_scheduler.MultiStepLR) 23 | CosineAnnealingLR = register()(lr_scheduler.CosineAnnealingLR) 24 | OneCycleLR = register()(lr_scheduler.OneCycleLR) 25 | LambdaLR = register()(lr_scheduler.LambdaLR) 26 | -------------------------------------------------------------------------------- /engine/optim/warmup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | from torch.optim.lr_scheduler import LRScheduler 7 | 8 | from ..core import register 9 | 10 | 11 | class Warmup(object): 12 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int=-1) -> None: 13 | self.lr_scheduler = lr_scheduler 14 | self.warmup_end_values = [pg['lr'] for pg in lr_scheduler.optimizer.param_groups] 15 | self.last_step = last_step 16 | self.warmup_duration = warmup_duration 17 | self.step() 18 | 19 | def state_dict(self): 20 | return {k: v for k, v in self.__dict__.items() if k != 'lr_scheduler'} 21 | 22 | def load_state_dict(self, state_dict): 23 | self.__dict__.update(state_dict) 24 | 25 | def get_warmup_factor(self, step, **kwargs): 26 | raise NotImplementedError 27 | 28 | def step(self, ): 29 | self.last_step += 1 30 | if self.last_step >= self.warmup_duration: 31 | return 32 | factor = self.get_warmup_factor(self.last_step) 33 | for i, pg in enumerate(self.lr_scheduler.optimizer.param_groups): 34 | pg['lr'] = factor * self.warmup_end_values[i] 35 | 36 | def finished(self, ): 37 | if self.last_step >= self.warmup_duration: 38 | return True 39 | return False 40 | 41 | 42 | @register() 43 | class LinearWarmup(Warmup): 44 | def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int = -1) -> None: 45 | super().__init__(lr_scheduler, warmup_duration, last_step) 46 | 47 | def get_warmup_factor(self, step): 48 | return min(1.0, (step + 1) / self.warmup_duration) 49 | -------------------------------------------------------------------------------- /engine/solver/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | from ._solver import BaseSolver 7 | from .clas_solver import ClasSolver 8 | from .det_solver import DetSolver 9 | 10 | 11 | 12 | from typing import Dict 13 | 14 | TASKS :Dict[str, BaseSolver] = { 15 | 'classification': ClasSolver, 16 | 'detection': DetSolver, 17 | } 18 | -------------------------------------------------------------------------------- /engine/solver/clas_engine.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from ..misc import (MetricLogger, SmoothedValue, reduce_dict) 10 | 11 | 12 | def train_one_epoch(model: nn.Module, criterion: nn.Module, dataloader, optimizer, ema, epoch, device): 13 | """ 14 | """ 15 | model.train() 16 | 17 | metric_logger = MetricLogger(delimiter=" ") 18 | metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}')) 19 | print_freq = 100 20 | header = 'Epoch: [{}]'.format(epoch) 21 | 22 | for imgs, labels in metric_logger.log_every(dataloader, print_freq, header): 23 | imgs = imgs.to(device) 24 | labels = labels.to(device) 25 | 26 | preds = model(imgs) 27 | loss: torch.Tensor = criterion(preds, labels, epoch) 28 | 29 | optimizer.zero_grad() 30 | loss.backward() 31 | optimizer.step() 32 | 33 | if ema is not None: 34 | ema.update(model) 35 | 36 | loss_reduced_values = {k: v.item() for k, v in reduce_dict({'loss': loss}).items()} 37 | metric_logger.update(**loss_reduced_values) 38 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 39 | 40 | metric_logger.synchronize_between_processes() 41 | print("Averaged stats:", metric_logger) 42 | 43 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 44 | return stats 45 | 46 | 47 | 48 | @torch.no_grad() 49 | def evaluate(model, criterion, dataloader, device): 50 | model.eval() 51 | 52 | metric_logger = MetricLogger(delimiter=" ") 53 | # metric_logger.add_meter('acc', SmoothedValue(window_size=1, fmt='{global_avg:.4f}')) 54 | # metric_logger.add_meter('loss', SmoothedValue(window_size=1, fmt='{value:.2f}')) 55 | metric_logger.add_meter('acc', SmoothedValue(window_size=1)) 56 | metric_logger.add_meter('loss', SmoothedValue(window_size=1)) 57 | 58 | header = 'Test:' 59 | for imgs, labels in metric_logger.log_every(dataloader, 10, header): 60 | imgs, labels = imgs.to(device), labels.to(device) 61 | preds = model(imgs) 62 | 63 | acc = (preds.argmax(dim=-1) == labels).sum() / preds.shape[0] 64 | loss = criterion(preds, labels) 65 | 66 | dict_reduced = reduce_dict({'acc': acc, 'loss': loss}) 67 | reduced_values = {k: v.item() for k, v in dict_reduced.items()} 68 | metric_logger.update(**reduced_values) 69 | 70 | metric_logger.synchronize_between_processes() 71 | print("Averaged stats:", metric_logger) 72 | 73 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 74 | return stats 75 | -------------------------------------------------------------------------------- /engine/solver/clas_solver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | import time 7 | import json 8 | import datetime 9 | from pathlib import Path 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | from ..misc import dist_utils 15 | from ._solver import BaseSolver 16 | from .clas_engine import train_one_epoch, evaluate 17 | 18 | 19 | class ClasSolver(BaseSolver): 20 | 21 | def fit(self, ): 22 | print("Start training") 23 | self.train() 24 | args = self.cfg 25 | 26 | n_parameters = sum(p.numel() for p in self.model.parameters() if p.requires_grad) 27 | print('Number of params:', n_parameters) 28 | 29 | output_dir = Path(args.output_dir) 30 | output_dir.mkdir(exist_ok=True) 31 | 32 | start_time = time.time() 33 | start_epoch = self.last_epoch + 1 34 | for epoch in range(start_epoch, args.epoches): 35 | 36 | if dist_utils.is_dist_available_and_initialized(): 37 | self.train_dataloader.sampler.set_epoch(epoch) 38 | 39 | train_stats = train_one_epoch(self.model, 40 | self.criterion, 41 | self.train_dataloader, 42 | self.optimizer, 43 | self.ema, 44 | epoch=epoch, 45 | device=self.device) 46 | self.lr_scheduler.step() 47 | self.last_epoch += 1 48 | 49 | if output_dir: 50 | checkpoint_paths = [output_dir / 'checkpoint.pth'] 51 | # extra checkpoint before LR drop and every 100 epochs 52 | if (epoch + 1) % args.checkpoint_freq == 0: 53 | checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') 54 | for checkpoint_path in checkpoint_paths: 55 | dist_utils.save_on_master(self.state_dict(epoch), checkpoint_path) 56 | 57 | module = self.ema.module if self.ema else self.model 58 | test_stats = evaluate(module, self.criterion, self.val_dataloader, self.device) 59 | 60 | log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, 61 | **{f'test_{k}': v for k, v in test_stats.items()}, 62 | 'epoch': epoch, 63 | 'n_parameters': n_parameters} 64 | 65 | if output_dir and dist_utils.is_main_process(): 66 | with (output_dir / "log.txt").open("a") as f: 67 | f.write(json.dumps(log_stats) + "\n") 68 | 69 | total_time = time.time() - start_time 70 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 71 | print('Training time {}'.format(total_time_str)) 72 | -------------------------------------------------------------------------------- /figures/deimv2_coco_AP_vs_GFLOPs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intellindust-AI-Lab/DEIMv2/19d5b19a58c229dd7ad5f079947bbe398e005d01/figures/deimv2_coco_AP_vs_GFLOPs.png -------------------------------------------------------------------------------- /figures/deimv2_coco_AP_vs_Params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intellindust-AI-Lab/DEIMv2/19d5b19a58c229dd7ad5f079947bbe398e005d01/figures/deimv2_coco_AP_vs_Params.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.5.1 2 | torchvision==0.20.1 3 | faster-coco-eval>=1.6.7 4 | PyYAML 5 | tensorboard 6 | scipy 7 | calflops 8 | transformers 9 | -------------------------------------------------------------------------------- /tools/benchmark/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
4 | """ 5 | 6 | import os 7 | import glob 8 | from PIL import Image 9 | 10 | import torch 11 | import torch.utils.data as data 12 | import torchvision 13 | import torchvision.transforms as T 14 | import torchvision.transforms.functional as F 15 | 16 | Image.MAX_IMAGE_PIXELS = None 17 | 18 | class ToTensor(T.ToTensor): 19 | def __init__(self) -> None: 20 | super().__init__() 21 | 22 | def __call__(self, pic): 23 | if isinstance(pic, torch.Tensor): 24 | return pic 25 | return super().__call__(pic) 26 | 27 | class PadToSize(T.Pad): 28 | def __init__(self, size, fill=0, padding_mode='constant'): 29 | super().__init__(0, fill, padding_mode) 30 | self.size = size 31 | self.fill = fill 32 | 33 | def __call__(self, img): 34 | """ 35 | Args: 36 | img (PIL Image or Tensor): Image to be padded. 37 | 38 | Returns: 39 | PIL Image or Tensor: Padded image. 40 | """ 41 | w, h = F.get_image_size(img) 42 | padding = (0, 0, self.size[0] - w, self.size[1] - h) 43 | return F.pad(img, padding, self.fill, self.padding_mode) 44 | 45 | 46 | class Dataset(data.Dataset): 47 | def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None: 48 | super().__init__() 49 | 50 | self.device = device 51 | self.size = 640 52 | 53 | self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg'))) 54 | 55 | if preprocess is None: 56 | self.preprocess = T.Compose([ 57 | T.Resize(size=639, max_size=640), 58 | PadToSize(size=(640, 640), fill=114), 59 | ToTensor(), 60 | T.ConvertImageDtype(torch.float), 61 | ]) 62 | else: 63 | self.preprocess = preprocess 64 | 65 | def __len__(self, ): 66 | return len(self.im_path_list) 67 | 68 | def __getitem__(self, index): 69 | # im = Image.open(self.img_path_list[index]).convert('RGB') 70 | im = torchvision.io.read_file(self.im_path_list[index]) 71 | im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device) 72 | _, h, w = im.shape # c,h,w 73 | 74 | im = self.preprocess(im) 75 | 76 | blob = { 77 | 'images': im, 78 | 'im_shape': torch.tensor([self.size, self.size]).to(im.device), 79 | 'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device), 80 | 'orig_target_sizes': torch.tensor([w, h]).to(im.device), 81 | } 82 | 83 | return blob 84 | 85 | @staticmethod 86 | def post_process(): 87 | pass 88 | 89 | @staticmethod 90 | def collate_fn(): 91 | pass 92 | 93 | 94 | def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''): 95 | '''show result 96 | Keys: 97 | 'num_dets', 'det_boxes', 'det_scores', 'det_classes' 98 | ''' 99 | for i in range(blob['image'].shape[0]): 100 | det_scores = outputs['det_scores'][i] 101 | det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold] 102 | 103 | im = (blob['image'][i] * 255).to(torch.uint8) 104 | im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2) 105 | Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg') 106 | -------------------------------------------------------------------------------- /tools/benchmark/get_info.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
3 | """ 4 | 5 | import os 6 | import sys 7 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 8 | 9 | import argparse 10 | from calflops import calculate_flops 11 | from engine.core import YAMLConfig 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | def custom_repr(self): 17 | return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' 18 | original_repr = torch.Tensor.__repr__ 19 | torch.Tensor.__repr__ = custom_repr 20 | 21 | def main(args, ): 22 | """main 23 | """ 24 | cfg = YAMLConfig(args.config, resume=None) 25 | class Model_for_flops(nn.Module): 26 | def __init__(self, ) -> None: 27 | super().__init__() 28 | self.model = cfg.model.deploy() 29 | 30 | def forward(self, images): 31 | outputs = self.model(images) 32 | return outputs 33 | 34 | model = Model_for_flops().eval() 35 | 36 | flops, macs, _ = calculate_flops(model=model, 37 | input_shape=(1, 3, 640, 640), 38 | output_as_string=True, 39 | output_precision=4) 40 | params = sum(p.numel() for p in model.parameters()) 41 | print("Model FLOPs:%s MACs:%s Params:%s \n" %(flops, macs, params)) 42 | 43 | 44 | if __name__ == '__main__': 45 | 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('--config', '-c', default= "configs/dfine/dfine_hgnetv2_l_coco.yml", type=str) 48 | args = parser.parse_args() 49 | 50 | main(args) 51 | -------------------------------------------------------------------------------- /tools/benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | onnxruntime 2 | tensorrt 3 | pycuda 4 | calflops 5 | tqdm 6 | # onnx_graphsurgeon # for YOLOs 7 | -------------------------------------------------------------------------------- /tools/benchmark/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import contextlib 3 | import numpy as np 4 | from PIL import Image 5 | from collections import OrderedDict 6 | 7 | import onnx 8 | import torch 9 | import onnx_graphsurgeon 10 | 11 | 12 | def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'): 13 | '''--loadInputs='image:input_tensor.bin' 14 | ''' 15 | im = Image.open(path).resize(size) 16 | data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255. 
17 | data.tofile(output_name) 18 | 19 | 20 | def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False): 21 | ''' 22 | http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html 23 | https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py 24 | ''' 25 | onnx_model = onnx.load(path) 26 | 27 | if simplify: 28 | from onnxsim import simplify 29 | onnx_model, _ = simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]}) 30 | 31 | graph = onnx_graphsurgeon.import_onnx(onnx_model) 32 | graph.toposort() 33 | graph.fold_constants() 34 | graph.cleanup() 35 | 36 | topk = max_output_boxes 37 | attrs = OrderedDict(plugin_version='1', 38 | background_class=-1, 39 | max_output_boxes=topk, 40 | score_threshold=score_threshold, 41 | iou_threshold=iou_threshold, 42 | score_activation=False, 43 | box_coding=0, ) 44 | 45 | outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]), 46 | onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]), 47 | onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]), 48 | onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])] 49 | 50 | graph.layer(op='EfficientNMS_TRT', 51 | name="batched_nms", 52 | inputs=[graph.outputs[0], 53 | graph.outputs[1]], 54 | outputs=outputs, 55 | attrs=attrs, ) 56 | 57 | graph.outputs = outputs 58 | graph.cleanup().toposort() 59 | 60 | onnx.save(onnx_graphsurgeon.export_onnx(graph), 'yolo_w_nms.onnx') 61 | 62 | 63 | class TimeProfiler(contextlib.ContextDecorator): 64 | def __init__(self, ): 65 | self.total = 0 66 | 67 | def __enter__(self, ): 68 | self.start = self.time() 69 | return self 70 | 71 | def __exit__(self, type, value, traceback): 72 | self.total += self.time() - self.start 73 | 74 | def reset(self, ): 75 | self.total = 0 76 | 77 | def time(self, ): 78 | if torch.cuda.is_available(): 79 | torch.cuda.synchronize() 80 | return time.time() 81 | -------------------------------------------------------------------------------- /tools/deployment/export_onnx.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIMv2: Real-Time Object Detection Meets DINOv3 3 | Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement 6 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 9 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
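Exports a trained detector together with its postprocessor to ONNX, with optional model checking and simplification.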
10 | """ 11 | 12 | import os 13 | import sys 14 | 15 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from engine.core import YAMLConfig 21 | 22 | 23 | def main(args, ): 24 | """main 25 | """ 26 | cfg = YAMLConfig(args.config, resume=args.resume) 27 | 28 | if 'HGNetv2' in cfg.yaml_cfg: 29 | cfg.yaml_cfg['HGNetv2']['pretrained'] = False 30 | 31 | if args.resume: 32 | checkpoint = torch.load(args.resume, map_location='cpu') 33 | if 'ema' in checkpoint: 34 | state = checkpoint['ema']['module'] 35 | else: 36 | state = checkpoint['model'] 37 | 38 | # NOTE load train mode state -> convert to deploy mode 39 | cfg.model.load_state_dict(state) 40 | 41 | else: 42 | # raise AttributeError('Only support resume to load model.state_dict by now.') 43 | print('not load model.state_dict, use default init state dict...') 44 | 45 | class Model(nn.Module): 46 | def __init__(self, ) -> None: 47 | super().__init__() 48 | self.model = cfg.model.deploy() 49 | self.postprocessor = cfg.postprocessor.deploy() 50 | 51 | def forward(self, images, orig_target_sizes): 52 | outputs = self.model(images) 53 | outputs = self.postprocessor(outputs, orig_target_sizes) 54 | return outputs 55 | 56 | model = Model() 57 | 58 | img_size = cfg.yaml_cfg["eval_spatial_size"] 59 | data = torch.rand(32, 3, *img_size) 60 | size = torch.tensor([img_size]) 61 | _ = model(data, size) 62 | 63 | dynamic_axes = { 64 | 'images': {0: 'N', }, 65 | 'orig_target_sizes': {0: 'N'} 66 | } 67 | 68 | output_file = args.resume.replace('.pth', '.onnx') if args.resume else 'model.onnx' 69 | 70 | torch.onnx.export( 71 | model, 72 | (data, size), 73 | output_file, 74 | input_names=['images', 'orig_target_sizes'], 75 | output_names=['labels', 'boxes', 'scores'], 76 | dynamic_axes=dynamic_axes, 77 | opset_version=args.opset, 78 | verbose=False, 79 | do_constant_folding=True, 80 | ) 81 | 82 | if args.check: 83 | import onnx 84 | onnx_model = onnx.load(output_file) 85 | onnx.checker.check_model(onnx_model) 86 | print('Check export onnx model done...') 87 | 88 | if args.simplify: 89 | import onnx 90 | import onnxsim 91 | dynamic = True 92 | # input_shapes = {'images': [1, 3, 640, 640], 'orig_target_sizes': [1, 2]} if dynamic else None 93 | input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None 94 | onnx_model_simplify, check = onnxsim.simplify(output_file, test_input_shapes=input_shapes) 95 | onnx.save(onnx_model_simplify, output_file) 96 | print(f'Simplify onnx model {check}...') 97 | 98 | 99 | if __name__ == '__main__': 100 | 101 | import argparse 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument('--config', '-c', default='configs/dfine/dfine_hgnetv2_l_coco.yml', type=str, ) 104 | parser.add_argument('--resume', '-r', type=str, ) 105 | parser.add_argument('--opset', type=int, default=17,) 106 | parser.add_argument('--check', action='store_true') 107 | parser.add_argument('--simplify', action='store_true') 108 | args = parser.parse_args() 109 | main(args) 110 | -------------------------------------------------------------------------------- /tools/deployment/export_yolo_w_nms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | 4 | import numpy as np 5 | import onnxruntime as ort 6 | 7 | from utils import yolo_insert_nms 8 | 9 | class YOLO11(torch.nn.Module): 10 | def __init__(self, name) -> None: 11 | super().__init__() 12 | from 
ultralytics import YOLO 13 | # Load a model 14 | # build a new model from scratch 15 | # model = YOLO(f'{name}.yaml') 16 | 17 | # load a pretrained model (recommended for training) 18 | model = YOLO("yolo11n.pt") 19 | self.model = model.model 20 | 21 | def forward(self, x): 22 | '''https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216 23 | ''' 24 | pred: torch.Tensor = self.model(x)[0] # n 84 8400, 25 | pred = pred.permute(0, 2, 1) 26 | boxes, scores = pred.split([4, 80], dim=-1) 27 | boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') 28 | 29 | return boxes, scores 30 | 31 | 32 | 33 | def export_onnx(name='yolov8n'): 34 | '''export onnx 35 | ''' 36 | m = YOLO11(name) 37 | 38 | x = torch.rand(1, 3, 640, 640) 39 | dynamic_axes = { 40 | 'image': {0: '-1'} 41 | } 42 | torch.onnx.export(m, x, f'{name}.onnx', 43 | input_names=['image'], 44 | output_names=['boxes', 'scores'], 45 | opset_version=13, 46 | dynamic_axes=dynamic_axes) 47 | 48 | data = np.random.rand(1, 3, 640, 640).astype(np.float32) 49 | sess = ort.InferenceSession(f'{name}.onnx') 50 | _ = sess.run(output_names=None, input_feed={'image': data}) 51 | 52 | import onnx 53 | import onnxslim 54 | model_onnx = onnx.load(f'{name}.onnx') 55 | model_onnx = onnxslim.slim(model_onnx) 56 | onnx.save(model_onnx, f'{name}.onnx') 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | import argparse 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--name', type=str, default='yolo11n_tuned') 64 | parser.add_argument('--score_threshold', type=float, default=0.01) 65 | parser.add_argument('--iou_threshold', type=float, default=0.6) 66 | parser.add_argument('--max_output_boxes', type=int, default=300) 67 | args = parser.parse_args() 68 | 69 | export_onnx(name=args.name) 70 | 71 | yolo_insert_nms(path=f'{args.name}.onnx', 72 | score_threshold=args.score_threshold, 73 | iou_threshold=args.iou_threshold, 74 | max_output_boxes=args.max_output_boxes, ) 75 | -------------------------------------------------------------------------------- /tools/inference/openvino_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 3 | Copyright(c) 2023 lyuwenyu. All Rights Reserved. 4 | """ 5 | 6 | 7 | # please reference: https://github.com/guojin-yan/RT-DETR-OpenVINO 8 | -------------------------------------------------------------------------------- /tools/inference/requirements.txt: -------------------------------------------------------------------------------- 1 | onnxruntime 2 | tensorrt 3 | -------------------------------------------------------------------------------- /tools/inference/torch_inf.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIMv2: Real-Time Object Detection Meets DINOv3 3 | Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | Modified from D-FINE (https://github.com/Peterande/D-FINE) 6 | Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
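Runs PyTorch inference on a single image or a video file and saves the annotated result.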
7 | """ 8 | 9 | import os 10 | import sys 11 | 12 | import cv2 # Added for video processing 13 | import numpy as np 14 | import torch 15 | import torch.nn as nn 16 | import torchvision.transforms as T 17 | from PIL import Image, ImageDraw 18 | 19 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) 20 | from engine.core import YAMLConfig 21 | 22 | 23 | def draw(images, labels, boxes, scores, thrh=0.4): 24 | for i, im in enumerate(images): 25 | draw = ImageDraw.Draw(im) 26 | 27 | scr = scores[i] 28 | lab = labels[i][scr > thrh] 29 | box = boxes[i][scr > thrh] 30 | scrs = scr[scr > thrh] 31 | 32 | for j, b in enumerate(box): 33 | draw.rectangle(list(b), outline='red') 34 | draw.text((b[0], b[1]), text=f"{lab[j].item()} {round(scrs[j].item(), 2)}", fill='blue', ) 35 | 36 | im.save('torch_results.jpg') 37 | 38 | 39 | def process_image(model, device, file_path, size=(640, 640)): 40 | im_pil = Image.open(file_path).convert('RGB') 41 | w, h = im_pil.size 42 | orig_size = torch.tensor([[w, h]]).to(device) 43 | 44 | transforms = T.Compose([ 45 | T.Resize(size), 46 | T.ToTensor(), 47 | ]) 48 | im_data = transforms(im_pil).unsqueeze(0).to(device) 49 | 50 | output = model(im_data, orig_size) 51 | labels, boxes, scores = output 52 | 53 | draw([im_pil], labels, boxes, scores) 54 | 55 | 56 | def process_video(model, device, file_path, size=(640, 640)): 57 | cap = cv2.VideoCapture(file_path) 58 | 59 | # Get video properties 60 | fps = cap.get(cv2.CAP_PROP_FPS) 61 | orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 62 | orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 63 | 64 | # Define the codec and create VideoWriter object 65 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 66 | out = cv2.VideoWriter('torch_results.mp4', fourcc, fps, (orig_w, orig_h)) 67 | 68 | transforms = T.Compose([ 69 | T.Resize(size), 70 | T.ToTensor(), 71 | ]) 72 | 73 | frame_count = 0 74 | print("Processing video frames...") 75 | while cap.isOpened(): 76 | ret, frame = cap.read() 77 | if not ret: 78 | break 79 | 80 | # Convert frame to PIL image 81 | frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 82 | 83 | w, h = frame_pil.size 84 | orig_size = torch.tensor([[w, h]]).to(device) 85 | 86 | im_data = transforms(frame_pil).unsqueeze(0).to(device) 87 | 88 | output = model(im_data, orig_size) 89 | labels, boxes, scores = output 90 | 91 | # Draw detections on the frame 92 | draw([frame_pil], labels, boxes, scores) 93 | 94 | # Convert back to OpenCV image 95 | frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR) 96 | 97 | # Write the frame 98 | out.write(frame) 99 | frame_count += 1 100 | 101 | if frame_count % 10 == 0: 102 | print(f"Processed {frame_count} frames...") 103 | 104 | cap.release() 105 | out.release() 106 | print("Video processing complete. 
Result saved as 'torch_results.mp4'.") 107 | 108 | 109 | def main(args): 110 | """Main function""" 111 | cfg = YAMLConfig(args.config, resume=args.resume) 112 | 113 | if 'HGNetv2' in cfg.yaml_cfg: 114 | cfg.yaml_cfg['HGNetv2']['pretrained'] = False 115 | 116 | if args.resume: 117 | checkpoint = torch.load(args.resume, map_location='cpu') 118 | if 'ema' in checkpoint: 119 | state = checkpoint['ema']['module'] 120 | else: 121 | state = checkpoint['model'] 122 | else: 123 | raise AttributeError('Only support resume to load model.state_dict by now.') 124 | 125 | # Load train mode state and convert to deploy mode 126 | cfg.model.load_state_dict(state) 127 | 128 | class Model(nn.Module): 129 | def __init__(self): 130 | super().__init__() 131 | self.model = cfg.model.deploy() 132 | self.postprocessor = cfg.postprocessor.deploy() 133 | 134 | def forward(self, images, orig_target_sizes): 135 | outputs = self.model(images) 136 | outputs = self.postprocessor(outputs, orig_target_sizes) 137 | return outputs 138 | 139 | device = args.device 140 | model = Model().to(device) 141 | img_size = cfg.yaml_cfg["eval_spatial_size"] 142 | 143 | # Check if the input file is an image or a video 144 | file_path = args.input 145 | if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']: 146 | # Process as image 147 | process_image(model, device, file_path, img_size) 148 | print("Image processing complete.") 149 | else: 150 | # Process as video 151 | process_video(model, device, file_path, img_size) 152 | 153 | 154 | if __name__ == '__main__': 155 | import argparse 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('-c', '--config', type=str, required=True) 158 | parser.add_argument('-r', '--resume', type=str, required=True) 159 | parser.add_argument('-i', '--input', type=str, required=True) 160 | parser.add_argument('-d', '--device', type=str, default='cpu') 161 | args = parser.parse_args() 162 | main(args) 163 | -------------------------------------------------------------------------------- /tools/reference/convert_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import argparse 4 | 5 | def save_only_ema_weights(checkpoint_file): 6 | """Extract and save only the EMA weights.""" 7 | checkpoint = torch.load(checkpoint_file, map_location='cpu') 8 | 9 | weights = {} 10 | if 'ema' in checkpoint: 11 | weights['model'] = checkpoint['ema']['module'] 12 | else: 13 | raise ValueError("The checkpoint does not contain 'ema'.") 14 | 15 | dir_name, base_name = os.path.split(checkpoint_file) 16 | name, ext = os.path.splitext(base_name) 17 | output_file = os.path.join(dir_name, f"{name}_converted{ext}") 18 | 19 | torch.save(weights, output_file) 20 | print(f"EMA weights saved to {output_file}") 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser(description="Extract and save only EMA weights.") 24 | parser.add_argument('checkpoint_dir', type=str, help="Path to the directory containing checkpoint files.") 25 | 26 | args = parser.parse_args() 27 | for file in os.listdir(args.checkpoint_dir): 28 | if '.pth' in file and '_converted' not in file: 29 | save_only_ema_weights(os.path.join(args.checkpoint_dir, file)) 30 | -------------------------------------------------------------------------------- /tools/reference/safe_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to display the menu for selecting model size 4 |
select_model_size() { 5 | echo "Select model size:" 6 | select size in s m l x; do 7 | case $size in 8 | s|m|l|x) 9 | echo "You selected model size: $size" 10 | MODEL_SIZE=$size 11 | break 12 | ;; 13 | *) 14 | echo "Invalid selection. Please try again." 15 | ;; 16 | esac 17 | done 18 | } 19 | 20 | # Function to display the menu for selecting task 21 | select_task() { 22 | echo "Select task:" 23 | select task in obj365 obj2coco coco; do 24 | case $task in 25 | obj365|obj2coco|coco) 26 | echo "You selected task: $task" 27 | TASK=$task 28 | break 29 | ;; 30 | *) 31 | echo "Invalid selection. Please try again." 32 | ;; 33 | esac 34 | done 35 | } 36 | 37 | # Function to ask if the user wants to save logs to a txt file 38 | ask_save_logs() { 39 | while true; do 40 | read -p "Do you want to save logs to a txt file? (y/n): " yn 41 | case $yn in 42 | [Yy]* ) 43 | SAVE_LOGS=true 44 | break 45 | ;; 46 | [Nn]* ) 47 | SAVE_LOGS=false 48 | break 49 | ;; 50 | * ) echo "Please answer yes or no.";; 51 | esac 52 | done 53 | } 54 | 55 | # Call the functions to let the user select 56 | select_model_size 57 | select_task 58 | ask_save_logs 59 | 60 | # Set config file and output directory based on selection 61 | if [ "$TASK" = "coco" ]; then 62 | CONFIG_FILE="configs/dfine/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" 63 | else 64 | CONFIG_FILE="configs/dfine/objects365/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" 65 | fi 66 | 67 | OUTPUT_DIR="output/${MODEL_SIZE}_${TASK}" 68 | 69 | # Construct the training command 70 | TRAIN_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR" 71 | 72 | # Append log redirection if SAVE_LOGS is true 73 | if [ "$SAVE_LOGS" = true ]; then 74 | LOG_FILE="${MODEL_SIZE}_${TASK}.txt" 75 | TRAIN_CMD="$TRAIN_CMD &> \"$LOG_FILE\" 2>&1 &" 76 | else 77 | TRAIN_CMD="$TRAIN_CMD &" 78 | fi 79 | 80 | # Run the training command 81 | eval $TRAIN_CMD 82 | if [ $? -ne 0 ]; then 83 | echo "First training failed, restarting with resume option..." 84 | while true; do 85 | RESUME_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR -r ${OUTPUT_DIR}/last.pth" 86 | if [ "$SAVE_LOGS" = true ]; then 87 | LOG_FILE="${MODEL_SIZE}_${TASK}_2.txt" 88 | RESUME_CMD="$RESUME_CMD &> \"$LOG_FILE\" 2>&1 &" 89 | else 90 | RESUME_CMD="$RESUME_CMD &" 91 | fi 92 | eval $RESUME_CMD 93 | if [ $? -eq 0 ]; then 94 | break 95 | fi 96 | done 97 | fi 98 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | DEIMv2: Real-Time Object Detection Meets DINOv3 3 | Copyright (c) 2025 The DEIMv2 Authors. All Rights Reserved. 4 | --------------------------------------------------------------------------------- 5 | DEIM: DETR with Improved Matching for Fast Convergence 6 | Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 7 | --------------------------------------------------------------------------------- 8 | Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) 9 | Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
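Entry point for training and evaluation; the task solver is selected from the YAML config.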
10 | """ 11 | 12 | import os 13 | import sys 14 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) 15 | 16 | import argparse 17 | 18 | from engine.misc import dist_utils 19 | from engine.core import YAMLConfig, yaml_utils 20 | from engine.solver import TASKS 21 | 22 | debug=False 23 | 24 | if debug: 25 | import torch 26 | def custom_repr(self): 27 | return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' 28 | original_repr = torch.Tensor.__repr__ 29 | torch.Tensor.__repr__ = custom_repr 30 | 31 | def main(args, ) -> None: 32 | """main 33 | """ 34 | dist_utils.setup_distributed(args.print_rank, args.print_method, seed=args.seed) 35 | 36 | assert not all([args.tuning, args.resume]), \ 37 | 'Only support from_scrach or resume or tuning at one time' 38 | 39 | 40 | update_dict = yaml_utils.parse_cli(args.update) 41 | update_dict.update({k: v for k, v in args.__dict__.items() \ 42 | if k not in ['update', ] and v is not None}) 43 | 44 | cfg = YAMLConfig(args.config, **update_dict) 45 | 46 | if args.resume or args.tuning: 47 | if 'HGNetv2' in cfg.yaml_cfg: 48 | cfg.yaml_cfg['HGNetv2']['pretrained'] = False 49 | 50 | print('cfg: ', cfg.__dict__) 51 | 52 | solver = TASKS[cfg.yaml_cfg['task']](cfg) 53 | 54 | if args.test_only: 55 | solver.val() 56 | else: 57 | solver.fit() 58 | 59 | dist_utils.cleanup() 60 | 61 | 62 | if __name__ == '__main__': 63 | 64 | parser = argparse.ArgumentParser() 65 | 66 | # priority 0 67 | parser.add_argument('-c', '--config', type=str, default='') 68 | parser.add_argument('-r', '--resume', type=str, help='resume from checkpoint') 69 | parser.add_argument('-t', '--tuning', type=str, help='tuning from checkpoint') 70 | parser.add_argument('-d', '--device', type=str, help='device',) 71 | parser.add_argument('--seed', type=int, default=0, help='exp reproducibility') 72 | parser.add_argument('--use-amp', action='store_true', help='auto mixed precision training') 73 | parser.add_argument('--output-dir', type=str, help='output directoy') 74 | parser.add_argument('--summary-dir', type=str, help='tensorboard summry') 75 | parser.add_argument('--test-only', action='store_true', default=False,) 76 | 77 | # priority 1 78 | parser.add_argument('-u', '--update', nargs='+', help='update yaml config') 79 | 80 | # env 81 | parser.add_argument('--print-method', type=str, default='builtin', help='print method') 82 | parser.add_argument('--print-rank', type=int, default=0, help='print rank id') 83 | 84 | parser.add_argument('--local-rank', type=int, help='local rank id') 85 | args = parser.parse_args() 86 | 87 | main(args) 88 | --------------------------------------------------------------------------------