├── .dockerignore ├── .gitattributes ├── .gitignore ├── Dockerfile ├── LICENSE ├── README-YW.md ├── README.md ├── assets ├── DOSOD_LOGO.png ├── dosod-l-4090.md ├── dosod-m-4090.md ├── dosod-s-4090.md ├── dosod_architecture.png ├── finetune_yoloworld.png ├── render_dosod.jpeg ├── reparameterize.png ├── yolo-worldv1-l-4090.md ├── yolo-worldv1-m-4090.md ├── yolo-worldv1-s-4090.md ├── yolo-worldv2-l-4090.md ├── yolo-worldv2-m-4090.md ├── yolo-worldv2-s-4090.md ├── yolo_arch.png └── yolo_logo.png ├── configs ├── dosod │ ├── dosod_mlp0x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp1x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp2x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp3x_l_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp3x_m_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp3x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp4x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp5x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── rep_dosod_mlp3x_l_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── rep_dosod_mlp3x_l_d-robotics.py │ ├── rep_dosod_mlp3x_m_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── rep_dosod_mlp3x_m_d-robotics.py │ ├── rep_dosod_mlp3x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── rep_dosod_mlp3x_s_d-robotics.py │ ├── zero-shot-on-coco_dosod_mlp3x_l.py │ ├── zero-shot-on-coco_dosod_mlp3x_m.py │ └── zero-shot-on-coco_dosod_mlp3x_s.py ├── finetune_coco │ ├── README.md │ ├── yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py │ ├── yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py │ ├── yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ └── yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ├── pretrain │ ├── rep_yolo_world_v2_l.py │ ├── rep_yolo_world_v2_m.py │ ├── rep_yolo_world_v2_s.py │ ├── yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py │ ├── yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py │ ├── yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py │ ├── yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py │ ├── yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py │ ├── yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ 
├── yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py │ ├── yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py │ ├── yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ └── yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py ├── pretrain_v1 │ ├── README.md │ ├── yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py │ ├── yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ └── yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py ├── prompt_tuning_coco │ ├── READEME.md │ ├── yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py │ ├── yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_prompt_tuning_coco.py │ └── yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_all_finetuning_coco.py └── segmentation │ ├── README.md │ ├── yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py │ ├── yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py │ ├── yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py │ ├── yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py │ ├── yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py │ └── yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py ├── data ├── coco │ └── lvis │ │ └── lvis_v1_minival_inserted_image_name.json └── texts │ ├── coco_class_texts.json │ ├── lvis_v1_base_class_captions.json │ ├── lvis_v1_class_texts.json │ └── obj365v1_class_texts.json ├── demo ├── README.md ├── gradio_demo.py ├── image_demo.py ├── inference.ipynb ├── sample_images │ ├── bus.jpg │ └── zidane.jpg ├── simple_demo.py └── video_demo.py ├── deploy ├── __init__.py ├── easydeploy │ ├── README.md │ ├── README_zh-CN.md │ ├── backbone │ │ ├── __init__.py │ │ ├── common.py │ │ └── focus.py │ ├── bbox_code │ │ ├── __init__.py │ │ └── bbox_coder.py │ ├── deepstream │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── README_zh-CN.md │ │ ├── coco_labels.txt │ │ ├── configs │ │ │ ├── config_infer_rtmdet.txt │ │ │ ├── config_infer_yolov5.txt │ │ │ └── config_infer_yolov8.txt │ │ ├── custom_mmyolo_bbox_parser │ │ │ └── nvdsparsebbox_mmyolo.cpp │ │ └── deepstream_app_config.txt │ ├── docs │ │ └── model_convert.md │ ├── examples │ │ ├── config.py │ │ ├── cv2_nms.py │ │ ├── main_onnxruntime.py │ │ ├── numpy_coder.py │ │ ├── preprocess.py │ │ └── requirements.txt │ ├── model │ │ ├── __init__.py │ │ ├── backend.py │ │ ├── backendwrapper.py │ │ └── model.py │ ├── nms │ │ ├── __init__.py │ │ ├── ort_nms.py │ │ └── trt_nms.py │ ├── onnx_demo.py │ └── tools │ │ ├── build_engine.py │ │ ├── export_onnx.py │ │ └── image-demo.py ├── export_onnx.py ├── onnx_demo.py └── tflite_demo.py ├── docs ├── data.md ├── deploy.md ├── faq.md ├── finetuning.md ├── installation.md ├── prompt_yolo_world.md ├── reparameterize.md ├── tflite_deploy.md └── updates.md ├── pyproject.toml ├── requirements ├── basic_requirements.txt ├── demo_requirements.txt └── onnx_requirements.txt ├── third_party └── mmyolo │ └── configs │ ├── _base_ │ ├── default_runtime.py │ ├── det_p5_tta.py │ └── pose │ │ └── coco.py │ ├── deploy │ ├── base_dynamic.py │ ├── base_static.py │ ├── detection_onnxruntime_dynamic.py │ ├── 
detection_onnxruntime_static.py │ ├── detection_rknn-fp16_static-320x320.py │ ├── detection_rknn-int8_static-320x320.py │ ├── detection_tensorrt-fp16_dynamic-192x192-960x960.py │ ├── detection_tensorrt-fp16_dynamic-64x64-1344x1344.py │ ├── detection_tensorrt-fp16_static-640x640.py │ ├── detection_tensorrt-int8_dynamic-192x192-960x960.py │ ├── detection_tensorrt-int8_static-640x640.py │ ├── detection_tensorrt_dynamic-192x192-960x960.py │ ├── detection_tensorrt_static-640x640.py │ └── model │ │ ├── yolov5_s-static.py │ │ └── yolov6_s-static.py │ ├── ppyoloe │ ├── README.md │ ├── metafile.yml │ ├── ppyoloe_l_fast_8xb20-300e_coco.py │ ├── ppyoloe_m_fast_8xb28-300e_coco.py │ ├── ppyoloe_plus_l_fast_8xb8-80e_coco.py │ ├── ppyoloe_plus_m_fast_8xb8-80e_coco.py │ ├── ppyoloe_plus_s_fast_1xb12-40e_cat.py │ ├── ppyoloe_plus_s_fast_8xb8-80e_coco.py │ ├── ppyoloe_plus_x_fast_8xb8-80e_coco.py │ ├── ppyoloe_s_fast_8xb32-300e_coco.py │ ├── ppyoloe_s_fast_8xb32-400e_coco.py │ └── ppyoloe_x_fast_8xb16-300e_coco.py │ ├── razor │ └── subnets │ │ ├── README.md │ │ ├── rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py │ │ ├── yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py │ │ └── yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet │ ├── README.md │ ├── cspnext_imagenet_pretrain │ │ ├── README.md │ │ ├── cspnext-s_8xb256-rsb-a1-600e_in1k.py │ │ └── cspnext-tiny_8xb256-rsb-a1-600e_in1k.py │ ├── distillation │ │ ├── README.md │ │ ├── kd_l_rtmdet_x_neck_300e_coco.py │ │ ├── kd_m_rtmdet_l_neck_300e_coco.py │ │ ├── kd_s_rtmdet_m_neck_300e_coco.py │ │ └── kd_tiny_rtmdet_s_neck_300e_coco.py │ ├── metafile.yml │ ├── rotated │ │ ├── rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py │ │ ├── rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py │ │ ├── rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py │ │ ├── rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py │ │ ├── rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py │ │ ├── rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py │ │ ├── rtmdet-r_s_fast_1xb8-36e_dota-ms.py │ │ ├── rtmdet-r_s_fast_1xb8-36e_dota.py │ │ ├── rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py │ │ └── rtmdet-r_tiny_fast_1xb8-36e_dota.py │ ├── rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet_l_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet_m_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet_s_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet_tiny_fast_1xb12-40e_cat.py │ ├── rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py │ └── rtmdet_x_syncbn_fast_8xb32-300e_coco.py │ ├── yolov5 │ ├── README.md │ ├── crowdhuman │ │ ├── yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py │ │ └── yolov5_s-v61_fast_8xb16-300e_crowdhuman.py │ ├── ins_seg │ │ ├── yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py │ │ ├── yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py │ │ ├── yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py │ │ ├── yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py │ │ ├── yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py │ │ ├── yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py │ │ └── yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py │ ├── mask_refine │ │ ├── yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ │ └── yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ ├── metafile.yml │ ├── voc │ │ ├── yolov5_l-v61_fast_1xb32-50e_voc.py │ │ ├── yolov5_m-v61_fast_1xb64-50e_voc.py │ │ 
├── yolov5_n-v61_fast_1xb64-50e_voc.py │ │ ├── yolov5_s-v61_fast_1xb64-50e_voc.py │ │ └── yolov5_x-v61_fast_1xb32-50e_voc.py │ ├── yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_s-v61_fast_1xb12-40e_608x352_cat.py │ ├── yolov5_s-v61_fast_1xb12-40e_cat.py │ ├── yolov5_s-v61_fast_1xb12-ms-40e_cat.py │ ├── yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py │ ├── yolov5_s-v61_syncbn_8xb16-300e_coco.py │ ├── yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py │ ├── yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py │ └── yolov5u │ │ ├── yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_l_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_m_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_n_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_s_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ └── yolov5u_x_syncbn_fast_8xb16-300e_coco.py │ ├── yolov6 │ ├── README.md │ ├── metafile.yml │ ├── yolov6_l_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_m_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_n_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_n_syncbn_fast_8xb32-400e_coco.py │ ├── yolov6_s_fast_1xb12-40e_cat.py │ ├── yolov6_s_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_s_syncbn_fast_8xb32-400e_coco.py │ ├── yolov6_t_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_t_syncbn_fast_8xb32-400e_coco.py │ ├── yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py │ └── yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py │ ├── yolov7 │ ├── README.md │ ├── metafile.yml │ ├── yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_l_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_tiny_fast_1xb12-40e_cat.py │ ├── yolov7_tiny_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py │ └── yolov7_x_syncbn_fast_8x16b-300e_coco.py │ ├── yolov8 │ ├── README.md │ ├── metafile.yml │ ├── yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_l_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_m_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_n_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_s_fast_1xb12-40e_cat.py │ ├── yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_s_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py │ └── yolov8_x_syncbn_fast_8xb16-500e_coco.py │ └── yolox │ ├── README.md │ ├── metafile.yml │ ├── pose │ ├── yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py │ └── yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_l_fast_8xb8-300e_coco.py │ ├── 
yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_m_fast_8xb8-300e_coco.py │ ├── yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_nano_fast_8xb8-300e_coco.py │ ├── yolox_p5_tta.py │ ├── yolox_s_fast_1xb12-40e-rtmdet-hyp_cat.py │ ├── yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_s_fast_8xb8-300e_coco.py │ ├── yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_tiny_fast_8xb8-300e_coco.py │ └── yolox_x_fast_8xb8-300e_coco.py ├── tools ├── count_num_parameters.py ├── dist_test.sh ├── dist_train.sh ├── evaluate_latency.sh ├── generate_image_prompts.py ├── generate_text_prompts_dosod.py ├── generate_text_prompts_yoloworld.py ├── generate_vocabulary_json.py ├── reparameterize_dosod.py ├── reparameterize_yoloworld.py ├── test.py └── train.py └── yolo_world ├── __init__.py ├── datasets ├── __init__.py ├── mm_dataset.py ├── transformers │ ├── __init__.py │ ├── mm_mix_img_transforms.py │ └── mm_transforms.py ├── utils.py ├── yolov5_cc3m_grounding.py ├── yolov5_lvis.py ├── yolov5_mixed_grounding.py ├── yolov5_obj365v1.py ├── yolov5_obj365v2.py └── yolov5_v3det.py ├── engine ├── __init__.py └── optimizers │ ├── __init__.py │ └── yolow_v5_optim_constructor.py ├── models ├── __init__.py ├── assigner │ ├── __init__.py │ └── task_aligned_assigner.py ├── backbones │ ├── __init__.py │ └── mm_backbone.py ├── data_preprocessors │ ├── __init__.py │ └── data_preprocessor.py ├── dense_heads │ ├── __init__.py │ ├── dosod_head.py │ ├── yolo_world_head.py │ └── yolo_world_seg_head.py ├── detectors │ ├── __init__.py │ ├── dosod.py │ └── yolo_world.py ├── layers │ ├── __init__.py │ └── yolo_bricks.py ├── losses │ ├── __init__.py │ └── dynamic_loss.py └── necks │ ├── __init__.py │ └── yolo_world_pafpn.py └── version.py /.dockerignore: -------------------------------------------------------------------------------- 1 | docs 2 | Dockerfile -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Basic .gitattributes for a python repo. 2 | 3 | # Source files 4 | # ============ 5 | *.pxd text diff=python 6 | *.py text diff=python 7 | *.py3 text diff=python 8 | *.pyw text diff=python 9 | *.pyx text diff=python 10 | *.pyz text diff=python 11 | *.pyi text diff=python 12 | 13 | # Binary files 14 | # ============ 15 | *.db binary 16 | *.p binary 17 | *.pkl binary 18 | *.pickle binary 19 | *.pyc binary export-ignore 20 | *.pyo binary export-ignore 21 | *.pyd binary 22 | 23 | # Jupyter notebook 24 | *.ipynb text eol=lf 25 | 26 | # Others 27 | * text=auto 28 | *.txt text 29 | *.sh text eol=lf 30 | 31 | # Note: .db, .p, and .pkl files are associated 32 | # with the python modules ``pickle``, ``dbm.*``, 33 | # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 34 | # (among others). 
35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/en/_build/ 68 | docs/zh_cn/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | # data/ 107 | # data 108 | .vscode 109 | .idea 110 | .DS_Store 111 | 112 | # custom 113 | *.pkl 114 | *.pkl.json 115 | *.log.json 116 | docs/modelzoo_statistics.md 117 | mmdet/.mim 118 | work_dirs 119 | 120 | # Pytorch 121 | *.pth 122 | *.py~ 123 | *.sh~ 124 | 125 | # venus 126 | venus_run.sh 127 | 128 | /local_test 129 | /dosod_models 130 | data/coco/annotations 131 | data/coco/val2017 132 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 2 | 3 | ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" 4 | ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth" 5 | 6 | ENV FORCE_CUDA="1" 7 | ENV MMCV_WITH_OPS=1 8 | 9 | RUN apt-get update && apt-get install -y --no-install-recommends \ 10 | python3-pip \ 11 | libgl1-mesa-glx \ 12 | libsm6 \ 13 | libxext6 \ 14 | libxrender-dev \ 15 | libglib2.0-0 \ 16 | git \ 17 | python3-dev \ 18 | python3-wheel 19 | 20 | RUN pip3 install --upgrade pip \ 21 | && pip3 install \ 22 | gradio \ 23 | opencv-python \ 24 | supervision \ 25 | mmengine \ 26 | setuptools \ 27 | openmim \ 28 | && mim install mmcv==2.0.0 \ 29 | && pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cu118 \ 30 | wheel \ 31 | torch \ 32 | torchvision \ 33 | torchaudio 34 | 35 | COPY . /yolo 36 | WORKDIR /yolo 37 | 38 | RUN pip3 install -e . 
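# Example usage (mirrors demo/README.md; assumes the gradio demo serves on port 8080):
#   docker build --build-arg="MODEL=<config>.py" --build-arg="WEIGHT=<weights>.pth" -t yolo_demo .
#   docker run --runtime nvidia -p 8080:8080 yolo_demo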
39 | 40 | RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates && mkdir -p weights && curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT 41 | 42 | ENV MODEL=${MODEL} WEIGHT=${WEIGHT} 43 | ENTRYPOINT python3 demo/gradio_demo.py configs/pretrain/${MODEL} weights/${WEIGHT} -------------------------------------------------------------------------------- /assets/DOSOD_LOGO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/DOSOD_LOGO.png -------------------------------------------------------------------------------- /assets/dosod-l-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-18:04:40] [I] === Performance summary === 3 | [09/02/2024-18:04:40] [I] Throughput: 630.463 qps 4 | [09/02/2024-18:04:40] [I] Latency: min = 1.8894 ms, max = 4.01685 ms, mean = 1.8984 ms, median = 1.896 ms, percentile(90%) = 1.90039 ms, percentile(95%) = 1.90186 ms, percentile(99%) = 1.90576 ms 5 | [09/02/2024-18:04:40] [I] Enqueue Time: min = 0.90802 ms, max = 5.06421 ms, mean = 1.00555 ms, median = 0.993896 ms, percentile(90%) = 1.01172 ms, percentile(95%) = 1.01953 ms, percentile(99%) = 1.15723 ms 6 | [09/02/2024-18:04:40] [I] H2D Latency: min = 0.200684 ms, max = 0.240601 ms, mean = 0.201544 ms, median = 0.201355 ms, percentile(90%) = 0.202026 ms, percentile(95%) = 0.202271 ms, percentile(99%) = 0.20459 ms 7 | [09/02/2024-18:04:40] [I] GPU Compute Time: min = 1.57288 ms, max = 3.65894 ms, mean = 1.58208 ms, median = 1.58002 ms, percentile(90%) = 1.58398 ms, percentile(95%) = 1.58521 ms, percentile(99%) = 1.58813 ms 8 | [09/02/2024-18:04:40] [I] D2H Latency: min = 0.112549 ms, max = 0.167236 ms, mean = 0.114774 ms, median = 0.114502 ms, percentile(90%) = 0.115967 ms, percentile(95%) = 0.116211 ms, percentile(99%) = 0.11792 ms 9 | [09/02/2024-18:04:40] [I] Total Host Walltime: 3.17227 s 10 | [09/02/2024-18:04:40] [I] Total GPU Compute Time: 3.16416 s 11 | [09/02/2024-18:04:40] [W] * GPU compute time is unstable, with coefficient of variance = 3.28975%. 12 | [09/02/2024-18:04:40] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 13 | [09/02/2024-18:04:40] [I] Explanations of the performance metrics are printed in the verbose logs.
14 | ``` -------------------------------------------------------------------------------- /assets/dosod-m-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:57:01] [I] === Performance summary === 3 | [09/02/2024-17:57:01] [I] Throughput: 919.495 qps 4 | [09/02/2024-17:57:01] [I] Latency: min = 1.39429 ms, max = 2.81079 ms, mean = 1.40298 ms, median = 1.40112 ms, percentile(90%) = 1.40356 ms, percentile(95%) = 1.40424 ms, percentile(99%) = 1.40601 ms 5 | [09/02/2024-17:57:01] [I] Enqueue Time: min = 0.626221 ms, max = 3.35059 ms, mean = 0.659003 ms, median = 0.644882 ms, percentile(90%) = 0.675415 ms, percentile(95%) = 0.765869 ms, percentile(99%) = 0.790466 ms 6 | [09/02/2024-17:57:01] [I] H2D Latency: min = 0.201172 ms, max = 0.26123 ms, mean = 0.201796 ms, median = 0.20166 ms, percentile(90%) = 0.201904 ms, percentile(95%) = 0.202148 ms, percentile(99%) = 0.20459 ms 7 | [09/02/2024-17:57:01] [I] GPU Compute Time: min = 1.07715 ms, max = 2.45239 ms, mean = 1.08407 ms, median = 1.0824 ms, percentile(90%) = 1.08441 ms, percentile(95%) = 1.08447 ms, percentile(99%) = 1.08545 ms 8 | [09/02/2024-17:57:01] [I] D2H Latency: min = 0.114746 ms, max = 0.156982 ms, mean = 0.117125 ms, median = 0.117126 ms, percentile(90%) = 0.118286 ms, percentile(95%) = 0.11853 ms, percentile(99%) = 0.119141 ms 9 | [09/02/2024-17:57:01] [I] Total Host Walltime: 3.00382 s 10 | [09/02/2024-17:57:01] [I] Total GPU Compute Time: 2.9942 s 11 | [09/02/2024-17:57:01] [W] * GPU compute time is unstable, with coefficient of variance = 3.89596%. 12 | [09/02/2024-17:57:01] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 13 | [09/02/2024-17:57:01] [I] Explanations of the performance metrics are printed in the verbose logs. 14 | ``` -------------------------------------------------------------------------------- /assets/dosod-s-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:47:04] [I] === Performance summary === 3 | [09/02/2024-17:47:04] [I] Throughput: 1574.34 qps 4 | [09/02/2024-17:47:04] [I] Latency: min = 0.94249 ms, max = 1.51343 ms, mean = 0.949318 ms, median = 0.948975 ms, percentile(90%) = 0.951172 ms, percentile(95%) = 0.951843 ms, percentile(99%) = 0.953735 ms 5 | [09/02/2024-17:47:04] [I] Enqueue Time: min = 0.539093 ms, max = 1.61792 ms, mean = 0.582978 ms, median = 0.587158 ms, percentile(90%) = 0.598022 ms, percentile(95%) = 0.601318 ms, percentile(99%) = 0.614258 ms 6 | [09/02/2024-17:47:04] [I] H2D Latency: min = 0.201904 ms, max = 0.237549 ms, mean = 0.202861 ms, median = 0.202698 ms, percentile(90%) = 0.203125 ms, percentile(95%) = 0.203369 ms, percentile(99%) = 0.205811 ms 7 | [09/02/2024-17:47:04] [I] GPU Compute Time: min = 0.626678 ms, max = 1.15527 ms, mean = 0.632278 ms, median = 0.631836 ms, percentile(90%) = 0.633789 ms, percentile(95%) = 0.633911 ms, percentile(99%) = 0.634888 ms 8 | [09/02/2024-17:47:04] [I] D2H Latency: min = 0.112061 ms, max = 0.156982 ms, mean = 0.114173 ms, median = 0.113556 ms, percentile(90%) = 0.115723 ms, percentile(95%) = 0.115967 ms, percentile(99%) = 0.116699 ms 9 | [09/02/2024-17:47:04] [I] Total Host Walltime: 3.0019 s 10 | [09/02/2024-17:47:04] [I] Total GPU Compute Time: 2.98815 s 11 | [09/02/2024-17:47:04] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 
12 | [09/02/2024-17:47:04] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 13 | [09/02/2024-17:47:04] [W] * GPU compute time is unstable, with coefficient of variance = 1.22015%. 14 | [09/02/2024-17:47:04] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/02/2024-17:47:04] [I] Explanations of the performance metrics are printed in the verbose logs. 16 | ``` -------------------------------------------------------------------------------- /assets/dosod_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/dosod_architecture.png -------------------------------------------------------------------------------- /assets/finetune_yoloworld.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/finetune_yoloworld.png -------------------------------------------------------------------------------- /assets/render_dosod.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/render_dosod.jpeg -------------------------------------------------------------------------------- /assets/reparameterize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/reparameterize.png -------------------------------------------------------------------------------- /assets/yolo-worldv1-l-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/07/2024-21:35:18] [I] === Performance summary === 3 | [09/07/2024-21:35:18] [I] Throughput: 481.898 qps 4 | [09/07/2024-21:35:18] [I] Latency: min = 2.21338 ms, max = 21.5757 ms, mean = 2.36123 ms, median = 2.22137 ms, percentile(90%) = 2.2915 ms, percentile(95%) = 2.64233 ms, percentile(99%) = 5.23608 ms 5 | [09/07/2024-21:35:18] [I] Enqueue Time: min = 1.29114 ms, max = 21.397 ms, mean = 1.80525 ms, median = 1.59399 ms, percentile(90%) = 2.1189 ms, percentile(95%) = 2.59033 ms, percentile(99%) = 5.50952 ms 6 | [09/07/2024-21:35:18] [I] H2D Latency: min = 0.200684 ms, max = 0.259521 ms, mean = 0.206849 ms, median = 0.202393 ms, percentile(90%) = 0.231079 ms, percentile(95%) = 0.235962 ms, percentile(99%) = 0.244141 ms 7 | [09/07/2024-21:35:18] [I] GPU Compute Time: min = 1.89233 ms, max = 21.2131 ms, mean = 2.02429 ms, median = 1.89844 ms, percentile(90%) = 1.90262 ms, percentile(95%) = 2.25281 ms, percentile(99%) = 4.84247 ms 8 | [09/07/2024-21:35:18] [I] D2H Latency: min = 0.116577 ms, max = 10.974 ms, mean = 0.130076 ms, median = 0.120117 ms, percentile(90%) = 0.152832 ms, percentile(95%) = 0.157898 ms, percentile(99%) = 0.166748 ms 9 | [09/07/2024-21:35:18] [I] Total Host Walltime: 4.15026 s 10 | [09/07/2024-21:35:18] [I] Total GPU Compute Time: 4.04858 s 11 | [09/07/2024-21:35:18] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 12 | [09/07/2024-21:35:18] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 
13 | [09/07/2024-21:35:18] [W] * GPU compute time is unstable, with coefficient of variance = 44.7285%. 14 | [09/07/2024-21:35:18] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/07/2024-21:35:18] [I] Explanations of the performance metrics are printed in the verbose logs. 16 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv1-m-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/07/2024-21:22:23] [I] === Performance summary === 3 | [09/07/2024-21:22:23] [I] Throughput: 687.676 qps 4 | [09/07/2024-21:22:23] [I] Latency: min = 1.68785 ms, max = 10.8534 ms, mean = 1.75215 ms, median = 1.69934 ms, percentile(90%) = 1.74597 ms, percentile(95%) = 1.78418 ms, percentile(99%) = 2.92371 ms 5 | [09/07/2024-21:22:23] [I] Enqueue Time: min = 1.00952 ms, max = 10.7031 ms, mean = 1.29381 ms, median = 1.20312 ms, percentile(90%) = 1.4812 ms, percentile(95%) = 1.64844 ms, percentile(99%) = 2.86414 ms 6 | [09/07/2024-21:22:23] [I] H2D Latency: min = 0.201416 ms, max = 0.246094 ms, mean = 0.20623 ms, median = 0.202393 ms, percentile(90%) = 0.221924 ms, percentile(95%) = 0.234863 ms, percentile(99%) = 0.24231 ms 7 | [09/07/2024-21:22:23] [I] GPU Compute Time: min = 1.37114 ms, max = 10.4899 ms, mean = 1.42454 ms, median = 1.38147 ms, percentile(90%) = 1.3855 ms, percentile(95%) = 1.3894 ms, percentile(99%) = 2.47192 ms 8 | [09/07/2024-21:22:23] [I] D2H Latency: min = 0.112793 ms, max = 4.37329 ms, mean = 0.121365 ms, median = 0.114746 ms, percentile(90%) = 0.139648 ms, percentile(95%) = 0.154541 ms, percentile(99%) = 0.166626 ms 9 | [09/07/2024-21:22:23] [I] Total Host Walltime: 3.00432 s 10 | [09/07/2024-21:22:23] [I] Total GPU Compute Time: 2.94311 s 11 | [09/07/2024-21:22:23] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 12 | [09/07/2024-21:22:23] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 13 | [09/07/2024-21:22:23] [W] * GPU compute time is unstable, with coefficient of variance = 26.399%. 14 | [09/07/2024-21:22:23] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/07/2024-21:22:23] [I] Explanations of the performance metrics are printed in the verbose logs. 
16 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv1-s-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/07/2024-21:04:10] [I] === Performance summary === 3 | [09/07/2024-21:04:10] [I] Throughput: 802.713 qps 4 | [09/07/2024-21:04:10] [I] Latency: min = 1.26953 ms, max = 6.30933 ms, mean = 1.40011 ms, median = 1.29671 ms, percentile(90%) = 1.5542 ms, percentile(95%) = 1.68701 ms, percentile(99%) = 2.52319 ms 5 | [09/07/2024-21:04:10] [I] Enqueue Time: min = 0.995972 ms, max = 6.14551 ms, mean = 1.20034 ms, median = 1.1221 ms, percentile(90%) = 1.39355 ms, percentile(95%) = 1.51538 ms, percentile(99%) = 2.34814 ms 6 | [09/07/2024-21:04:10] [I] H2D Latency: min = 0.203857 ms, max = 0.341919 ms, mean = 0.250004 ms, median = 0.242584 ms, percentile(90%) = 0.280029 ms, percentile(95%) = 0.292236 ms, percentile(99%) = 0.311523 ms 7 | [09/07/2024-21:04:10] [I] GPU Compute Time: min = 0.881592 ms, max = 5.87183 ms, mean = 0.99288 ms, median = 0.887817 ms, percentile(90%) = 1.151 ms, percentile(95%) = 1.27808 ms, percentile(99%) = 2.11157 ms 8 | [09/07/2024-21:04:10] [I] D2H Latency: min = 0.115723 ms, max = 3.60303 ms, mean = 0.157228 ms, median = 0.155029 ms, percentile(90%) = 0.166626 ms, percentile(95%) = 0.167114 ms, percentile(99%) = 0.168457 ms 9 | [09/07/2024-21:04:10] [I] Total Host Walltime: 3.00232 s 10 | [09/07/2024-21:04:10] [I] Total GPU Compute Time: 2.39284 s 11 | [09/07/2024-21:04:10] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 12 | [09/07/2024-21:04:10] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 13 | [09/07/2024-21:04:10] [W] * GPU compute time is unstable, with coefficient of variance = 31.6956%. 14 | [09/07/2024-21:04:10] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/07/2024-21:04:10] [I] Explanations of the performance metrics are printed in the verbose logs. 
16 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv2-l-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:39:48] [I] === Performance summary === 3 | [09/02/2024-17:39:48] [I] Throughput: 551.925 qps 4 | [09/02/2024-17:39:48] [I] Latency: min = 2.12134 ms, max = 3.32349 ms, mean = 2.12996 ms, median = 2.12842 ms, percentile(90%) = 2.1311 ms, percentile(95%) = 2.13202 ms, percentile(99%) = 2.13391 ms 5 | [09/02/2024-17:39:48] [I] Enqueue Time: min = 1.11572 ms, max = 4.59204 ms, mean = 1.14546 ms, median = 1.13232 ms, percentile(90%) = 1.14771 ms, percentile(95%) = 1.15222 ms, percentile(99%) = 1.18799 ms 6 | [09/02/2024-17:39:48] [I] H2D Latency: min = 0.200684 ms, max = 0.236572 ms, mean = 0.201542 ms, median = 0.201416 ms, percentile(90%) = 0.202087 ms, percentile(95%) = 0.202332 ms, percentile(99%) = 0.203857 ms 7 | [09/02/2024-17:39:48] [I] GPU Compute Time: min = 1.8002 ms, max = 2.9646 ms, mean = 1.80803 ms, median = 1.8064 ms, percentile(90%) = 1.80859 ms, percentile(95%) = 1.80945 ms, percentile(99%) = 1.8114 ms 8 | [09/02/2024-17:39:48] [I] D2H Latency: min = 0.116699 ms, max = 0.157471 ms, mean = 0.120401 ms, median = 0.120117 ms, percentile(90%) = 0.121582 ms, percentile(95%) = 0.122192 ms, percentile(99%) = 0.123047 ms 9 | [09/02/2024-17:39:48] [I] Total Host Walltime: 3.62368 s 10 | [09/02/2024-17:39:48] [I] Total GPU Compute Time: 3.61605 s 11 | [09/02/2024-17:39:48] [W] * GPU compute time is unstable, with coefficient of variance = 1.75854%. 12 | [09/02/2024-17:39:48] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 13 | [09/02/2024-17:39:48] [I] Explanations of the performance metrics are printed in the verbose logs. 14 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv2-m-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:28:49] [I] === Performance summary === 3 | [09/02/2024-17:28:49] [I] Throughput: 768.53 qps 4 | [09/02/2024-17:28:49] [I] Latency: min = 1.6106 ms, max = 3.10254 ms, mean = 1.61813 ms, median = 1.61597 ms, percentile(90%) = 1.61835 ms, percentile(95%) = 1.61905 ms, percentile(99%) = 1.62074 ms 5 | [09/02/2024-17:28:49] [I] Enqueue Time: min = 0.873291 ms, max = 3.86377 ms, mean = 0.895041 ms, median = 0.886475 ms, percentile(90%) = 0.893494 ms, percentile(95%) = 0.898438 ms, percentile(99%) = 0.942871 ms 6 | [09/02/2024-17:28:49] [I] H2D Latency: min = 0.201172 ms, max = 0.240723 ms, mean = 0.20176 ms, median = 0.20166 ms, percentile(90%) = 0.201904 ms, percentile(95%) = 0.202026 ms, percentile(99%) = 0.204102 ms 7 | [09/02/2024-17:28:49] [I] GPU Compute Time: min = 1.29126 ms, max = 2.74023 ms, mean = 1.2974 ms, median = 1.29541 ms, percentile(90%) = 1.29739 ms, percentile(95%) = 1.29834 ms, percentile(99%) = 1.29932 ms 8 | [09/02/2024-17:28:49] [I] D2H Latency: min = 0.116455 ms, max = 0.168213 ms, mean = 0.118966 ms, median = 0.118652 ms, percentile(90%) = 0.120117 ms, percentile(95%) = 0.120361 ms, percentile(99%) = 0.121094 ms 9 | [09/02/2024-17:28:49] [I] Total Host Walltime: 3.00313 s 10 | [09/02/2024-17:28:49] [I] Total GPU Compute Time: 2.99441 s 11 | [09/02/2024-17:28:49] [W] * GPU compute time is unstable, with coefficient of variance = 3.23952%. 
12 | [09/02/2024-17:28:49] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 13 | [09/02/2024-17:28:49] [I] Explanations of the performance metrics are printed in the verbose logs. 14 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv2-s-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:14:34] [I] === Performance summary === 3 | [09/02/2024-17:14:34] [I] Throughput: 1099.65 qps 4 | [09/02/2024-17:14:34] [I] Latency: min = 1.12085 ms, max = 5.30969 ms, mean = 1.17602 ms, median = 1.17322 ms, percentile(90%) = 1.18164 ms, percentile(95%) = 1.18457 ms, percentile(99%) = 1.20386 ms 5 | [09/02/2024-17:14:34] [I] Enqueue Time: min = 0.697998 ms, max = 5.14844 ms, mean = 0.865324 ms, median = 0.88208 ms, percentile(90%) = 0.89624 ms, percentile(95%) = 0.901215 ms, percentile(99%) = 0.947876 ms 6 | [09/02/2024-17:14:34] [I] H2D Latency: min = 0.202148 ms, max = 0.24292 ms, mean = 0.219057 ms, median = 0.221542 ms, percentile(90%) = 0.225708 ms, percentile(95%) = 0.227783 ms, percentile(99%) = 0.236572 ms 7 | [09/02/2024-17:14:34] [I] GPU Compute Time: min = 0.803711 ms, max = 4.93372 ms, mean = 0.818735 ms, median = 0.809937 ms, percentile(90%) = 0.811035 ms, percentile(95%) = 0.811035 ms, percentile(99%) = 0.812012 ms 8 | [09/02/2024-17:14:34] [I] D2H Latency: min = 0.112061 ms, max = 0.168457 ms, mean = 0.138235 ms, median = 0.142334 ms, percentile(90%) = 0.147095 ms, percentile(95%) = 0.149506 ms, percentile(99%) = 0.15715 ms 9 | [09/02/2024-17:14:34] [I] Total Host Walltime: 3.00186 s 10 | [09/02/2024-17:14:34] [I] Total GPU Compute Time: 2.70264 s 11 | [09/02/2024-17:14:34] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 12 | [09/02/2024-17:14:34] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 13 | [09/02/2024-17:14:34] [W] * GPU compute time is unstable, with coefficient of variance = 20.4616%. 14 | [09/02/2024-17:14:34] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/02/2024-17:14:34] [I] Explanations of the performance metrics are printed in the verbose logs. 
16 | ``` -------------------------------------------------------------------------------- /assets/yolo_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/yolo_arch.png -------------------------------------------------------------------------------- /assets/yolo_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/yolo_logo.png -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_l_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModule', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_l_d-robotics.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 # lvis 1202, coco 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModuleDRobotics', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_m_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModule', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_m_d-robotics.py: -------------------------------------------------------------------------------- 1 | _base_ = 
'../../third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 # lvis 1202, coco 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModuleDRobotics', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModule', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_s_d-robotics.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 # lvis 1202, coco 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModuleDRobotics', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/pretrain/rep_yolo_world_v2_l.py: -------------------------------------------------------------------------------- 1 | _base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py') 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_classes = 80 6 | num_training_classes = 80 7 | text_channels = 512 8 | neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] 9 | neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] 10 | 11 | # model settings 12 | model = dict(type='SimpleYOLOWorldDetector', 13 | mm_neck=True, 14 | num_train_classes=num_classes, 15 | num_test_classes=num_classes, 16 | reparameterized=True, 17 | data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), 18 | backbone=dict(_delete_=True, 19 | type='MultiModalYOLOBackbone', 20 | text_model=None, 21 | image_model={{_base_.model.backbone}}, 22 | with_text_model=False), 23 | neck=dict(type='YOLOWorldPAFPN', 24 | guide_channels=num_classes, 25 | 
embed_channels=neck_embed_channels, 26 | num_heads=neck_num_heads, 27 | block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv', 28 | guide_channels=num_classes)), 29 | bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', 30 | embed_dims=text_channels, 31 | num_guide=num_classes, 32 | num_classes=num_classes)), 33 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 34 | -------------------------------------------------------------------------------- /configs/pretrain/rep_yolo_world_v2_m.py: -------------------------------------------------------------------------------- 1 | _base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py') 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_classes = 80 6 | num_training_classes = 80 7 | text_channels = 512 8 | neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] 9 | neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] 10 | 11 | # model settings 12 | model = dict(type='SimpleYOLOWorldDetector', 13 | mm_neck=True, 14 | num_train_classes=num_classes, 15 | num_test_classes=num_classes, 16 | reparameterized=True, 17 | data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), 18 | backbone=dict(_delete_=True, 19 | type='MultiModalYOLOBackbone', 20 | text_model=None, 21 | image_model={{_base_.model.backbone}}, 22 | with_text_model=False), 23 | neck=dict(type='YOLOWorldPAFPN', 24 | guide_channels=num_classes, 25 | embed_channels=neck_embed_channels, 26 | num_heads=neck_num_heads, 27 | block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv', 28 | guide_channels=num_classes)), 29 | bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', 30 | embed_dims=text_channels, 31 | num_guide=num_classes, 32 | num_classes=num_classes)), 33 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 34 | -------------------------------------------------------------------------------- /configs/pretrain/rep_yolo_world_v2_s.py: -------------------------------------------------------------------------------- 1 | _base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_classes = 80 6 | num_training_classes = 80 7 | text_channels = 512 8 | neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] 9 | neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] 10 | 11 | # model settings 12 | model = dict(type='SimpleYOLOWorldDetector', 13 | mm_neck=True, 14 | num_train_classes=num_classes, 15 | num_test_classes=num_classes, 16 | reparameterized=True, 17 | data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), 18 | backbone=dict(_delete_=True, 19 | type='MultiModalYOLOBackbone', 20 | text_model=None, 21 | image_model={{_base_.model.backbone}}, 22 | with_text_model=False), 23 | neck=dict(type='YOLOWorldPAFPN', 24 | guide_channels=num_classes, 25 | embed_channels=neck_embed_channels, 26 | num_heads=neck_num_heads, 27 | block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv', 28 | guide_channels=num_classes)), 29 | bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', 30 | embed_dims=text_channels, 31 | num_guide=num_classes, 32 | num_classes=num_classes)), 33 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 34 | -------------------------------------------------------------------------------- 
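The `rep_*` configs above are deploy-time variants: the text branch is dropped (`text_model=None`, `with_text_model=False`) and the vocabulary is frozen to `num_classes`. A minimal sketch of the idea behind `tools/reparameterize_yoloworld.py` and `tools/reparameterize_dosod.py`, assuming the usual contrastive head where class logits are inner products between image features and L2-normalized class-text embeddings, so the embeddings can be folded into a fixed 1x1 convolution (the helper below is a hypothetical illustration, not the project's API):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def fold_text_embeds_into_conv(text_embeds: torch.Tensor) -> nn.Conv2d:
    """text_embeds: (num_classes, embed_dims) embeddings computed offline."""
    num_classes, embed_dims = text_embeds.shape
    conv = nn.Conv2d(embed_dims, num_classes, kernel_size=1, bias=False)
    with torch.no_grad():
        conv.weight.copy_(text_embeds[:, :, None, None])  # one 1x1 filter per class
    return conv

embeds = F.normalize(torch.randn(80, 512), dim=-1)  # stand-in for CLIP text features
feats = torch.randn(1, 512, 20, 20)                 # image features from the neck
logits = fold_text_embeds_into_conv(embeds)(feats)  # (1, 80, 20, 20) class-score map
```

After this folding, inference needs no text encoder, which is why the reparameterized models deploy like a plain fixed-vocabulary YOLO.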
/configs/prompt_tuning_coco/READEME.md: -------------------------------------------------------------------------------- 1 | ## Prompt Tuning for YOLO-World 2 | 3 | ### NOTE: 4 | 5 | This folder contains many experimental config files, which will be removed later! 6 | 7 | ### Experimental Results 8 | 9 | | Model | Config | AP | AP50 | AP75 | APS | APM | APL | 10 | | :---- | :----: | :--: | :--: | :---: | :-: | :-: | :-: | 11 | | YOLO-World-v2-L | Zero-shot | 45.7 | 61.6 | 49.8 | 29.9 | 50.0 | 60.8 | 12 | | [YOLO-World-v2-L](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py) | Prompt tuning | 47.9 | 64.3 | 52.5 | 31.9 | 52.6 | 61.3 | 13 | -------------------------------------------------------------------------------- /data/texts/coco_class_texts.json: -------------------------------------------------------------------------------- 1 | [["person"], ["bicycle"], ["car"], ["motorcycle"], ["airplane"], ["bus"], ["train"], ["truck"], ["boat"], ["traffic light"], ["fire hydrant"], ["stop sign"], ["parking meter"], ["bench"], ["bird"], ["cat"], ["dog"], ["horse"], ["sheep"], ["cow"], ["elephant"], ["bear"], ["zebra"], ["giraffe"], ["backpack"], ["umbrella"], ["handbag"], ["tie"], ["suitcase"], ["frisbee"], ["skis"], ["snowboard"], ["sports ball"], ["kite"], ["baseball bat"], ["baseball glove"], ["skateboard"], ["surfboard"], ["tennis racket"], ["bottle"], ["wine glass"], ["cup"], ["fork"], ["knife"], ["spoon"], ["bowl"], ["banana"], ["apple"], ["sandwich"], ["orange"], ["broccoli"], ["carrot"], ["hot dog"], ["pizza"], ["donut"], ["cake"], ["chair"], ["couch"], ["potted plant"], ["bed"], ["dining table"], ["toilet"], ["tv"], ["laptop"], ["mouse"], ["remote"], ["keyboard"], ["cell phone"], ["microwave"], ["oven"], ["toaster"], ["sink"], ["refrigerator"], ["book"], ["clock"], ["vase"], ["scissors"], ["teddy bear"], ["hair drier"], ["toothbrush"]] 2 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | ## YOLO-World Demo 2 | 3 | ### Getting Started 4 | 5 | Set `PYTHONPATH` to the `YOLO-World` root directory and run: 6 | 7 | ```bash 8 | PYTHONPATH=/xxxx/YOLO-World python demo/yyyy_demo.py 9 | # or directly 10 | PYTHONPATH=./ python demo/yyyy_demo.py 11 | ``` 12 | 13 | #### Gradio Demo 14 | 15 | We provide the [Gradio](https://www.gradio.app/) demo for local devices: 16 | 17 | ```bash 18 | pip install gradio==4.16.0 19 | python demo/gradio_demo.py path/to/config path/to/weights 20 | ``` 21 | 22 | Additionally, you can use the Dockerfile to build an image with gradio. As a prerequisite, make sure you have the respective NVIDIA drivers installed alongside [nvidia-container-runtime](https://stackoverflow.com/questions/59691207/docker-build-with-nvidia-runtime). Replace MODEL_NAME and WEIGHT_NAME with the desired values, or omit these build args to use the default values from the [Dockerfile](Dockerfile#3). 23 | 24 | ```bash 25 | docker build --build-arg="MODEL=MODEL_NAME" --build-arg="WEIGHT=WEIGHT_NAME" -t yolo_demo . 26 | docker run --runtime nvidia -p 8080:8080 yolo_demo 27 | ``` 28 | 29 | #### Image Demo 30 | 31 | We provide a simple image demo that runs inference on images and saves the visualized outputs. 32 | 33 | ```bash 34 | python demo/image_demo.py path/to/config path/to/weights image/path/directory 'person,dog,cat' --topk 100 --threshold 0.005 --output-dir demo_outputs 35 | ``` 36 | 37 | **Notes:** 38 | * The `image` can be a directory or a single image. 39 | * The `texts` argument is a comma-separated string of categories (noun phrases). A `txt` file in which each line contains one category (noun phrase) is also supported; see the sketch below. 40 | * The `topk` and `threshold` arguments control the maximum number of predictions and the confidence threshold.
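How the comma-separated `texts` argument maps to the nested-list vocabulary the demos feed the model, as a minimal sketch (the helper name is hypothetical; the trailing `[' ']` padding entry matches the usage in `demo/simple_demo.py`):

```python
def parse_texts(arg: str) -> list:
    """'person,dog,cat' -> [['person'], ['dog'], ['cat'], [' ']]"""
    if arg.endswith('.txt'):
        with open(arg) as f:
            names = [line.strip() for line in f if line.strip()]
    else:
        names = [t.strip() for t in arg.split(',')]
    return [[n] for n in names] + [[' ']]
```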
41 | 42 | 43 | #### Video Demo 44 | 45 | The `video_demo` shares its hyper-parameters with `image_demo`. 46 | 47 | ```bash 48 | python demo/video_demo.py path/to/config path/to/weights video_path 'person,dog' --out out_video_path 49 | ``` 50 | 51 | ### FAQ 52 | 53 | > 1. `Failed to custom import!` 54 | ```bash 55 | File "simple_demo.py", line 37, in <module> 56 | cfg = Config.fromfile(config_file) 57 | File "/data/miniconda3/envs/det/lib/python3.8/site-packages/mmengine/config/config.py", line 183, in fromfile 58 | raise ImportError('Failed to custom import!') from e 59 | ImportError: Failed to custom import! 60 | ``` 61 | **Solution:** 62 | 63 | ```bash 64 | PYTHONPATH=/xxxx/YOLO-World python demo/simple_demo.py 65 | ``` -------------------------------------------------------------------------------- /demo/sample_images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/demo/sample_images/bus.jpg -------------------------------------------------------------------------------- /demo/sample_images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/demo/sample_images/zidane.jpg -------------------------------------------------------------------------------- /demo/simple_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tencent Inc. All rights reserved.
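# What this script does: build a YOLO-World detector from a pretrain config and
# checkpoint, swap the first test-pipeline transform so in-memory arrays are
# accepted, then run open-vocabulary detection for the noun phrases in `texts`
# (note the trailing [' '] padding entry, matching the vocabulary format used
# throughout the demos).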
2 | import os.path as osp 3 | 4 | import cv2 5 | import torch 6 | from mmengine.config import Config 7 | from mmengine.dataset import Compose 8 | from mmdet.apis import init_detector 9 | from mmdet.utils import get_test_pipeline_cfg 10 | 11 | 12 | def inference(model, image, texts, test_pipeline, score_thr=0.3, max_dets=100): 13 | image = cv2.imread(image) 14 | image = image[:, :, [2, 1, 0]]  # BGR to RGB 15 | data_info = dict(img=image, img_id=0, texts=texts) 16 | data_info = test_pipeline(data_info) 17 | data_batch = dict(inputs=data_info['inputs'].unsqueeze(0), 18 | data_samples=[data_info['data_samples']]) 19 | with torch.no_grad(): 20 | output = model.test_step(data_batch)[0] 21 | pred_instances = output.pred_instances 22 | # score thresholding 23 | pred_instances = pred_instances[pred_instances.scores.float() > score_thr] 24 | # max detections 25 | if len(pred_instances.scores) > max_dets: 26 | indices = pred_instances.scores.float().topk(max_dets)[1] 27 | pred_instances = pred_instances[indices] 28 | 29 | pred_instances = pred_instances.cpu().numpy() 30 | boxes = pred_instances['bboxes'] 31 | labels = pred_instances['labels'] 32 | scores = pred_instances['scores'] 33 | label_texts = [texts[x][0] for x in labels] 34 | return boxes, labels, label_texts, scores 35 | 36 | 37 | if __name__ == "__main__": 38 | 39 | config_file = "configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py" 40 | checkpoint = "weights/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth" 41 | 42 | cfg = Config.fromfile(config_file) 43 | cfg.work_dir = osp.join('./work_dirs') 44 | # init model 45 | cfg.load_from = checkpoint 46 | model = init_detector(cfg, checkpoint=checkpoint, device='cuda:0') 47 | test_pipeline_cfg = get_test_pipeline_cfg(cfg=cfg) 48 | test_pipeline_cfg[0].type = 'mmdet.LoadImageFromNDArray' 49 | test_pipeline = Compose(test_pipeline_cfg) 50 | 51 | texts = [['person'], ['bus'], [' ']] 52 | image = "demo/sample_images/bus.jpg" 53 | print(f"starting to detect: {image}") 54 | results = inference(model, image, texts, test_pipeline) 55 | format_str = [ 56 | f"obj-{idx}: {box}, label-{lbl}, class-{lbl_text}, score-{score}" 57 | for idx, (box, lbl, lbl_text, score) in enumerate(zip(*results)) 58 | ] 59 | print("detecting results:") 60 | for q in format_str: 61 | print(q) 62 | -------------------------------------------------------------------------------- /deploy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/deploy/__init__.py -------------------------------------------------------------------------------- /deploy/easydeploy/README.md: -------------------------------------------------------------------------------- 1 | # MMYOLO Model Easy-Deployment 2 | 3 | ## Introduction 4 | 5 | This project makes it easy to convert your MMYOLO models to other inference backends without the need for MMDeploy, saving the time and effort of getting familiar with it. 6 | 7 | Currently we support converting to the `ONNX` and `TensorRT` formats; other inference backends such as `ncnn` will be added to this project as well.
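For orientation, the core of such a conversion is tracing the model with a dummy input via `torch.onnx.export`. A generic, minimal sketch follows (the network below is a stand-in, not this project's `export_onnx.py`; see [Model Convert](docs/model_convert.md) for the supported workflow):

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.SiLU())  # placeholder for a real detector
model.eval()
dummy = torch.randn(1, 3, 640, 640)  # NCHW input at the deployment resolution
torch.onnx.export(
    model, dummy, 'model.onnx',
    opset_version=11,
    input_names=['images'],
    output_names=['feats'],
    dynamic_axes={'images': {0: 'batch'}})  # optional: dynamic batch dimension
```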
8 | 9 | ## Supported Backends 10 | 11 | - [Model Convert](docs/model_convert.md) 12 | -------------------------------------------------------------------------------- /deploy/easydeploy/README_zh-CN.md: -------------------------------------------------------------------------------- 1 | # MMYOLO Model Conversion 2 | 3 | ## Introduction 4 | 5 | This project exists as a standalone deployment project for MMYOLO. It is intended to be decoupled from the current MMDeploy system and to support, on its own, model conversion and deployment after training, reducing users' learning and engineering costs. 6 | 7 | Conversion to the ONNX and TensorRT formats is currently supported; other inference platforms will be supported later. 8 | 9 | ## Conversion Tutorial 10 | 11 | - [Model Convert](docs/model_convert.md) 12 | -------------------------------------------------------------------------------- /deploy/easydeploy/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .common import DeployC2f 3 | from .focus import DeployFocus, GConvFocus, NcnnFocus 4 | 5 | __all__ = ['DeployFocus', 'NcnnFocus', 'GConvFocus', 'DeployC2f'] 6 | -------------------------------------------------------------------------------- /deploy/easydeploy/backbone/common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch import Tensor 4 | 5 | 6 | class DeployC2f(nn.Module):  # export-friendly C2f: swapped in for a trained C2f module, reusing its main_conv/mid_channels/blocks/final_conv attributes 7 | 8 | def __init__(self, *args, **kwargs): 9 | super().__init__() 10 | 11 | def forward(self, x: Tensor) -> Tensor: 12 | x_main = self.main_conv(x) 13 | x_main = [x_main, x_main[:, self.mid_channels:, ...]] 14 | x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) 15 | x_main.pop(1)  # drop the duplicated split before concatenation 16 | return self.final_conv(torch.cat(x_main, 1)) 17 | -------------------------------------------------------------------------------- /deploy/easydeploy/bbox_code/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .bbox_coder import (rtmdet_bbox_decoder, yolov5_bbox_decoder, 3 | yolox_bbox_decoder) 4 | 5 | __all__ = ['yolov5_bbox_decoder', 'rtmdet_bbox_decoder', 'yolox_bbox_decoder'] 6 | -------------------------------------------------------------------------------- /deploy/easydeploy/bbox_code/bbox_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Optional 3 | 4 | import torch 5 | from torch import Tensor 6 | 7 | 8 | def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, 9 | stride: Tensor) -> Tensor:  # decodes to (cx, cy, w, h) 10 | bbox_preds = bbox_preds.sigmoid() 11 | 12 | x_center = (priors[..., 0] + priors[..., 2]) * 0.5 13 | y_center = (priors[..., 1] + priors[..., 3]) * 0.5 14 | w = priors[..., 2] - priors[..., 0] 15 | h = priors[..., 3] - priors[..., 1] 16 | 17 | x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center 18 | y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center 19 | w_pred = (bbox_preds[..., 2] * 2)**2 * w 20 | h_pred = (bbox_preds[..., 3] * 2)**2 * h 21 | 22 | decoded_bboxes = torch.stack( 23 | [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) 24 | 25 | return decoded_bboxes 26 | 27 | 28 | def rtmdet_bbox_decoder(priors: Tensor, bbox_preds: Tensor, 29 | stride: Optional[Tensor]) -> Tensor:  # decodes to (x1, y1, x2, y2) 30 | stride = stride[None, :, None] 31 | bbox_preds *= stride 32 | tl_x = (priors[..., 0] - bbox_preds[..., 0]) 33 | tl_y = (priors[..., 1] - bbox_preds[..., 1]) 34 | br_x = (priors[..., 0] + bbox_preds[..., 2]) 35 | br_y = (priors[..., 1] + bbox_preds[..., 3]) 36 | decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) 37 | return decoded_bboxes 38 | 39 | 40 | def yolox_bbox_decoder(priors: Tensor, bbox_preds: Tensor, 41 | stride: Optional[Tensor]) -> Tensor:  # decodes to (cx, cy, w, h) 42 | stride = stride[None, :, None] 43 | xys = (bbox_preds[..., :2] * stride) + priors 44 | whs = bbox_preds[..., 2:].exp() * stride 45 | decoded_bboxes = torch.cat([xys, whs], -1) 46 | return decoded_bboxes 47 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | 3 | set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 72 75 86) 4 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 5 | 6 | project(nvdsparsebbox_mmyolo LANGUAGES CXX) 7 | 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O3 -g -Wall -Werror -shared -fPIC") 9 | set(CMAKE_CXX_STANDARD 14) 10 | set(CMAKE_BUILD_TYPE Release) 11 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 12 | 13 | # CUDA 14 | find_package(CUDA REQUIRED) 15 | 16 | # TensorRT 17 | set(TensorRT_INCLUDE_DIRS "/usr/include/x86_64-linux-gnu" CACHE STRING "TensorRT headers path") 18 | set(TensorRT_LIBRARIES "/usr/lib/x86_64-linux-gnu" CACHE STRING "TensorRT libs path") 19 | 20 | # DeepStream 21 | set(DEEPSTREAM "/opt/nvidia/deepstream/deepstream" CACHE STRING "DeepStream root path") 22 | set(DS_LIBRARIES ${DEEPSTREAM}/lib) 23 | set(DS_INCLUDE_DIRS ${DEEPSTREAM}/sources/includes) 24 | 25 | include_directories( 26 | ${CUDA_INCLUDE_DIRS} 27 | ${TensorRT_INCLUDE_DIRS} 28 | ${DS_INCLUDE_DIRS}) 29 | 30 | add_library( 31 | ${PROJECT_NAME} 32 | SHARED 33 | custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp) 34 | 35 | target_link_libraries(${PROJECT_NAME} PRIVATE nvinfer nvinfer_plugin) 36 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/README.md: -------------------------------------------------------------------------------- 1 | # Inference MMYOLO Models with DeepStream 2 | 3 | This project demonstrates how to run inference on MMYOLO models with customized parsers in [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk). 4 | 5 | ## Pre-requisites 6 | 7 | ### 1.
Install Nvidia Driver and CUDA 8 | 9 | First, please follow the official documents and instructions to install the dedicated NVIDIA graphics driver and a CUDA version matched to your GPU and target NVIDIA AIoT devices. 10 | 11 | ### 2. Install DeepStream SDK 12 | 13 | Second, please follow the official instructions to download and install the DeepStream SDK. The current stable version of DeepStream is v6.2. 14 | 15 | ### 3. Generate TensorRT Engine 16 | 17 | As DeepStream builds on top of several NVIDIA libraries, you need to first convert your trained MMYOLO models to TensorRT engine files. We strongly recommend trying the supported TensorRT deployment solution in [EasyDeploy](../../easydeploy/). 18 | 19 | ## Build and Run 20 | 21 | Please make sure that your converted TensorRT engine is already located in the `deepstream` folder as the config shows. Create your own model config files and change the `config-file` parameter in [deepstream_app_config.txt](deepstream_app_config.txt) to the model you want to run with. 22 | 23 | ```bash 24 | mkdir build && cd build 25 | cmake .. 26 | make -j$(nproc) && make install 27 | ``` 28 | 29 | Then you can run inference with this command. 30 | 31 | ```bash 32 | deepstream-app -c deepstream_app_config.txt 33 | ``` 34 | 35 | ## Code Structure 36 | 37 | ```bash 38 | ├── deepstream 39 | │ ├── configs # config files for MMYOLO models 40 | │ │ └── config_infer_rtmdet.txt 41 | │ ├── custom_mmyolo_bbox_parser # customized parser for MMYOLO models to DeepStream formats 42 | │ │ └── nvdsparsebbox_mmyolo.cpp 43 | │ ├── CMakeLists.txt 44 | │ ├── coco_labels.txt # labels for coco detection 45 | │ ├── deepstream_app_config.txt # DeepStream reference app configs for MMYOLO models 46 | │ ├── README_zh-CN.md 47 | │ └── README.md 48 | ``` 49 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/README_zh-CN.md: -------------------------------------------------------------------------------- 1 | # Inference MMYOLO Models with DeepStream SDK 2 | 3 | This project demonstrates how to use the [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk) together with a customized parser to run inference on MMYOLO models. 4 | 5 | ## Pre-requisites 6 | 7 | ### 1. Install Nvidia Driver and CUDA 8 | 9 | First, please install the graphics driver and CUDA according to your current GPU and the target device. 10 | 11 | ### 2. Install DeepStream SDK 12 | 13 | The stable version of the DeepStream SDK is currently v6.2, which is the officially recommended version. 14 | 15 | ### 3. Convert MMYOLO Models to TensorRT Engines 16 | 17 | We recommend using the TensorRT solution in EasyDeploy to convert and deploy the target model; see [this document](../../easydeploy/docs/model_convert.md) for details. 18 | 19 | ## Build and Run 20 | 21 | This project currently uses MMYOLO's rtmdet model. To use other models, please adapt the config files in this directory. Then place the converted TensorRT engine in the current directory and run the following commands: 22 | 23 | ```bash 24 | mkdir build && cd build 25 | cmake ..
26 | make -j$(nproc) && make install 27 | ``` 28 | 29 | After building, you can run inference with the following command: 30 | 31 | ```bash 32 | deepstream-app -c deepstream_app_config.txt 33 | ``` 34 | 35 | ## Code Structure 36 | 37 | ```bash 38 | ├── deepstream 39 | │ ├── configs # DeepStream configs for MMYOLO models 40 | │ │ └── config_infer_rtmdet.txt 41 | │ ├── custom_mmyolo_bbox_parser # parser adapted to DeepStream formats 42 | │ │ └── nvdsparsebbox_mmyolo.cpp 43 | │ ├── CMakeLists.txt 44 | │ ├── coco_labels.txt # coco labels 45 | │ ├── deepstream_app_config.txt # DeepStream app config 46 | │ ├── README_zh-CN.md 47 | │ └── README.md 48 | ``` 49 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/coco_labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/configs/config_infer_rtmdet.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.01735207357279195 4 | offsets=57.375;57.12;58.395 5 | model-color-format=1 6 | model-engine-file=../end2end.engine 7 | labelfile-path=../coco_labels.txt 8 | batch-size=1 9 | network-mode=0 10 | num-detected-classes=80 11 | interval=0 12 | gie-unique-id=1 13 | process-mode=1 14 | network-type=0 15 | cluster-mode=2 16 | maintain-aspect-ratio=1 17 | parse-bbox-func-name=NvDsInferParseCustomMMYOLO 18 | custom-lib-path=../build/libnvdsparsebbox_mmyolo.so 19 | 20 | [class-attrs-all] 21 | pre-cluster-threshold=0.45 22 | topk=100 23 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/configs/config_infer_yolov5.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | model-engine-file=../end2end.engine 6 | labelfile-path=../coco_labels.txt 7 | batch-size=1 8 | network-mode=0 9 | num-detected-classes=80 10 | interval=0 11 | gie-unique-id=1 12 | process-mode=1 13 | network-type=0 14 | cluster-mode=2 15 | maintain-aspect-ratio=1 16 | parse-bbox-func-name=NvDsInferParseCustomMMYOLO 17 | custom-lib-path=../build/libnvdsparsebbox_mmyolo.so 18 | 19 | [class-attrs-all] 20 | pre-cluster-threshold=0.45 21 | topk=100 22 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/configs/config_infer_yolov8.txt:
-------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | model-engine-file=../end2end.engine 6 | labelfile-path=../coco_labels.txt 7 | batch-size=1 8 | network-mode=0 9 | num-detected-classes=80 10 | interval=0 11 | gie-unique-id=1 12 | process-mode=1 13 | network-type=0 14 | cluster-mode=2 15 | maintain-aspect-ratio=1 16 | parse-bbox-func-name=NvDsInferParseCustomMMYOLO 17 | custom-lib-path=../build/libnvdsparsebbox_mmyolo.so 18 | 19 | [class-attrs-all] 20 | pre-cluster-threshold=0.45 21 | topk=100 22 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/deepstream_app_config.txt: -------------------------------------------------------------------------------- 1 | [application] 2 | enable-perf-measurement=1 3 | perf-measurement-interval-sec=5 4 | 5 | [tiled-display] 6 | enable=1 7 | rows=1 8 | columns=1 9 | width=1280 10 | height=720 11 | gpu-id=0 12 | nvbuf-memory-type=0 13 | 14 | [source0] 15 | enable=1 16 | type=3 17 | uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4 18 | num-sources=1 19 | gpu-id=0 20 | cudadec-memtype=0 21 | 22 | [sink0] 23 | enable=1 24 | type=2 25 | sync=0 26 | gpu-id=0 27 | nvbuf-memory-type=0 28 | 29 | [osd] 30 | enable=1 31 | gpu-id=0 32 | border-width=5 33 | text-size=15 34 | text-color=1;1;1;1; 35 | text-bg-color=0.3;0.3;0.3;1 36 | font=Serif 37 | show-clock=0 38 | clock-x-offset=800 39 | clock-y-offset=820 40 | clock-text-size=12 41 | clock-color=1;0;0;0 42 | nvbuf-memory-type=0 43 | 44 | [streammux] 45 | gpu-id=0 46 | live-source=0 47 | batch-size=1 48 | batched-push-timeout=40000 49 | width=1920 50 | height=1080 51 | enable-padding=0 52 | nvbuf-memory-type=0 53 | 54 | [primary-gie] 55 | enable=1 56 | gpu-id=0 57 | gie-unique-id=1 58 | nvbuf-memory-type=0 59 | config-file=configs/config_infer_rtmdet.txt 60 | 61 | [tests] 62 | file-loop=0 63 | -------------------------------------------------------------------------------- /deploy/easydeploy/examples/cv2_nms.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union 2 | 3 | import cv2 4 | from numpy import ndarray 5 | 6 | MAJOR, MINOR = map(int, cv2.__version__.split('.')[:2]) 7 | assert MAJOR == 4 8 | 9 | 10 | def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]], 11 | scores: Union[List[float], Tuple[float]], 12 | labels: Union[List[int], Tuple[int]], 13 | conf_thres: float = 0.25, 14 | iou_thres: float = 0.65) -> Tuple[List, List, List]: 15 | if MINOR >= 7: 16 | indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres, 17 | iou_thres) 18 | elif MINOR == 6: 19 | indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres) 20 | else: 21 | indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, 22 | iou_thres).flatten() 23 | 24 | nmsd_boxes = [] 25 | nmsd_scores = [] 26 | nmsd_labels = [] 27 | for idx in indices: 28 | box = boxes[idx] 29 | # x0y0wh -> x0y0x1y1 30 | box[2:] = box[:2] + box[2:] 31 | score = scores[idx] 32 | label = labels[idx] 33 | nmsd_boxes.append(box) 34 | nmsd_scores.append(score) 35 | nmsd_labels.append(label) 36 | return nmsd_boxes, nmsd_scores, nmsd_labels 37 | -------------------------------------------------------------------------------- /deploy/easydeploy/examples/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing 
import List, Tuple, Union 2 | 3 | import cv2 4 | import numpy as np 5 | from config import ModelType 6 | from numpy import ndarray 7 | 8 | 9 | class Preprocess: 10 | 11 | def __init__(self, model_type: ModelType): 12 | if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7, 13 | ModelType.YOLOV8): 14 | mean = np.array([0, 0, 0], dtype=np.float32) 15 | std = np.array([255, 255, 255], dtype=np.float32) 16 | is_rgb = True 17 | elif model_type == ModelType.YOLOX: 18 | mean = np.array([0, 0, 0], dtype=np.float32) 19 | std = np.array([1, 1, 1], dtype=np.float32) 20 | is_rgb = False 21 | elif model_type == ModelType.PPYOLOE: 22 | mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) 23 | std = np.array([58.395, 57.12, 57.375], dtype=np.float32) 24 | is_rgb = True 25 | 26 | elif model_type == ModelType.PPYOLOEP: 27 | mean = np.array([0, 0, 0], dtype=np.float32) 28 | std = np.array([255, 255, 255], dtype=np.float32) 29 | is_rgb = True 30 | elif model_type == ModelType.RTMDET: 31 | mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)  # ImageNet mean/std in BGR order 32 | std = np.array([57.375, 57.12, 58.395], dtype=np.float32) 33 | is_rgb = False 34 | else: 35 | raise NotImplementedError 36 | 37 | self.mean = mean.reshape((3, 1, 1)) 38 | self.std = std.reshape((3, 1, 1)) 39 | self.is_rgb = is_rgb 40 | 41 | def __call__(self, 42 | image: ndarray, 43 | new_size: Union[List[int], Tuple[int]] = (640, 640), 44 | **kwargs) -> Tuple[ndarray, Tuple[float, float]]: 45 | # new_size: (height, width) 46 | height, width = image.shape[:2] 47 | ratio_h, ratio_w = new_size[0] / height, new_size[1] / width 48 | image = cv2.resize( 49 | image, (0, 0), 50 | fx=ratio_w, 51 | fy=ratio_h, 52 | interpolation=cv2.INTER_LINEAR) 53 | image = np.ascontiguousarray(image.transpose(2, 0, 1))  # HWC -> CHW 54 | image = image.astype(np.float32) 55 | image -= self.mean 56 | image /= self.std 57 | return image[np.newaxis], (ratio_w, ratio_h) 58 | -------------------------------------------------------------------------------- /deploy/easydeploy/examples/requirements.txt: -------------------------------------------------------------------------------- 1 | onnxruntime 2 | opencv-python==4.7.0.72 3 | -------------------------------------------------------------------------------- /deploy/easydeploy/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .backend import MMYOLOBackend 3 | from .backendwrapper import ORTWrapper, TRTWrapper 4 | from .model import DeployModel 5 | 6 | __all__ = ['DeployModel', 'TRTWrapper', 'ORTWrapper', 'MMYOLOBackend'] 7 | -------------------------------------------------------------------------------- /deploy/easydeploy/model/backend.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | class MMYOLOBackend(Enum): 8 | AX620A = 'ax620a' 9 | COREML = 'coreml' 10 | HORIZONX3 = 'horizonx3' 11 | NCNN = 'ncnn' 12 | ONNXRUNTIME = 'onnxruntime' 13 | OPENVINO = 'openvino' 14 | PPLNN = 'pplnn' 15 | RKNN = 'rknn' 16 | TENSORRT8 = 'tensorrt8' 17 | TENSORRT7 = 'tensorrt7' 18 | TORCHSCRIPT = 'torchscript' 19 | TVM = 'tvm' 20 | 21 | 22 | def HSigmoid__forward(self, x: torch.Tensor) -> torch.Tensor: 23 | return F.hardsigmoid(x, inplace=True)  # export-friendly replacement forward for HSigmoid 24 | -------------------------------------------------------------------------------- /deploy/easydeploy/nms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .ort_nms import onnx_nms 3 | from .trt_nms import batched_nms, efficient_nms 4 | 5 | __all__ = ['efficient_nms', 'batched_nms', 'onnx_nms'] 6 | -------------------------------------------------------------------------------- /deploy/easydeploy/onnx_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/deploy/easydeploy/onnx_demo.py -------------------------------------------------------------------------------- /docs/deploy.md: -------------------------------------------------------------------------------- 1 | ## Deploy YOLO-World 2 | 3 | - [x] ONNX export 4 | - [x] ONNX demo 5 | - [ ] TensorRT 6 | - [ ] TFLite 7 | 8 | We provide several ways to deploy YOLO-World with ONNX or TensorRT. 9 | 10 | ### Preliminaries 11 | 12 | ```bash 13 | pip install supervision onnx onnxruntime onnxsim 14 | ``` 15 | 16 | ### Export ONNX on Gradio Demo 17 | 18 | Start `demo.py`; you can modify the texts in the demo and export the ONNX model. 19 | 20 | ```bash 21 | python demo.py path/to/config path/to/weights 22 | ``` 23 | 24 | ### Export YOLO-World to ONNX models 25 | 26 | You can also use [`export_onnx.py`](../deploy/export_onnx.py) to obtain the ONNX model. You can specify `--custom-text` with your own `Text JSON` for your custom prompts. The format of `Text JSON` can be found in [`docs/data`](../docs/data.md); a minimal example is sketched below.
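A minimal sketch of generating such a `Text JSON` (the categories here are only placeholders; the nested-list format mirrors the `texts` variable in `demo/simple_demo.py`, where each category is a list of noun phrases):

```python
import json

# Each entry is one category; each category is a list of noun phrases.
custom_texts = [["person"], ["bus"], ["red traffic light"]]
with open("custom_texts.json", "w") as f:
    json.dump(custom_texts, f)
```

Pass the resulting file to `--custom-text` in the commands below.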
27 | 28 | ```bash 29 | PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 30 | ``` 31 | 32 | If you don't want to include `NMS` or "post-processing" in the ONNX model, you can add `--without-nms`: 33 | ```bash 34 | PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-nms 35 | ``` 36 | 37 | If you want to quantize the YOLO-World ONNX model, you should remove the `NMS` and `bbox_decoder` by adding `--without-bbox-decoder`: 38 | 39 | ```bash 40 | PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-bbox-decoder 41 | ``` 42 | 43 | **Running ONNX demo** 44 | 45 | ```bash 46 | python deploy/onnx_demo.py path/to/model.onnx path/to/images path/to/texts 47 | ``` 48 | 49 | 50 | ### Export YOLO-World to TensorRT models 51 | 52 | Coming soon. 53 | 54 | ### FAQ 55 | 56 | **Q1**. `RuntimeError: Exporting the operator einsum to ONNX opset version 11 is not supported. Support for this operator was added in version 12, try exporting with this version.` 57 | 58 | **A:** This error arises because YOLO-World adopts `einsum` for matrix multiplication, which is not supported by `opset 11`. You can raise `--opset` from `11` to `12` if your device supports it, or replace the `einsum` with normal `permute/reshape/multiplication` operations by setting `use_einsum=False` in the `MaxSigmoidCSPLayerWithTwoConv` and `YOLOWorldHeadModule`. You can refer to the [sample config](../configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) without einsum. 59 | 60 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | ## Frequently Asked Questions (FAQ) 2 | 3 | 4 | 1. `Incorrect path_or_model_id` 5 | ```bash 6 | OSError: class `YOLOWorldDetector` in yolo_world/models/detectors/yolo_world.py: class `MultiModalYOLOBackbone` in yolo_world/models/backbones/mm_backbone.py: class `HuggingCLIPLanguageBackbone` in yolo_world/models/backbones/mm_backbone.py: Incorrect path_or_model_id: '../pretrained_models/clip-vit-base-patch32-projection'. Please provide either the path to a local folder or the repo_id of a model on the Hub. 7 | ``` 8 | 9 | **Solution:** the config points to a CLIP text encoder that cannot be found. Download the CLIP model (e.g., `openai/clip-vit-base-patch32`) from the HuggingFace Hub in advance and set the text backbone's `model_name` to the local folder, or set it directly to the Hub repo id. -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | ## Installation Guide 2 | 3 | YOLO-World is built on `pytorch=1.11.0` and `mmcv=2.0.0`. 4 | 5 | We provide the `requirements` files in [./requirements](./../requirements/): 6 | 7 | * `basic_requirements`: training, finetuning, evaluation. 8 | * `demo_requirements`: running YOLO-World [demos](./../demo/). 9 | * `onnx_requirements`: converting YOLO-World to ONNX or TFLite models (TFLite is coming soon). 10 | 11 | #### Install `MMCV` 12 | 13 | YOLO-World adopts `mmcv>=2.0.0`. There are several ways to install `mmcv`: 14 | 15 | **1. using `openmim`**: 16 | 17 | see more in [official guide](https://github.com/open-mmlab/mmcv/tree/master?tab=readme-ov-file#install-mmcv-full). 18 | 19 | ```bash 20 | pip install openmim 21 | mim install mmcv==2.0.0 22 | ``` 23 | 24 | **2.
using `pip`**: 25 | 26 | go to [install-with-pip](https://mmcv.readthedocs.io/en/latest/get_started/installation.html#install-with-pip) to select the pip index. 27 | 28 | ```bash 29 | # cuda=11.3, torch=1.11 30 | pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html 31 | # cuda=11.7, torch=1.13 32 | pip install mmcv==2.2.0 -f https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html 33 | # cuda=12.1, torch=2.1 34 | pip install mmcv==2.1.0 -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html 35 | ``` 36 | 37 | **3. using `whl`** 38 | 39 | go to [index packages](https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html) to find a suitable version and download. 40 | 41 | ```bash 42 | pip install mmcv-2.0.1-cp38-cp38-manylinux1_x86_64.whl 43 | ``` -------------------------------------------------------------------------------- /docs/updates.md: -------------------------------------------------------------------------------- 1 | ## Update Notes 2 | 3 | We provide the details for important updates of YOLO-World in this note. 4 | 5 | ### Model Architecture 6 | 7 | **[2024-2-29]:** YOLO-World-v2: 8 | 9 | 1. We remove the `I-PoolingAttention`: though it improves the performance for zero-shot LVIS evaluation, it affects the inference speeds after exporting YOLO-World to ONNX or TensorRT. Considering the trade-off, we remove the `I-PoolingAttention` in the newest version. 10 | 2. We replace the `L2-Norm` in the contrastive head with the `BatchNorm`. The `L2-Norm` contains complex operations, such as `reduce`, which is time-consuming for deployment. However, the `BatchNorm` can be fused into the convolution, which is much more efficient and also improves the zero-shot performance. 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools","wheel","torch"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "yolo_world" 7 | version = "0.1.0" 8 | description = "YOLO-World: Real-time Open Vocabulary Object Detection" 9 | readme = "README.md" 10 | keywords = ["object detection"] 11 | authors = [ 12 | { name = "Tencent AILab", email = "ronnysong@tencent.com" }, 13 | ] 14 | license = {text = "Apache License 2.0"} 15 | 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "License :: OSI Approved :: Apache Software License", 19 | "Operating System :: OS Independent", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 27 | ] 28 | requires-python = ">= 3.7" 29 | 30 | dependencies = [ 31 | "wheel", 32 | "torch>=1.11.0", 33 | "torchvision>=0.16.2", 34 | "transformers", 35 | "tokenizers", 36 | "numpy", 37 | "opencv-python", 38 | "supervision==0.19.0", 39 | "openmim", 40 | "mmcv-lite>=2.0.0rc4", 41 | "mmdet==3.0.0", 42 | "mmengine>=0.7.1", 43 | "openmim", 44 | "mmcv", 45 | 46 | ] 47 | 48 | [tool.setuptools] 49 | package-dir = {"yolo_world" = "yolo_world"} 50 | include-package-data = false 51 | license-files = ["LICENSE"] 52 | zip-safe = true 53 | 54 | [tool.setuptools.packages.find] 55 | include = ["yolo_world*"] 56 | exclude = 
["docs*", "tests*","third_party*","assets*"] -------------------------------------------------------------------------------- /requirements/basic_requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.9.0.80 2 | opencv-python-headless==4.2.0.34 3 | mmcv==2.0.0 4 | mmdet==3.0.0 5 | mmengine==0.10.3 6 | mmyolo==0.6.0 7 | timm==0.6.13 8 | transformers==4.36.2 9 | albumentations -------------------------------------------------------------------------------- /requirements/demo_requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==4.16.0 2 | supervision -------------------------------------------------------------------------------- /requirements/onnx_requirements.txt: -------------------------------------------------------------------------------- 1 | supervision 2 | onnx 3 | onnxruntime 4 | onnxsim -------------------------------------------------------------------------------- /third_party/mmyolo/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmyolo' 2 | 3 | default_hooks = dict( 4 | timer=dict(type='IterTimerHook'), 5 | logger=dict(type='LoggerHook', interval=50), 6 | param_scheduler=dict(type='ParamSchedulerHook'), 7 | checkpoint=dict(type='CheckpointHook', interval=1), 8 | sampler_seed=dict(type='DistSamplerSeedHook'), 9 | visualization=dict(type='mmdet.DetVisualizationHook')) 10 | 11 | env_cfg = dict( 12 | cudnn_benchmark=False, 13 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 14 | dist_cfg=dict(backend='nccl'), 15 | ) 16 | 17 | vis_backends = [dict(type='LocalVisBackend')] 18 | visualizer = dict( 19 | type='mmdet.DetLocalVisualizer', 20 | vis_backends=vis_backends, 21 | name='visualizer') 22 | log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) 23 | 24 | log_level = 'INFO' 25 | load_from = None 26 | resume = False 27 | 28 | # Example to use different file client 29 | # Method 1: simply set the data root and let the file I/O module 30 | # automatically infer from prefix (not support LMDB and Memcache yet) 31 | 32 | # data_root = 's3://openmmlab/datasets/detection/coco/' 33 | 34 | # Method 2: Use `backend_args`, `file_client_args` in versions 35 | # before MMDet 3.0.0rc6 36 | # backend_args = dict( 37 | # backend='petrel', 38 | # path_mapping=dict({ 39 | # './data/': 's3://openmmlab/datasets/detection/', 40 | # 'data/': 's3://openmmlab/datasets/detection/' 41 | # })) 42 | 43 | backend_args = None 44 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/_base_/det_p5_tta.py: -------------------------------------------------------------------------------- 1 | # TODO: Need to solve the problem of multiple backend_args parameters 2 | # _backend_args = dict( 3 | # backend='petrel', 4 | # path_mapping=dict({ 5 | # './data/': 's3://openmmlab/datasets/detection/', 6 | # 'data/': 's3://openmmlab/datasets/detection/' 7 | # })) 8 | 9 | _backend_args = None 10 | 11 | tta_model = dict( 12 | type='mmdet.DetTTAModel', 13 | tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=300)) 14 | 15 | img_scales = [(640, 640), (320, 320), (960, 960)] 16 | 17 | # LoadImageFromFile 18 | # / | \ 19 | # (RatioResize,LetterResize) (RatioResize,LetterResize) (RatioResize,LetterResize) # noqa 20 | # / \ / \ / \ 21 | # RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip # noqa 22 | # | | | | | | 
23 | # LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn 24 | # | | | | | | 25 | # PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa 26 | 27 | _multiscale_resize_transforms = [ 28 | dict( 29 | type='Compose', 30 | transforms=[ 31 | dict(type='YOLOv5KeepRatioResize', scale=s), 32 | dict( 33 | type='LetterResize', 34 | scale=s, 35 | allow_scale_up=False, 36 | pad_val=dict(img=114)) 37 | ]) for s in img_scales 38 | ] 39 | 40 | tta_pipeline = [ 41 | dict(type='LoadImageFromFile', backend_args=_backend_args), 42 | dict( 43 | type='TestTimeAug', 44 | transforms=[ 45 | _multiscale_resize_transforms, 46 | [ 47 | dict(type='mmdet.RandomFlip', prob=1.), 48 | dict(type='mmdet.RandomFlip', prob=0.) 49 | ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], 50 | [ 51 | dict( 52 | type='mmdet.PackDetInputs', 53 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 54 | 'scale_factor', 'pad_param', 'flip', 55 | 'flip_direction')) 56 | ] 57 | ]) 58 | ] 59 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/base_dynamic.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict( 3 | dynamic_axes={ 4 | 'input': { 5 | 0: 'batch', 6 | 2: 'height', 7 | 3: 'width' 8 | }, 9 | 'dets': { 10 | 0: 'batch', 11 | 1: 'num_dets' 12 | }, 13 | 'labels': { 14 | 0: 'batch', 15 | 1: 'num_dets' 16 | } 17 | }) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/base_static.py: -------------------------------------------------------------------------------- 1 | onnx_config = dict( 2 | type='onnx', 3 | export_params=True, 4 | keep_initializers_as_inputs=False, 5 | opset_version=11, 6 | save_file='end2end.onnx', 7 | input_names=['input'], 8 | output_names=['dets', 'labels'], 9 | input_shape=None, 10 | optimize=True) 11 | codebase_config = dict( 12 | type='mmyolo', 13 | task='ObjectDetection', 14 | model_type='end2end', 15 | post_processing=dict( 16 | score_threshold=0.05, 17 | confidence_threshold=0.005, 18 | iou_threshold=0.5, 19 | max_output_boxes_per_class=200, 20 | pre_top_k=5000, 21 | keep_top_k=100, 22 | background_label_id=-1), 23 | module=['mmyolo.deploy']) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_onnxruntime_dynamic.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | codebase_config = dict( 3 | type='mmyolo', 4 | task='ObjectDetection', 5 | model_type='end2end', 6 | post_processing=dict( 7 | score_threshold=0.05, 8 | confidence_threshold=0.005, 9 | iou_threshold=0.5, 10 | max_output_boxes_per_class=200, 11 | pre_top_k=5000, 12 | keep_top_k=100, 13 | background_label_id=-1), 14 | module=['mmyolo.deploy']) 15 | backend_config = dict(type='onnxruntime') 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_onnxruntime_static.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | codebase_config = dict( 3 | type='mmyolo', 4 | task='ObjectDetection', 5 | model_type='end2end', 6 | post_processing=dict( 7 | score_threshold=0.05, 8 | confidence_threshold=0.005, 9 | iou_threshold=0.5, 10 | max_output_boxes_per_class=200, 11 | pre_top_k=5000, 12 | keep_top_k=100, 13 | 
background_label_id=-1), 14 | module=['mmyolo.deploy']) 15 | backend_config = dict(type='onnxruntime') 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_rknn-fp16_static-320x320.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict( 3 | input_shape=[320, 320], output_names=['feat0', 'feat1', 'feat2']) 4 | codebase_config = dict(model_type='rknn') 5 | backend_config = dict( 6 | type='rknn', 7 | common_config=dict(target_platform='rv1126', optimization_level=1), 8 | quantization_config=dict(do_quantization=False, dataset=None), 9 | input_size_list=[[3, 320, 320]]) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_rknn-int8_static-320x320.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict( 3 | input_shape=[320, 320], output_names=['feat0', 'feat1', 'feat2']) 4 | codebase_config = dict(model_type='rknn') 5 | backend_config = dict( 6 | type='rknn', 7 | common_config=dict(target_platform='rv1126', optimization_level=1), 8 | quantization_config=dict(do_quantization=True, dataset=None), 9 | input_size_list=[[3, 320, 320]]) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | backend_config = dict( 3 | type='tensorrt', 4 | common_config=dict(fp16_mode=True, max_workspace_size=1 << 30), 5 | model_inputs=[ 6 | dict( 7 | input_shapes=dict( 8 | input=dict( 9 | min_shape=[1, 3, 192, 192], 10 | opt_shape=[1, 3, 640, 640], 11 | max_shape=[1, 3, 960, 960]))) 12 | ]) 13 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | backend_config = dict( 3 | type='tensorrt', 4 | common_config=dict(fp16_mode=True, max_workspace_size=1 << 32), 5 | model_inputs=[ 6 | dict( 7 | input_shapes=dict( 8 | input=dict( 9 | min_shape=[1, 3, 64, 64], 10 | opt_shape=[1, 3, 640, 640], 11 | max_shape=[1, 3, 1344, 1344]))) 12 | ]) 13 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_static-640x640.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict(input_shape=(640, 640)) 3 | backend_config = dict( 4 | type='tensorrt', 5 | common_config=dict(fp16_mode=True, max_workspace_size=1 << 30), 6 | model_inputs=[ 7 | dict( 8 | input_shapes=dict( 9 | input=dict( 10 | min_shape=[1, 3, 640, 640], 11 | opt_shape=[1, 3, 640, 640], 12 | max_shape=[1, 3, 640, 640]))) 13 | ]) 14 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 15 | 
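For readers mapping these config fields onto code: a hedged sketch of how `fp16_mode`, `max_workspace_size`, and the `min/opt/max` input shapes above translate to the TensorRT Python builder API (assumes TensorRT 8.x and an ONNX file whose input is named `input`, as in the deploy configs):

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
with open("end2end.onnx", "rb") as f:
    assert parser.parse(f.read())

config = builder.create_builder_config()
config.max_workspace_size = 1 << 30    # common_config.max_workspace_size
config.set_flag(trt.BuilderFlag.FP16)  # common_config.fp16_mode=True

# Dynamic-shape profile mirroring min/opt/max from the dynamic configs.
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 192, 192), (1, 3, 640, 640), (1, 3, 960, 960))
config.add_optimization_profile(profile)

engine = builder.build_engine(network, config)
with open("end2end.engine", "wb") as f:
    f.write(engine.serialize())
```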
-------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | backend_config = dict( 3 | type='tensorrt', 4 | common_config=dict( 5 | fp16_mode=True, max_workspace_size=1 << 30, int8_mode=True), 6 | model_inputs=[ 7 | dict( 8 | input_shapes=dict( 9 | input=dict( 10 | min_shape=[1, 3, 192, 192], 11 | opt_shape=[1, 3, 640, 640], 12 | max_shape=[1, 3, 960, 960]))) 13 | ]) 14 | calib_config = dict(create_calib=True, calib_file='calib_data.h5') 15 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-int8_static-640x640.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict(input_shape=(640, 640)) 3 | backend_config = dict( 4 | type='tensorrt', 5 | common_config=dict( 6 | fp16_mode=True, max_workspace_size=1 << 30, int8_mode=True), 7 | model_inputs=[ 8 | dict( 9 | input_shapes=dict( 10 | input=dict( 11 | min_shape=[1, 3, 640, 640], 12 | opt_shape=[1, 3, 640, 640], 13 | max_shape=[1, 3, 640, 640]))) 14 | ]) 15 | calib_config = dict(create_calib=True, calib_file='calib_data.h5') 16 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 17 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | backend_config = dict( 3 | type='tensorrt', 4 | common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), 5 | model_inputs=[ 6 | dict( 7 | input_shapes=dict( 8 | input=dict( 9 | min_shape=[1, 3, 192, 192], 10 | opt_shape=[1, 3, 640, 640], 11 | max_shape=[1, 3, 960, 960]))) 12 | ]) 13 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt_static-640x640.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict(input_shape=(640, 640)) 3 | backend_config = dict( 4 | type='tensorrt', 5 | common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), 6 | model_inputs=[ 7 | dict( 8 | input_shapes=dict( 9 | input=dict( 10 | min_shape=[1, 3, 640, 640], 11 | opt_shape=[1, 3, 640, 640], 12 | max_shape=[1, 3, 640, 640]))) 13 | ]) 14 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/model/yolov5_s-static.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' 2 | 3 | test_pipeline = [ 4 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 5 | dict( 6 | type='LetterResize', 7 | scale=_base_.img_scale, 8 | allow_scale_up=False, 9 | use_mini_pad=False, 10 | ), 11 | 
dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), 12 | dict( 13 | type='mmdet.PackDetInputs', 14 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 15 | 'scale_factor', 'pad_param')) 16 | ] 17 | 18 | test_dataloader = dict( 19 | dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) 20 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/model/yolov6_s-static.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | test_pipeline = [ 4 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 5 | dict( 6 | type='LetterResize', 7 | scale=_base_.img_scale, 8 | allow_scale_up=False, 9 | use_mini_pad=False, 10 | ), 11 | dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), 12 | dict( 13 | type='mmdet.PackDetInputs', 14 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 15 | 'scale_factor', 'pad_param')) 16 | ] 17 | 18 | test_dataloader = dict( 19 | dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) 20 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_l_fast_8xb20-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE. 4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_l_imagenet1k_pretrained-c0010e6c.pth' # noqa 6 | 7 | deepen_factor = 1.0 8 | widen_factor = 1.0 9 | 10 | train_batch_size_per_gpu = 20 11 | 12 | model = dict( 13 | backbone=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | init_cfg=dict(checkpoint=checkpoint)), 17 | neck=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | ), 21 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 22 | 23 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_m_fast_8xb28-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE.
4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_m_imagenet1k_pretrained-09f1eba2.pth' # noqa 6 | 7 | deepen_factor = 0.67 8 | widen_factor = 0.75 9 | 10 | train_batch_size_per_gpu = 28 11 | 12 | model = dict( 13 | backbone=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | init_cfg=dict(checkpoint=checkpoint)), 17 | neck=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | ), 21 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 22 | 23 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE. 4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_l_obj365_pretrained-3dd89562.pth' # noqa 6 | 7 | deepen_factor = 1.0 8 | widen_factor = 1.0 9 | 10 | model = dict( 11 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | neck=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 17 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE.
4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_m_ojb365_pretrained-03206892.pth' # noqa 6 | 7 | deepen_factor = 0.67 8 | widen_factor = 0.75 9 | 10 | model = dict( 11 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | neck=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 17 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | # Compared to other same-scale models, this configuration consumes too much 2 | # GPU memory and is not validated for now 3 | _base_ = 'ppyoloe_plus_s_fast_8xb8-80e_coco.py' 4 | 5 | data_root = './data/cat/' 6 | class_name = ('cat', ) 7 | num_classes = len(class_name) 8 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 9 | 10 | num_last_epochs = 5 11 | 12 | max_epochs = 40 13 | train_batch_size_per_gpu = 12 14 | train_num_workers = 2 15 | 16 | load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth' # noqa 17 | 18 | model = dict( 19 | backbone=dict(frozen_stages=4), 20 | bbox_head=dict(head_module=dict(num_classes=num_classes)), 21 | train_cfg=dict( 22 | initial_assigner=dict(num_classes=num_classes), 23 | assigner=dict(num_classes=num_classes))) 24 | 25 | train_dataloader = dict( 26 | batch_size=train_batch_size_per_gpu, 27 | num_workers=train_num_workers, 28 | dataset=dict( 29 | data_root=data_root, 30 | metainfo=metainfo, 31 | ann_file='annotations/trainval.json', 32 | data_prefix=dict(img='images/'))) 33 | 34 | val_dataloader = dict( 35 | dataset=dict( 36 | metainfo=metainfo, 37 | data_root=data_root, 38 | ann_file='annotations/test.json', 39 | data_prefix=dict(img='images/'))) 40 | 41 | test_dataloader = val_dataloader 42 | 43 | default_hooks = dict( 44 | param_scheduler=dict( 45 | warmup_min_iter=10, 46 | warmup_epochs=3, 47 | total_epochs=int(max_epochs * 1.2))) 48 | 49 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 50 | test_evaluator = val_evaluator 51 | 52 | default_hooks = dict( 53 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 54 | logger=dict(type='LoggerHook', interval=5)) 55 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 56 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE.
4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_x_obj365_pretrained-43a8000d.pth' # noqa 6 | 7 | deepen_factor = 1.33 8 | widen_factor = 1.25 9 | 10 | model = dict( 11 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | neck=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 17 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE. 4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_s_imagenet1k_pretrained-2be81763.pth' # noqa 6 | 7 | train_batch_size_per_gpu = 32 8 | max_epochs = 300 9 | 10 | # Base learning rate for optim_wrapper 11 | base_lr = 0.01 12 | 13 | model = dict( 14 | data_preprocessor=dict( 15 | mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], 16 | std=[0.229 * 255., 0.224 * 255., 0.225 * 255.]), 17 | backbone=dict( 18 | block_cfg=dict(use_alpha=False), 19 | init_cfg=dict( 20 | type='Pretrained', 21 | prefix='backbone.', 22 | checkpoint=checkpoint, 23 | map_location='cpu')), 24 | train_cfg=dict(initial_epoch=100)) 25 | 26 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 27 | 28 | optim_wrapper = dict(optimizer=dict(lr=base_lr)) 29 | 30 | default_hooks = dict(param_scheduler=dict(total_epochs=int(max_epochs * 1.2))) 31 | 32 | train_cfg = dict(max_epochs=max_epochs) 33 | 34 | # PPYOLOE plus uses the obj365 pretrained model, but PPYOLOE does not, 35 | # so `load_from` needs to be set to None. 36 | load_from = None 37 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-400e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' 2 | 3 | max_epochs = 400 4 | 5 | model = dict(train_cfg=dict(initial_epoch=133)) 6 | 7 | default_hooks = dict(param_scheduler=dict(total_epochs=int(max_epochs * 1.2))) 8 | 9 | train_cfg = dict(max_epochs=max_epochs) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_x_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE.
4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_x_imagenet1k_pretrained-81c33ccb.pth' # noqa 6 | 7 | deepen_factor = 1.33 8 | widen_factor = 1.25 9 | 10 | train_batch_size_per_gpu = 16 11 | 12 | model = dict( 13 | backbone=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | init_cfg=dict(checkpoint=checkpoint)), 17 | neck=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | ), 21 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 22 | 23 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmrazor::_base_/nas_backbones/spos_shufflenet_supernet.py', 3 | '../../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 4 | ] 5 | 6 | checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/spos/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_v3.pth' # noqa 7 | fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/spos/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_subnet_cfg_v3.yaml' # noqa 8 | widen_factor = 1.0 9 | channels = [160, 320, 640] 10 | 11 | _base_.nas_backbone.out_indices = (1, 2, 3) 12 | _base_.nas_backbone.init_cfg = dict( 13 | type='Pretrained', 14 | checkpoint=checkpoint_file, 15 | prefix='architecture.backbone.') 16 | nas_backbone = dict( 17 | type='mmrazor.sub_model', 18 | fix_subnet=fix_subnet, 19 | cfg=_base_.nas_backbone, 20 | extra_prefix='architecture.backbone.') 21 | 22 | _base_.model.backbone = nas_backbone 23 | _base_.model.neck.widen_factor = widen_factor 24 | _base_.model.neck.in_channels = channels 25 | _base_.model.neck.out_channels = channels 26 | _base_.model.bbox_head.head_module.in_channels = channels 27 | _base_.model.bbox_head.head_module.widen_factor = widen_factor 28 | 29 | find_unused_parameters = True 30 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmrazor::_base_/nas_backbones/attentive_mobilenetv3_supernet.py', 3 | '../../yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py' 4 | ] 5 | 6 | checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/bignas/attentive_mobilenet_subnet_8xb256_in1k_flops-0.93G_acc-80.81_20221229_200440-73d92cc6.pth' # noqa 7 | fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/bignas/ATTENTIVE_SUBNET_A6.yaml' # noqa 8 | deepen_factor = 1.2 9 | widen_factor = 1 10 | channels = [40, 128, 224] 11 | mid_channels = [40, 128, 224] 12 | 13 | _base_.train_dataloader.batch_size = 16 14 | _base_.nas_backbone.out_indices = (2, 4, 6) 15 | _base_.nas_backbone.conv_cfg = dict(type='mmrazor.BigNasConv2d') 16 | _base_.nas_backbone.norm_cfg = dict(type='mmrazor.DynamicBatchNorm2d') 17 | _base_.nas_backbone.init_cfg = dict( 18 | type='Pretrained', 19 | checkpoint=checkpoint_file, 20 | prefix='architecture.backbone.') 21 | nas_backbone = dict( 22 | type='mmrazor.sub_model', 23 | fix_subnet=fix_subnet, 24 | cfg=_base_.nas_backbone, 25 | 
extra_prefix='backbone.') 26 | 27 | _base_.model.backbone = nas_backbone 28 | _base_.model.neck.widen_factor = widen_factor 29 | _base_.model.neck.deepen_factor = deepen_factor 30 | _base_.model.neck.in_channels = channels 31 | _base_.model.neck.out_channels = mid_channels 32 | _base_.model.bbox_head.head_module.in_channels = mid_channels 33 | _base_.model.bbox_head.head_module.widen_factor = widen_factor 34 | 35 | find_unused_parameters = True 36 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/README.md: -------------------------------------------------------------------------------- 1 | # CSPNeXt ImageNet Pre-training 2 | 3 | In this folder, we provide the ImageNet pre-training configs of RTMDet's backbone CSPNeXt. 4 | 5 | ## Requirements 6 | 7 | To train with these configs, please install [MMClassification 1.x](https://github.com/open-mmlab/mmclassification/tree/1.x) first. 8 | 9 | Install by MIM: 10 | 11 | ```shell 12 | mim install "mmcls>=1.0.0rc0" 13 | ``` 14 | 15 | or install by pip: 16 | 17 | ```shell 18 | pip install "mmcls>=1.0.0rc0" 19 | ``` 20 | 21 | ## Prepare Dataset 22 | 23 | To pre-train on ImageNet, you need to prepare the dataset first. Please refer to the [guide](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#imagenet). 24 | 25 | ## How to Train 26 | 27 | You can use the classification config in the same way as the detection config. 28 | 29 | For single-GPU training, run: 30 | 31 | ```shell 32 | python tools/train.py \ 33 | ${CONFIG_FILE} \ 34 | [optional arguments] 35 | ``` 36 | 37 | For multi-GPU training, run: 38 | 39 | ```shell 40 | bash ./tools/dist_train.sh \ 41 | ${CONFIG_FILE} \ 42 | ${GPU_NUM} \ 43 | [optional arguments] 44 | ``` 45 | 46 | More details can be found in [user guides](https://mmdetection.readthedocs.io/en/3.x/user_guides/train.html).
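A hedged sketch of how a pre-trained CSPNeXt checkpoint is then consumed by a detection config (the URL is taken from the results table below; the `init_cfg`/`prefix` pattern follows the other configs in this repo, and the snippet itself is hypothetical):

```python
# Hypothetical detection-config snippet: initialize the backbone from the
# ImageNet-pretrained CSPNeXt-s checkpoint listed in the results table.
checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth'  # noqa

model = dict(
    backbone=dict(
        init_cfg=dict(
            type='Pretrained',
            prefix='backbone.',  # keep only the backbone weights from the classifier
            checkpoint=checkpoint,
            map_location='cpu')))
```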
47 | 48 | ## Results and Models 49 | 50 | | Model | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | 51 | | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------------------------------------------------------: | 52 | | CSPNeXt-tiny | 224x224 | 2.73 | 0.339 | 69.44 | 89.45 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth) | 53 | | CSPNeXt-s | 224x224 | 4.89 | 0.664 | 74.41 | 92.23 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth) | 54 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmcls::_base_/datasets/imagenet_bs256_rsb_a12.py', 3 | 'mmcls::_base_/schedules/imagenet_bs2048_rsb.py', 4 | 'mmcls::_base_/default_runtime.py' 5 | ] 6 | 7 | custom_imports = dict( 8 | imports=['mmdet.models', 'mmyolo.models'], allow_failed_imports=False) 9 | 10 | model = dict( 11 | type='ImageClassifier', 12 | backbone=dict( 13 | type='mmyolo.CSPNeXt', 14 | arch='P5', 15 | out_indices=(4, ), 16 | expand_ratio=0.5, 17 | deepen_factor=0.33, 18 | widen_factor=0.5, 19 | channel_attention=True, 20 | norm_cfg=dict(type='BN'), 21 | act_cfg=dict(type='mmyolo.SiLU')), 22 | neck=dict(type='GlobalAveragePooling'), 23 | head=dict( 24 | type='LinearClsHead', 25 | num_classes=1000, 26 | in_channels=512, 27 | loss=dict( 28 | type='LabelSmoothLoss', 29 | label_smooth_val=0.1, 30 | mode='original', 31 | loss_weight=1.0), 32 | topk=(1, 5)), 33 | train_cfg=dict(augments=[ 34 | dict(type='Mixup', alpha=0.2, num_classes=1000), 35 | dict(type='CutMix', alpha=1.0, num_classes=1000) 36 | ])) 37 | 38 | # dataset settings 39 | train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True)) 40 | 41 | # schedule settings 42 | optim_wrapper = dict( 43 | optimizer=dict(weight_decay=0.01), 44 | paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.), 45 | ) 46 | 47 | param_scheduler = [ 48 | # warm up learning rate scheduler 49 | dict( 50 | type='LinearLR', 51 | start_factor=0.0001, 52 | by_epoch=True, 53 | begin=0, 54 | end=5, 55 | # update by iter 56 | convert_to_iter_based=True), 57 | # main learning rate scheduler 58 | dict( 59 | type='CosineAnnealingLR', 60 | T_max=595, 61 | eta_min=1.0e-6, 62 | by_epoch=True, 63 | begin=5, 64 | end=600) 65 | ] 66 | 67 | train_cfg = dict(by_epoch=True, max_epochs=600) 68 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py: -------------------------------------------------------------------------------- 1 | _base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py' 2 | 3 | model = dict( 4 | backbone=dict(deepen_factor=0.167, widen_factor=0.375), 5 | head=dict(in_channels=384)) 6 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' 2 | 3 | # ========================modified parameters====================== 4 | data_root = 'data/split_ms_dota/' 5 | # 
Path of test images folder 6 | test_data_prefix = 'test/images/' 7 | # Submission dir for result submit 8 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 9 | 10 | # =======================Unmodified in most cases================== 11 | train_dataloader = dict(dataset=dict(data_root=data_root)) 12 | 13 | val_dataloader = dict(dataset=dict(data_root=data_root)) 14 | 15 | # Inference on val dataset 16 | test_dataloader = val_dataloader 17 | 18 | # Inference on test dataset and format the output results 19 | # for submission. Note: the test set has no annotation. 20 | # test_dataloader = dict( 21 | # dataset=dict( 22 | # data_root=data_root, 23 | # ann_file='', # test set has no annotation 24 | # data_prefix=dict(img_path=test_data_prefix), 25 | # pipeline=_base_.test_pipeline)) 26 | # test_evaluator = dict( 27 | # type='mmrotate.DOTAMetric', 28 | # format_only=True, 29 | # merge_patches=True, 30 | # outfile_prefix=submission_dir) 31 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' 2 | 3 | load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa 4 | 5 | # Submission dir for result submit 6 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 7 | 8 | # Inference on test dataset and format the output results 9 | # for submission. Note: the test set has no annotation. 10 | # test_dataloader = dict( 11 | # dataset=dict( 12 | # data_root=_base_.data_root, 13 | # ann_file='', # test set has no annotation 14 | # data_prefix=dict(img_path=_base_.test_data_prefix), 15 | # pipeline=_base_.test_pipeline)) 16 | # test_evaluator = dict( 17 | # type='mmrotate.DOTAMetric', 18 | # format_only=True, 19 | # merge_patches=True, 20 | # outfile_prefix=submission_dir) 21 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.67 7 | widen_factor = 0.75 8 | 9 | # Submission dir for result submit 10 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 11 | 12 | # =======================Unmodified in most cases================== 13 | model = dict( 14 | backbone=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | init_cfg=dict(checkpoint=checkpoint)), 18 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 19 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 20 | 21 | # Inference on test dataset and format the output results 22 | # for submission. Note: the test set has no annotation. 
23 | # test_dataloader = dict( 24 | # dataset=dict( 25 | # data_root=_base_.data_root, 26 | # ann_file='', # test set has no annotation 27 | # data_prefix=dict(img_path=_base_.test_data_prefix), 28 | # pipeline=_base_.test_pipeline)) 29 | # test_evaluator = dict( 30 | # type='mmrotate.DOTAMetric', 31 | # format_only=True, 32 | # merge_patches=True, 33 | # outfile_prefix=submission_dir) 34 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.67 7 | widen_factor = 0.75 8 | 9 | # Submission dir for result submit 10 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 11 | 12 | # =======================Unmodified in most cases================== 13 | model = dict( 14 | backbone=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | init_cfg=dict(checkpoint=checkpoint)), 18 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 19 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 20 | 21 | # Inference on test dataset and format the output results 22 | # for submission. Note: the test set has no annotation. 23 | # test_dataloader = dict( 24 | # dataset=dict( 25 | # data_root=_base_.data_root, 26 | # ann_file='', # test set has no annotation 27 | # data_prefix=dict(img_path=_base_.test_data_prefix), 28 | # pipeline=_base_.test_pipeline)) 29 | # test_evaluator = dict( 30 | # type='mmrotate.DOTAMetric', 31 | # format_only=True, 32 | # merge_patches=True, 33 | # outfile_prefix=submission_dir) 34 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.33 7 | widen_factor = 0.5 8 | 9 | # Batch size of a single GPU during training 10 | train_batch_size_per_gpu = 8 11 | 12 | # Submission dir for result submit 13 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 14 | 15 | # =======================Unmodified in most cases================== 16 | model = dict( 17 | backbone=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | init_cfg=dict(checkpoint=checkpoint)), 21 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 22 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 23 | 24 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 25 | 26 | # Inference on test dataset and format the output results 27 | # for submission. Note: the test set has no annotation. 
28 | # test_dataloader = dict( 29 | # dataset=dict( 30 | # data_root=_base_.data_root, 31 | # ann_file='', # test set has no annotation 32 | # data_prefix=dict(img_path=_base_.test_data_prefix), 33 | # pipeline=_base_.test_pipeline)) 34 | # test_evaluator = dict( 35 | # type='mmrotate.DOTAMetric', 36 | # format_only=True, 37 | # merge_patches=True, 38 | # outfile_prefix=submission_dir) 39 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.33 7 | widen_factor = 0.5 8 | 9 | # Batch size of a single GPU during training 10 | train_batch_size_per_gpu = 8 11 | 12 | # Submission dir for result submit 13 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 14 | 15 | # =======================Unmodified in most cases================== 16 | model = dict( 17 | backbone=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | init_cfg=dict(checkpoint=checkpoint)), 21 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 22 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 23 | 24 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 25 | 26 | # Inference on test dataset and format the output results 27 | # for submission. Note: the test set has no annotation. 28 | # test_dataloader = dict( 29 | # dataset=dict( 30 | # data_root=_base_.data_root, 31 | # ann_file='', # test set has no annotation 32 | # data_prefix=dict(img_path=_base_.test_data_prefix), 33 | # pipeline=_base_.test_pipeline)) 34 | # test_evaluator = dict( 35 | # type='mmrotate.DOTAMetric', 36 | # format_only=True, 37 | # merge_patches=True, 38 | # outfile_prefix=submission_dir) 39 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.167 7 | widen_factor = 0.375 8 | 9 | # Batch size of a single GPU during training 10 | train_batch_size_per_gpu = 8 11 | 12 | # Submission dir for result submit 13 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 14 | 15 | # =======================Unmodified in most cases================== 16 | model = dict( 17 | backbone=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | init_cfg=dict(checkpoint=checkpoint)), 21 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 22 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 23 | 24 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 25 | 26 | # Inference on test dataset and format the output results 27 | # for submission. Note: the test set has no annotation. 
28 | # test_dataloader = dict( 29 | # dataset=dict( 30 | # data_root=_base_.data_root, 31 | # ann_file='', # test set has no annotation 32 | # data_prefix=dict(img_path=_base_.test_data_prefix), 33 | # pipeline=_base_.test_pipeline)) 34 | # test_evaluator = dict( 35 | # type='mmrotate.DOTAMetric', 36 | # format_only=True, 37 | # merge_patches=True, 38 | # outfile_prefix=submission_dir) 39 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.167 7 | widen_factor = 0.375 8 | 9 | # Batch size of a single GPU during training 10 | train_batch_size_per_gpu = 8 11 | 12 | # Submission dir for result submit 13 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 14 | 15 | # =======================Unmodified in most cases================== 16 | model = dict( 17 | backbone=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | init_cfg=dict(checkpoint=checkpoint)), 21 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 22 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 23 | 24 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 25 | 26 | # Inference on test dataset and format the output results 27 | # for submission. Note: the test set has no annotation. 28 | # test_dataloader = dict( 29 | # dataset=dict( 30 | # data_root=_base_.data_root, 31 | # ann_file='', # test set has no annotation 32 | # data_prefix=dict(img_path=_base_.test_data_prefix), 33 | # pipeline=_base_.test_pipeline)) 34 | # test_evaluator = dict( 35 | # type='mmrotate.DOTAMetric', 36 | # format_only=True, 37 | # merge_patches=True, 38 | # outfile_prefix=submission_dir) 39 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | widen_factor = 0.5 4 | 5 | model = dict( 6 | bbox_head=dict( 7 | type='RTMDetInsSepBNHead', 8 | head_module=dict( 9 | type='RTMDetInsSepBNHeadModule', 10 | use_sigmoid_cls=True, 11 | widen_factor=widen_factor), 12 | loss_mask=dict( 13 | type='mmdet.DiceLoss', loss_weight=2.0, eps=5e-6, 14 | reduction='mean')), 15 | test_cfg=dict( 16 | multi_label=True, 17 | nms_pre=1000, 18 | min_bbox_size=0, 19 | score_thr=0.05, 20 | nms=dict(type='nms', iou_threshold=0.6), 21 | max_per_img=100, 22 | mask_thr_binary=0.5)) 23 | 24 | _base_.test_pipeline[-2] = dict( 25 | type='LoadAnnotations', with_bbox=True, with_mask=True, _scope_='mmdet') 26 | 27 | val_dataloader = dict(dataset=dict(pipeline=_base_.test_pipeline)) 28 | test_dataloader = val_dataloader 29 | 30 | val_evaluator = dict(metric=['bbox', 'segm']) 31 | test_evaluator = val_evaluator 32 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = 
'./rtmdet_l_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.67 5 | widen_factor = 0.75 6 | 7 | # =======================Unmodified in most cases================== 8 | model = dict( 9 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 10 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 12 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet_tiny_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | num_epochs_stage2 = 5 9 | 10 | max_epochs = 40 11 | train_batch_size_per_gpu = 12 12 | train_num_workers = 4 13 | val_batch_size_per_gpu = 1 14 | val_num_workers = 2 15 | 16 | load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth' # noqa 17 | 18 | model = dict( 19 | backbone=dict(frozen_stages=4), 20 | bbox_head=dict(head_module=dict(num_classes=num_classes)), 21 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 22 | 23 | train_dataloader = dict( 24 | batch_size=train_batch_size_per_gpu, 25 | num_workers=train_num_workers, 26 | dataset=dict( 27 | data_root=data_root, 28 | metainfo=metainfo, 29 | ann_file='annotations/trainval.json', 30 | data_prefix=dict(img='images/'))) 31 | 32 | val_dataloader = dict( 33 | batch_size=val_batch_size_per_gpu, 34 | num_workers=val_num_workers, 35 | dataset=dict( 36 | metainfo=metainfo, 37 | data_root=data_root, 38 | ann_file='annotations/test.json', 39 | data_prefix=dict(img='images/'))) 40 | 41 | test_dataloader = val_dataloader 42 | 43 | param_scheduler = [ 44 | dict( 45 | type='LinearLR', 46 | start_factor=_base_.lr_start_factor, 47 | by_epoch=False, 48 | begin=0, 49 | end=30), 50 | dict( 51 | # use cosine lr from epoch 20 to 40 (max_epochs // 2 to max_epochs) 52 | type='CosineAnnealingLR', 53 | eta_min=_base_.base_lr * 0.05, 54 | begin=max_epochs // 2, 55 | end=max_epochs, 56 | T_max=max_epochs // 2, 57 | by_epoch=True, 58 | convert_to_iter_based=True), 59 | ] 60 | 61 | _base_.custom_hooks[1].switch_epoch = max_epochs - num_epochs_stage2 62 | 63 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 64 | test_evaluator = val_evaluator 65 | 66 | default_hooks = dict( 67 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 68 | logger=dict(type='LoggerHook', interval=5)) 69 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 70 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 71 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet_s_syncbn_fast_8xb32-300e_coco.py' 2 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa 3 | 4 | # ========================modified parameters====================== 5 | deepen_factor = 0.167 6 | widen_factor = 0.375 7 | img_scale = 
_base_.img_scale 8 | 9 | # ratio range for random resize 10 | random_resize_ratio_range = (0.5, 2.0) 11 | # Number of cached images in mosaic 12 | mosaic_max_cached_images = 20 13 | # Number of cached images in mixup 14 | mixup_max_cached_images = 10 15 | 16 | # =======================Unmodified in most cases================== 17 | model = dict( 18 | backbone=dict( 19 | deepen_factor=deepen_factor, 20 | widen_factor=widen_factor, 21 | init_cfg=dict(checkpoint=checkpoint)), 22 | neck=dict( 23 | deepen_factor=deepen_factor, 24 | widen_factor=widen_factor, 25 | ), 26 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 27 | 28 | train_pipeline = [ 29 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 30 | dict(type='LoadAnnotations', with_bbox=True), 31 | dict( 32 | type='Mosaic', 33 | img_scale=img_scale, 34 | use_cached=True, 35 | max_cached_images=mosaic_max_cached_images, # note 36 | random_pop=False, # note 37 | pad_val=114.0), 38 | dict( 39 | type='mmdet.RandomResize', 40 | # img_scale is (width, height) 41 | scale=(img_scale[0] * 2, img_scale[1] * 2), 42 | ratio_range=random_resize_ratio_range, 43 | resize_type='mmdet.Resize', 44 | keep_ratio=True), 45 | dict(type='mmdet.RandomCrop', crop_size=img_scale), 46 | dict(type='mmdet.YOLOXHSVRandomAug'), 47 | dict(type='mmdet.RandomFlip', prob=0.5), 48 | dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), 49 | dict( 50 | type='YOLOv5MixUp', 51 | use_cached=True, 52 | random_pop=False, 53 | max_cached_images=mixup_max_cached_images, 54 | prob=0.5), 55 | dict(type='mmdet.PackDetInputs') 56 | ] 57 | 58 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 59 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet_l_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.33 5 | widen_factor = 1.25 6 | 7 | # =======================Unmodified in most cases================== 8 | model = dict( 9 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 10 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 12 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_fast_8xb16-300e_crowdhuman.py' 2 | 3 | model = dict( 4 | data_preprocessor=dict( 5 | _delete_=True, 6 | type='mmdet.DetDataPreprocessor', 7 | mean=[0., 0., 0.], 8 | std=[255., 255., 255.], 9 | bgr_to_rgb=True), 10 | bbox_head=dict(ignore_iof_thr=0.5)) 11 | 12 | img_scale = _base_.img_scale 13 | 14 | albu_train_transforms = [ 15 | dict(type='Blur', p=0.01), 16 | dict(type='MedianBlur', p=0.01), 17 | dict(type='ToGray', p=0.01), 18 | dict(type='CLAHE', p=0.01) 19 | ] 20 | 21 | pre_transform = [ 22 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 23 | # only change this 24 | dict(type='mmdet.LoadAnnotations', with_bbox=True) 25 | ] 26 | 27 | train_pipeline = [ 28 | *pre_transform, 29 | dict( 30 | type='Mosaic', 31 | img_scale=img_scale, 32 | pad_val=114.0, 33 | pre_transform=pre_transform), 34 | dict( 35 | 
type='YOLOv5RandomAffine', 36 | max_rotate_degree=0.0, 37 | max_shear_degree=0.0, 38 | scaling_ratio_range=(0.5, 1.5), 39 | # img_scale is (width, height) 40 | border=(-img_scale[0] // 2, -img_scale[1] // 2), 41 | border_val=(114, 114, 114)), 42 | dict( 43 | type='mmdet.Albu', 44 | transforms=albu_train_transforms, 45 | bbox_params=dict( 46 | type='BboxParams', 47 | format='pascal_voc', 48 | label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), 49 | keymap={ 50 | 'img': 'image', 51 | 'gt_bboxes': 'bboxes' 52 | }), 53 | dict(type='YOLOv5HSVRandomAug'), 54 | dict(type='mmdet.RandomFlip', prob=0.5), 55 | dict( 56 | type='mmdet.PackDetInputs', 57 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 58 | 'flip_direction')) 59 | ] 60 | 61 | train_dataloader = dict( 62 | collate_fn=dict(type='pseudo_collate'), 63 | dataset=dict(pipeline=train_pipeline)) 64 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py: -------------------------------------------------------------------------------- 1 | _base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # Use the model trained on COCO as the pretrained model 4 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa 5 | 6 | # dataset settings 7 | data_root = 'data/CrowdHuman/' 8 | dataset_type = 'YOLOv5CrowdHumanDataset' 9 | 10 | # parameters that often need to be modified 11 | num_classes = 1 12 | 13 | anchors = [ 14 | [(6, 14), (12, 28), (19, 48)], # P3/8 15 | [(29, 79), (46, 124), (142, 54)], # P4/16 16 | [(73, 198), (124, 330), (255, 504)] # P5/32 17 | ] 18 | 19 | model = dict( 20 | bbox_head=dict( 21 | head_module=dict(num_classes=num_classes), 22 | prior_generator=dict(base_sizes=anchors))) 23 | 24 | train_dataloader = dict( 25 | dataset=dict( 26 | type=dataset_type, 27 | data_root=data_root, 28 | ann_file='annotation_train.odgt', 29 | data_prefix=dict(img='Images/'))) 30 | 31 | val_dataloader = dict( 32 | dataset=dict( 33 | type=dataset_type, 34 | data_root=data_root, 35 | ann_file='annotation_val.odgt', 36 | data_prefix=dict(img='Images/'), 37 | # CrowdHumanMetric does not support out-of-order output images 38 | # for the time being, so batch_shapes_cfg is disabled here.
39 | batch_shapes_cfg=None)) 40 | test_dataloader = val_dataloader 41 | 42 | val_evaluator = dict( 43 | _delete_=True, 44 | type='mmdet.CrowdHumanMetric', 45 | ann_file=data_root + 'annotation_val.odgt', 46 | metric=['AP', 'MR', 'JI']) 47 | test_evaluator = val_evaluator 48 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa 2 | 3 | data_root = 'data/balloon/' 4 | # Path of train annotation file 5 | train_ann_file = 'train.json' 6 | train_data_prefix = 'train/' # Prefix of train image path 7 | # Path of val annotation file 8 | val_ann_file = 'val.json' 9 | val_data_prefix = 'val/' # Prefix of val image path 10 | metainfo = { 11 | 'classes': ('balloon', ), 12 | 'palette': [ 13 | (220, 20, 60), 14 | ] 15 | } 16 | num_classes = 1 17 | 18 | train_batch_size_per_gpu = 4 19 | train_num_workers = 2 20 | log_interval = 1 21 | ##################### 22 | train_dataloader = dict( 23 | batch_size=train_batch_size_per_gpu, 24 | num_workers=train_num_workers, 25 | dataset=dict( 26 | data_root=data_root, 27 | metainfo=metainfo, 28 | data_prefix=dict(img=train_data_prefix), 29 | ann_file=train_ann_file)) 30 | val_dataloader = dict( 31 | dataset=dict( 32 | data_root=data_root, 33 | metainfo=metainfo, 34 | data_prefix=dict(img=val_data_prefix), 35 | ann_file=val_ann_file)) 36 | test_dataloader = val_dataloader 37 | val_evaluator = dict(ann_file=data_root + val_ann_file) 38 | test_evaluator = val_evaluator 39 | default_hooks = dict(logger=dict(interval=log_interval)) 40 | ##################### 41 | 42 | model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) 43 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa 2 | 3 | # ========================modified parameters====================== 4 | mask_overlap = False # Polygon2Mask 5 | 6 | # ===============================Unmodified in most cases==================== 7 | model = dict(bbox_head=dict(mask_overlap=mask_overlap)) 8 | 9 | train_pipeline = [ 10 | *_base_.pre_transform, 11 | dict( 12 | type='Mosaic', 13 | img_scale=_base_.img_scale, 14 | pad_val=114.0, 15 | pre_transform=_base_.pre_transform), 16 | dict( 17 | type='YOLOv5RandomAffine', 18 | max_rotate_degree=0.0, 19 | max_shear_degree=0.0, 20 | scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), 21 | 
border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 22 | border_val=(114, 114, 114), 23 | min_area_ratio=_base_.min_area_ratio, 24 | max_aspect_ratio=_base_.max_aspect_ratio, 25 | use_mask_refine=True), 26 | dict( 27 | type='mmdet.Albu', 28 | transforms=_base_.albu_train_transforms, 29 | bbox_params=dict( 30 | type='BboxParams', 31 | format='pascal_voc', 32 | label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), 33 | keymap={ 34 | 'img': 'image', 35 | 'gt_bboxes': 'bboxes', 36 | }), 37 | dict(type='YOLOv5HSVRandomAug'), 38 | dict(type='mmdet.RandomFlip', prob=0.5), 39 | dict( 40 | type='Polygon2Mask', 41 | downsample_ratio=_base_.downsample_ratio, 42 | mask_overlap=mask_overlap), 43 | dict( 44 | type='PackDetInputs', 45 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 46 | 'flip_direction')) 47 | ] 48 | 49 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 50 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa 2 | 3 | deepen_factor = 1.33 4 | widen_factor = 1.25 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | # ========================modified parameters====================== 7 | deepen_factor = 0.33 8 | widen_factor = 0.25 9 | 10 | # ===============================Unmodified in most cases==================== 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | neck=dict( 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | ), 20 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 21 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | # ========================modified parameters====================== 7 | use_mask2refine = True 8 | min_area_ratio = 0.01 # YOLOv5RandomAffine 9 | 10 | # ===============================Unmodified in most cases==================== 11 | pre_transform = [ 12 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 13 | dict( 14 | type='LoadAnnotations', 15 | with_bbox=True, 16 | with_mask=True, 17 | mask2bbox=use_mask2refine) 18 | ] 19 | 20 | last_transform = [ 21 | # Delete gt_masks to avoid more computation 22 | 
dict(type='RemoveDataElement', keys=['gt_masks']), 23 | dict( 24 | type='mmdet.Albu', 25 | transforms=_base_.albu_train_transforms, 26 | bbox_params=dict( 27 | type='BboxParams', 28 | format='pascal_voc', 29 | label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), 30 | keymap={ 31 | 'img': 'image', 32 | 'gt_bboxes': 'bboxes' 33 | }), 34 | dict(type='YOLOv5HSVRandomAug'), 35 | dict(type='mmdet.RandomFlip', prob=0.5), 36 | dict( 37 | type='mmdet.PackDetInputs', 38 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 39 | 'flip_direction')) 40 | ] 41 | 42 | train_pipeline = [ 43 | *pre_transform, 44 | dict( 45 | type='Mosaic', 46 | img_scale=_base_.img_scale, 47 | pad_val=114.0, 48 | pre_transform=pre_transform), 49 | dict( 50 | type='YOLOv5RandomAffine', 51 | max_rotate_degree=0.0, 52 | max_shear_degree=0.0, 53 | scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), 54 | # img_scale is (width, height) 55 | border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 56 | border_val=(114, 114, 114), 57 | min_area_ratio=min_area_ratio, 58 | use_mask_refine=use_mask2refine), 59 | *last_transform 60 | ] 61 | 62 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 63 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config use refining bbox and `YOLOv5CopyPaste`. 4 | # Refining bbox means refining bbox by mask while loading annotations and 5 | # transforming after `YOLOv5RandomAffine` 6 | 7 | # ========================modified parameters====================== 8 | deepen_factor = 1.33 9 | widen_factor = 1.25 10 | 11 | # ===============================Unmodified in most cases==================== 12 | model = dict( 13 | backbone=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | neck=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | ), 21 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' 2 | 3 | deepen_factor = 1.0 4 | widen_factor = 1.0 5 | train_batch_size_per_gpu = 32 6 | train_num_workers = 8 7 | 8 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth' # noqa 9 | 10 | model = dict( 11 | backbone=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | neck=dict( 16 | deepen_factor=deepen_factor, 17 | widen_factor=widen_factor, 18 | ), 19 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 20 | 21 | train_dataloader = dict( 22 | batch_size=train_batch_size_per_gpu, num_workers=train_num_workers) 23 | 24 | optim_wrapper = dict( 25 | optimizer=dict(batch_size_per_gpu=train_batch_size_per_gpu)) 26 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py: -------------------------------------------------------------------------------- 1 | 
_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' 2 | 3 | deepen_factor = 0.67 4 | widen_factor = 0.75 5 | 6 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth' # noqa 7 | 8 | model = dict( 9 | backbone=dict( 10 | deepen_factor=deepen_factor, 11 | widen_factor=widen_factor, 12 | ), 13 | neck=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth' # noqa 7 | 8 | model = dict( 9 | backbone=dict( 10 | deepen_factor=deepen_factor, 11 | widen_factor=widen_factor, 12 | ), 13 | neck=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/voc/yolov5_x-v61_fast_1xb32-50e_voc.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' 2 | 3 | deepen_factor = 1.33 4 | widen_factor = 1.25 5 | train_batch_size_per_gpu = 32 6 | train_num_workers = 8 7 | 8 | # TODO: need to add pretrained_model 9 | load_from = None 10 | 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | neck=dict( 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | ), 20 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 21 | 22 | train_dataloader = dict( 23 | batch_size=train_batch_size_per_gpu, num_workers=train_num_workers) 24 | 25 | optim_wrapper = dict( 26 | optimizer=dict(batch_size_per_gpu=train_batch_size_per_gpu)) 27 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | deepen_factor = 1.0 4 | widen_factor = 1.0 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | deepen_factor = 1.0 4 | widen_factor = 1.0 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | 
bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | anchors = [ 9 | [(68, 69), (154, 91), (143, 162)], # P3/8 10 | [(242, 160), (189, 287), (391, 207)], # P4/16 11 | [(353, 337), (539, 341), (443, 432)] # P5/32 12 | ] 13 | 14 | max_epochs = 40 15 | train_batch_size_per_gpu = 12 16 | train_num_workers = 4 17 | 18 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa 19 | 20 | model = dict( 21 | backbone=dict(frozen_stages=4), 22 | bbox_head=dict( 23 | head_module=dict(num_classes=num_classes), 24 | prior_generator=dict(base_sizes=anchors))) 25 | 26 | train_dataloader = dict( 27 | batch_size=train_batch_size_per_gpu, 28 | num_workers=train_num_workers, 29 | dataset=dict( 30 | data_root=data_root, 31 | metainfo=metainfo, 32 | ann_file='annotations/trainval.json', 33 | data_prefix=dict(img='images/'))) 34 | 35 | val_dataloader = dict( 36 | dataset=dict( 37 | metainfo=metainfo, 38 | data_root=data_root, 39 | ann_file='annotations/test.json', 40 | data_prefix=dict(img='images/'))) 41 | 42 | test_dataloader = val_dataloader 43 | 44 | _base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu 45 | 46 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 47 | test_evaluator = val_evaluator 48 | 49 | default_hooks = dict( 50 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 51 | # The warmup_mim_iter parameter is critical. 52 | # The default value is 1000 which is not suitable for cat datasets. 
53 | param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), 54 | logger=dict(type='LoggerHook', interval=5)) 55 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 56 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' 2 | 3 | model = dict( 4 | data_preprocessor=dict( 5 | type='YOLOv5DetDataPreprocessor', 6 | pad_size_divisor=32, 7 | batch_augments=[ 8 | dict( 9 | type='YOLOXBatchSyncRandomResize', 10 | random_size_range=(480, 800), 11 | size_divisor=32, 12 | interval=1) 13 | ])) 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' 2 | 3 | test_pipeline = [ 4 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 5 | dict( 6 | type='LetterResize', 7 | scale=_base_.img_scale, 8 | allow_scale_up=True, 9 | use_mini_pad=True), 10 | dict(type='LoadAnnotations', with_bbox=True), 11 | dict( 12 | type='mmdet.PackDetInputs', 13 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 14 | 'scale_factor', 'pad_param')) 15 | ] 16 | 17 | val_dataloader = dict( 18 | dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) 19 | test_dataloader = val_dataloader 20 | 21 | model = dict( 22 | test_cfg=dict( 23 | multi_label=False, score_thr=0.25, nms=dict(iou_threshold=0.45))) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | data_root = 'data/balloon/' 5 | # Path of train annotation file 6 | train_ann_file = 'train.json' 7 | train_data_prefix = 'train/' # Prefix of train image path 8 | # Path of val annotation file 9 | val_ann_file = 'val.json' 10 | val_data_prefix = 'val/' # Prefix of val image path 11 | metainfo = { 12 | 'classes': ('balloon', ), 13 | 'palette': [ 14 | (220, 20, 60), 15 | ] 16 | } 17 | num_classes = 1 18 | 19 | train_batch_size_per_gpu = 4 20 | train_num_workers = 2 21 | log_interval = 1 22 | 23 | # =======================Unmodified in most cases================== 24 | train_dataloader = dict( 25 | batch_size=train_batch_size_per_gpu, 26 | num_workers=train_num_workers, 27 | dataset=dict( 28 | data_root=data_root, 29 | metainfo=metainfo, 30 | data_prefix=dict(img=train_data_prefix), 31 | ann_file=train_ann_file)) 32 | val_dataloader = dict( 33 | dataset=dict( 34 | data_root=data_root, 35 | metainfo=metainfo, 36 | data_prefix=dict(img=val_data_prefix), 37 | ann_file=val_ann_file)) 38 | test_dataloader = val_dataloader 39 | val_evaluator = dict(ann_file=data_root + val_ann_file) 40 | test_evaluator = val_evaluator 41 | model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) 42 | default_hooks = dict(logger=dict(interval=log_interval)) 43 | -------------------------------------------------------------------------------- 
/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' 2 | 3 | # fast means faster training speed, 4 | # but less flexibility for multitasking 5 | model = dict( 6 | data_preprocessor=dict( 7 | type='YOLOv5DetDataPreprocessor', 8 | mean=[0., 0., 0.], 9 | std=[255., 255., 255.], 10 | bgr_to_rgb=True)) 11 | 12 | train_dataloader = dict(collate_fn=dict(type='yolov5_collate')) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py' 2 | deepen_factor = 1.33 3 | widen_factor = 1.25 4 | 5 | model = dict( 6 | backbone=dict( 7 | deepen_factor=deepen_factor, 8 | widen_factor=widen_factor, 9 | ), 10 | neck=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' 2 | deepen_factor = 1.33 3 | widen_factor = 1.25 4 | 5 | model = dict( 6 | backbone=dict( 7 | deepen_factor=deepen_factor, 8 | widen_factor=widen_factor, 9 | ), 10 | neck=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | # ========================modified parameters====================== 7 | deepen_factor = 1.00 8 | widen_factor = 1.00 9 | 10 | mixup_prob = 0.15 11 | copypaste_prob = 0.3 12 | 13 | # =======================Unmodified in most cases================== 14 | img_scale = _base_.img_scale 15 | pre_transform = _base_.pre_transform 16 | last_transform = _base_.last_transform 17 | affine_scale = _base_.affine_scale 18 | 19 | model = dict( 20 | backbone=dict( 21 | deepen_factor=deepen_factor, 22 | widen_factor=widen_factor, 23 | ), 24 | neck=dict( 25 | deepen_factor=deepen_factor, 26 | widen_factor=widen_factor, 27 | ), 28 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 29 | 30 | mosaic_affine_transform = [ 31 | dict( 32 | type='Mosaic', 33 | img_scale=img_scale, 34 | pad_val=114.0, 35 | pre_transform=pre_transform), 36 | dict(type='YOLOv5CopyPaste', prob=copypaste_prob), 37 | dict( 38 | type='YOLOv5RandomAffine', 39 | max_rotate_degree=0.0, 40 | max_shear_degree=0.0, 41 | max_aspect_ratio=100., 42 | scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), 43 | # img_scale is (width, height) 44 | border=(-img_scale[0] // 2, -img_scale[1] // 2), 45 | border_val=(114, 114, 114), 46 | min_area_ratio=_base_.min_area_ratio, 47 | use_mask_refine=_base_.use_mask2refine) 48 | ] 49 
| 50 | train_pipeline = [ 51 | *pre_transform, *mosaic_affine_transform, 52 | dict( 53 | type='YOLOv5MixUp', 54 | prob=mixup_prob, 55 | pre_transform=[*pre_transform, *mosaic_affine_transform]), 56 | *last_transform 57 | ] 58 | 59 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 60 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | # TODO: Update the training hyperparameters 5 | deepen_factor = 1.0 6 | widen_factor = 1.0 7 | 8 | # =======================Unmodified in most cases================== 9 | model = dict( 10 | backbone=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | neck=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | ), 18 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 19 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | # TODO: Update the training hyperparameters 5 | deepen_factor = 0.67 6 | widen_factor = 0.75 7 | 8 | # =======================Unmodified in most cases================== 9 | model = dict( 10 | backbone=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | neck=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | ), 18 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 19 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | # ========================modified parameters====================== 7 | deepen_factor = 0.33 8 | widen_factor = 0.25 9 | 10 | # ===============================Unmodified in most cases==================== 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | neck=dict( 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | ), 20 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 21 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.33 5 | widen_factor = 0.25 6 | 7 | # =======================Unmodified in most cases================== 8 | model = dict( 9 | backbone=dict( 10 | deepen_factor=deepen_factor, 11 | widen_factor=widen_factor, 12 | ), 13 | neck=dict( 14 | 
deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.33 5 | widen_factor = 1.25 6 | 7 | # =======================Unmodified in most cases================== 8 | model = dict( 9 | backbone=dict( 10 | deepen_factor=deepen_factor, 11 | widen_factor=widen_factor, 12 | ), 13 | neck=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_l_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | # TODO: Update the training hyperparameters 5 | deepen_factor = 1.33 6 | widen_factor = 1.25 7 | 8 | # =======================Unmodified in most cases================== 9 | model = dict( 10 | backbone=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | neck=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | ), 18 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 19 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_m_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 1 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 1 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | hidden_ratio=1. / 2, 16 | block_cfg=dict( 17 | type='ConvWrapper', 18 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), 19 | act_cfg=dict(type='SiLU', inplace=True)), 20 | neck=dict( 21 | deepen_factor=deepen_factor, 22 | widen_factor=widen_factor, 23 | hidden_ratio=1. 
/ 2, 24 | block_cfg=dict( 25 | type='ConvWrapper', 26 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), 27 | block_act_cfg=dict(type='SiLU', inplace=True)), 28 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 29 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.6 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.75 9 | 10 | # -----train val related----- 11 | affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict( 16 | type='YOLOv6CSPBep', 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | hidden_ratio=2. / 3, 20 | block_cfg=dict(type='RepVGGBlock'), 21 | act_cfg=dict(type='ReLU', inplace=True)), 22 | neck=dict( 23 | type='YOLOv6CSPRepPAFPN', 24 | deepen_factor=deepen_factor, 25 | widen_factor=widen_factor, 26 | block_cfg=dict(type='RepVGGBlock'), 27 | hidden_ratio=2. / 3, 28 | block_act_cfg=dict(type='ReLU', inplace=True)), 29 | bbox_head=dict( 30 | type='YOLOv6Head', head_module=dict(widen_factor=widen_factor))) 31 | 32 | mosaic_affine_pipeline = [ 33 | dict( 34 | type='Mosaic', 35 | img_scale=_base_.img_scale, 36 | pad_val=114.0, 37 | pre_transform=_base_.pre_transform), 38 | dict( 39 | type='YOLOv5RandomAffine', 40 | max_rotate_degree=0.0, 41 | max_shear_degree=0.0, 42 | scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), 43 | # img_scale is (width, height) 44 | border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 45 | border_val=(114, 114, 114)) 46 | ] 47 | 48 | train_pipeline = [ 49 | *_base_.pre_transform, *mosaic_affine_pipeline, 50 | dict( 51 | type='YOLOv5MixUp', 52 | prob=0.1, 53 | pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), 54 | dict(type='YOLOv5HSVRandomAug'), 55 | dict(type='mmdet.RandomFlip', prob=0.5), 56 | dict( 57 | type='mmdet.PackDetInputs', 58 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 59 | 'flip_direction')) 60 | ] 61 | 62 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 63 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.25 9 | 10 | # -----train val related----- 11 | lr_factor = 0.02 # Learning rate scaling factor 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 16 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 17 | bbox_head=dict( 
18 | head_module=dict(widen_factor=widen_factor), 19 | loss_bbox=dict(iou_mode='siou'))) 20 | 21 | default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.25 9 | 10 | # -----train val related----- 11 | lr_factor = 0.02 # Learning rate scaling factor 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 16 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 17 | bbox_head=dict( 18 | head_module=dict(widen_factor=widen_factor), 19 | loss_bbox=dict(iou_mode='siou'))) 20 | 21 | default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_s_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | max_epochs = 40 9 | train_batch_size_per_gpu = 12 10 | train_num_workers = 4 11 | num_last_epochs = 5 12 | 13 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa 14 | 15 | model = dict( 16 | backbone=dict(frozen_stages=4), 17 | bbox_head=dict(head_module=dict(num_classes=num_classes)), 18 | train_cfg=dict( 19 | initial_assigner=dict(num_classes=num_classes), 20 | assigner=dict(num_classes=num_classes))) 21 | 22 | train_dataloader = dict( 23 | batch_size=train_batch_size_per_gpu, 24 | num_workers=train_num_workers, 25 | dataset=dict( 26 | data_root=data_root, 27 | metainfo=metainfo, 28 | ann_file='annotations/trainval.json', 29 | data_prefix=dict(img='images/'))) 30 | 31 | val_dataloader = dict( 32 | dataset=dict( 33 | metainfo=metainfo, 34 | data_root=data_root, 35 | ann_file='annotations/test.json', 36 | data_prefix=dict(img='images/'))) 37 | 38 | test_dataloader = val_dataloader 39 | 40 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 41 | test_evaluator = val_evaluator 42 | 43 | _base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu 44 | _base_.custom_hooks[1].switch_epoch = max_epochs - num_last_epochs 45 | 46 | default_hooks = dict( 47 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 48 | # The warmup_mim_iter parameter is critical. 49 | # The default value is 1000 which is not suitable for cat datasets. 
50 | param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), 51 | logger=dict(type='LoggerHook', interval=5)) 52 | train_cfg = dict( 53 | max_epochs=max_epochs, 54 | val_interval=10, 55 | dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) 56 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | # ======================= Frequently modified parameters ===================== 4 | # -----train val related----- 5 | # Base learning rate for optim_wrapper 6 | max_epochs = 300 # Maximum training epochs 7 | num_last_epochs = 15 # Last epoch number to switch training pipeline 8 | 9 | # ============================== Unmodified in most cases =================== 10 | default_hooks = dict( 11 | param_scheduler=dict( 12 | type='YOLOv5ParamSchedulerHook', 13 | scheduler_type='cosine', 14 | lr_factor=0.01, 15 | max_epochs=max_epochs)) 16 | 17 | custom_hooks = [ 18 | dict( 19 | type='EMAHook', 20 | ema_type='ExpMomentumEMA', 21 | momentum=0.0001, 22 | update_buffers=True, 23 | strict_load=False, 24 | priority=49), 25 | dict( 26 | type='mmdet.PipelineSwitchHook', 27 | switch_epoch=max_epochs - num_last_epochs, 28 | switch_pipeline=_base_.train_pipeline_stage2) 29 | ] 30 | 31 | train_cfg = dict( 32 | max_epochs=max_epochs, 33 | dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) 34 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.375 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 | bbox_head=dict( 15 | type='YOLOv6Head', 16 | head_module=dict(widen_factor=widen_factor), 17 | loss_bbox=dict(iou_mode='siou'))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.375 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 | bbox_head=dict( 15 | 
type='YOLOv6Head', 16 | head_module=dict(widen_factor=widen_factor), 17 | loss_bbox=dict(iou_mode='siou'))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 1 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 1 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | hidden_ratio=1. / 2, 16 | block_cfg=dict( 17 | type='ConvWrapper', 18 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), 19 | act_cfg=dict(type='SiLU', inplace=True)), 20 | neck=dict( 21 | deepen_factor=deepen_factor, 22 | widen_factor=widen_factor, 23 | hidden_ratio=1. / 2, 24 | block_cfg=dict( 25 | type='ConvWrapper', 26 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), 27 | block_act_cfg=dict(type='SiLU', inplace=True)), 28 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 29 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.6 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.75 9 | 10 | # -----train val related----- 11 | affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict( 16 | type='YOLOv6CSPBep', 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | hidden_ratio=2. / 3, 20 | block_cfg=dict(type='RepVGGBlock'), 21 | act_cfg=dict(type='ReLU', inplace=True)), 22 | neck=dict( 23 | type='YOLOv6CSPRepBiPAFPN', 24 | deepen_factor=deepen_factor, 25 | widen_factor=widen_factor, 26 | block_cfg=dict(type='RepVGGBlock'), 27 | hidden_ratio=2. 
/ 3, 28 | block_act_cfg=dict(type='ReLU', inplace=True)), 29 | bbox_head=dict( 30 | type='YOLOv6Head', 31 | head_module=dict(reg_max=16, widen_factor=widen_factor))) 32 | 33 | mosaic_affine_pipeline = [ 34 | dict( 35 | type='Mosaic', 36 | img_scale=_base_.img_scale, 37 | pad_val=114.0, 38 | pre_transform=_base_.pre_transform), 39 | dict( 40 | type='YOLOv5RandomAffine', 41 | max_rotate_degree=0.0, 42 | max_shear_degree=0.0, 43 | scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), 44 | # img_scale is (width, height) 45 | border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 46 | border_val=(114, 114, 114)) 47 | ] 48 | 49 | train_pipeline = [ 50 | *_base_.pre_transform, *mosaic_affine_pipeline, 51 | dict( 52 | type='YOLOv5MixUp', 53 | prob=0.1, 54 | pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), 55 | dict(type='YOLOv5HSVRandomAug'), 56 | dict(type='mmdet.RandomFlip', prob=0.5), 57 | dict( 58 | type='mmdet.PackDetInputs', 59 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 60 | 'flip_direction')) 61 | ] 62 | 63 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 64 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.25 9 | 10 | # -----train val related----- 11 | lr_factor = 0.02 # Learning rate scaling factor 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 16 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 17 | bbox_head=dict( 18 | head_module=dict(widen_factor=widen_factor), 19 | loss_bbox=dict(iou_mode='siou'))) 20 | 21 | default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.375 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 | bbox_head=dict( 15 | type='YOLOv6Head', 16 | head_module=dict(widen_factor=widen_factor), 17 | loss_bbox=dict(iou_mode='siou'))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py: -------------------------------------------------------------------------------- 1 | 
_base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | model = dict( 4 | backbone=dict(arch='D'), 5 | neck=dict( 6 | use_maxpool_in_downsample=True, 7 | use_in_channels_in_downsample=True, 8 | block_cfg=dict( 9 | type='ELANBlock', 10 | middle_ratio=0.4, 11 | block_ratio=0.2, 12 | num_blocks=6, 13 | num_convs_in_block=1), 14 | in_channels=[384, 768, 1152, 1536], 15 | out_channels=[192, 384, 576, 768]), 16 | bbox_head=dict( 17 | head_module=dict( 18 | in_channels=[192, 384, 576, 768], 19 | main_out_channels=[384, 768, 1152, 1536], 20 | aux_out_channels=[384, 768, 1152, 1536], 21 | ))) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | model = dict( 4 | backbone=dict(arch='E'), 5 | neck=dict( 6 | use_maxpool_in_downsample=True, 7 | use_in_channels_in_downsample=True, 8 | block_cfg=dict( 9 | type='ELANBlock', 10 | middle_ratio=0.4, 11 | block_ratio=0.2, 12 | num_blocks=6, 13 | num_convs_in_block=1), 14 | in_channels=[320, 640, 960, 1280], 15 | out_channels=[160, 320, 480, 640]), 16 | bbox_head=dict( 17 | head_module=dict( 18 | in_channels=[160, 320, 480, 640], 19 | main_out_channels=[320, 640, 960, 1280]))) 20 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | model = dict( 4 | backbone=dict(arch='E2E'), 5 | neck=dict( 6 | use_maxpool_in_downsample=True, 7 | use_in_channels_in_downsample=True, 8 | block_cfg=dict( 9 | type='EELANBlock', 10 | num_elan_block=2, 11 | middle_ratio=0.4, 12 | block_ratio=0.2, 13 | num_blocks=6, 14 | num_convs_in_block=1), 15 | in_channels=[320, 640, 960, 1280], 16 | out_channels=[160, 320, 480, 640]), 17 | bbox_head=dict( 18 | head_module=dict( 19 | in_channels=[160, 320, 480, 640], 20 | main_out_channels=[320, 640, 960, 1280]))) 21 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_tiny_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov7_tiny_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | anchors = [ 9 | [(68, 69), (154, 91), (143, 162)], # P3/8 10 | [(242, 160), (189, 287), (391, 207)], # P4/16 11 | [(353, 337), (539, 341), (443, 432)] # P5/32 12 | ] 13 | 14 | max_epochs = 40 15 | train_batch_size_per_gpu = 12 16 | train_num_workers = 4 17 | 18 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa 19 | 20 | model = dict( 21 | backbone=dict(frozen_stages=4), 22 | bbox_head=dict( 23 | head_module=dict(num_classes=num_classes), 24 | prior_generator=dict(base_sizes=anchors))) 25 | 26 | train_dataloader = dict( 27 | batch_size=train_batch_size_per_gpu, 28 | num_workers=train_num_workers, 29 | dataset=dict( 30 | data_root=data_root, 31 | metainfo=metainfo, 32 | ann_file='annotations/trainval.json', 33 | 
data_prefix=dict(img='images/'))) 34 | 35 | val_dataloader = dict( 36 | dataset=dict( 37 | metainfo=metainfo, 38 | data_root=data_root, 39 | ann_file='annotations/test.json', 40 | data_prefix=dict(img='images/'))) 41 | 42 | test_dataloader = val_dataloader 43 | 44 | _base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu 45 | 46 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 47 | test_evaluator = val_evaluator 48 | 49 | default_hooks = dict( 50 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 51 | # The warmup_mim_iter parameter is critical. 52 | # The default value is 1000 which is not suitable for cat datasets. 53 | param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), 54 | logger=dict(type='LoggerHook', interval=5)) 55 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 56 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov7_l_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | model = dict( 4 | backbone=dict(arch='X'), 5 | neck=dict( 6 | in_channels=[640, 1280, 1280], 7 | out_channels=[160, 320, 640], 8 | block_cfg=dict( 9 | type='ELANBlock', 10 | middle_ratio=0.4, 11 | block_ratio=0.4, 12 | num_blocks=3, 13 | num_convs_in_block=2), 14 | use_repconv_outs=False), 15 | bbox_head=dict(head_module=dict(in_channels=[320, 640, 1280]))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | # This config uses bbox refining and `YOLOv5CopyPaste`.
4 | # Refining bbox means refining bbox by mask while loading annotations and 5 | # transforming after `YOLOv5RandomAffine` 6 | 7 | # ========================modified parameters====================== 8 | deepen_factor = 1.00 9 | widen_factor = 1.00 10 | last_stage_out_channels = 512 11 | 12 | mixup_prob = 0.15 13 | copypaste_prob = 0.3 14 | 15 | # =======================Unmodified in most cases================== 16 | img_scale = _base_.img_scale 17 | pre_transform = _base_.pre_transform 18 | last_transform = _base_.last_transform 19 | affine_scale = _base_.affine_scale 20 | 21 | model = dict( 22 | backbone=dict( 23 | last_stage_out_channels=last_stage_out_channels, 24 | deepen_factor=deepen_factor, 25 | widen_factor=widen_factor), 26 | neck=dict( 27 | deepen_factor=deepen_factor, 28 | widen_factor=widen_factor, 29 | in_channels=[256, 512, last_stage_out_channels], 30 | out_channels=[256, 512, last_stage_out_channels]), 31 | bbox_head=dict( 32 | head_module=dict( 33 | widen_factor=widen_factor, 34 | in_channels=[256, 512, last_stage_out_channels]))) 35 | 36 | mosaic_affine_transform = [ 37 | dict( 38 | type='Mosaic', 39 | img_scale=img_scale, 40 | pad_val=114.0, 41 | pre_transform=pre_transform), 42 | dict(type='YOLOv5CopyPaste', prob=copypaste_prob), 43 | dict( 44 | type='YOLOv5RandomAffine', 45 | max_rotate_degree=0.0, 46 | max_shear_degree=0.0, 47 | max_aspect_ratio=100., 48 | scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), 49 | # img_scale is (width, height) 50 | border=(-img_scale[0] // 2, -img_scale[1] // 2), 51 | border_val=(114, 114, 114), 52 | min_area_ratio=_base_.min_area_ratio, 53 | use_mask_refine=_base_.use_mask2refine) 54 | ] 55 | 56 | train_pipeline = [ 57 | *pre_transform, *mosaic_affine_transform, 58 | dict( 59 | type='YOLOv5MixUp', 60 | prob=mixup_prob, 61 | pre_transform=[*pre_transform, *mosaic_affine_transform]), 62 | *last_transform 63 | ] 64 | 65 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 66 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_m_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.00 5 | widen_factor = 1.00 6 | last_stage_out_channels = 512 7 | 8 | mixup_prob = 0.15 9 | 10 | # =======================Unmodified in most cases================== 11 | pre_transform = _base_.pre_transform 12 | mosaic_affine_transform = _base_.mosaic_affine_transform 13 | last_transform = _base_.last_transform 14 | 15 | model = dict( 16 | backbone=dict( 17 | last_stage_out_channels=last_stage_out_channels, 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor), 20 | neck=dict( 21 | deepen_factor=deepen_factor, 22 | widen_factor=widen_factor, 23 | in_channels=[256, 512, last_stage_out_channels], 24 | out_channels=[256, 512, last_stage_out_channels]), 25 | bbox_head=dict( 26 | head_module=dict( 27 | widen_factor=widen_factor, 28 | in_channels=[256, 512, last_stage_out_channels]))) 29 | 30 | train_pipeline = [ 31 | *pre_transform, *mosaic_affine_transform, 32 | dict( 33 | type='YOLOv5MixUp', 34 | prob=mixup_prob, 35 | pre_transform=[*pre_transform, *mosaic_affine_transform]), 36 | *last_transform 37 | ] 38 | 39 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 40 | 
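The two YOLOv8-L configs above follow the scaling pattern used across these mmyolo configs: a child file inherits `_base_` and overrides only `deepen_factor`, `widen_factor` and, for the L/X sizes, `last_stage_out_channels`, then re-declares `train_pipeline` so the new `mixup_prob` takes effect. A minimal sketch for checking that the overrides actually propagate through the inheritance chain, assuming mmengine is installed and the command is run from the repository root:

# Print the effective scaling factors of the merged config
# (expected output for the L config: 1.0 1.0).
python3 -c "
from mmengine.config import Config
cfg = Config.fromfile(
    'third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py')
print(cfg.model.backbone.deepen_factor, cfg.model.backbone.widen_factor)
"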
-------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | deepen_factor = 0.33 7 | widen_factor = 0.25 8 | 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | model = dict( 7 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 8 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 9 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_s_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov8_s_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | close_mosaic_epochs = 5 9 | 10 | max_epochs = 40 11 | train_batch_size_per_gpu = 12 12 | train_num_workers = 4 13 | 14 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth' # noqa 15 | 16 | model = dict( 17 | backbone=dict(frozen_stages=4), 18 | bbox_head=dict(head_module=dict(num_classes=num_classes)), 19 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 20 | 21 | train_dataloader = dict( 22 | batch_size=train_batch_size_per_gpu, 23 | num_workers=train_num_workers, 24 | dataset=dict( 25 | data_root=data_root, 26 | metainfo=metainfo, 27 | ann_file='annotations/trainval.json', 28 | data_prefix=dict(img='images/'))) 29 | 30 | val_dataloader = dict( 31 | dataset=dict( 32 | metainfo=metainfo, 33 | data_root=data_root, 34 | ann_file='annotations/test.json', 35 | data_prefix=dict(img='images/'))) 36 | 37 | test_dataloader = val_dataloader 38 | 39 | _base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu 40 | _base_.custom_hooks[1].switch_epoch = max_epochs - close_mosaic_epochs 41 | 42 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 43 | test_evaluator = val_evaluator 44 | 45 | default_hooks = dict( 46 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 47 | # The warmup_mim_iter parameter is critical. 48 | # The default value is 1000 which is not suitable for cat datasets. 
49 | param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), 50 | logger=dict(type='LoggerHook', interval=5)) 51 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 52 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 53 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | # This config uses bbox refining and `YOLOv5CopyPaste`. 4 | # Refining bbox means refining bbox by mask while loading annotations and 5 | # transforming after `YOLOv5RandomAffine` 6 | 7 | deepen_factor = 1.00 8 | widen_factor = 1.25 9 | 10 | model = dict( 11 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_l_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | deepen_factor = 1.00 4 | widen_factor = 1.25 5 | 6 | model = dict( 7 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 8 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 9 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py'] 2 | 3 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 1.0 7 | widen_factor = 1.0 8 | 9 | # =======================Unmodified in most cases================== 10 | # model settings 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py'] 2 | 3 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.67 7 | widen_factor = 0.75 8 | 9 | # =======================Unmodified in most cases================== 10 | # model settings 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 |
bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py' 2 | 3 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth' # noqa 4 | 5 | deepen_factor = 0.33 6 | widen_factor = 0.375 7 | scaling_ratio_range = (0.75, 1.0) 8 | 9 | # model settings 10 | model = dict( 11 | data_preprocessor=dict(batch_augments=[ 12 | dict( 13 | type='YOLOXBatchSyncRandomResize', 14 | random_size_range=(320, 640), 15 | size_divisor=32, 16 | interval=1) 17 | ]), 18 | backbone=dict( 19 | deepen_factor=deepen_factor, 20 | widen_factor=widen_factor, 21 | ), 22 | neck=dict( 23 | deepen_factor=deepen_factor, 24 | widen_factor=widen_factor, 25 | ), 26 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 27 | 28 | # data settings 29 | img_scale = _base_.img_scale 30 | pre_transform = _base_.pre_transform 31 | 32 | train_pipeline_stage1 = [ 33 | *pre_transform, 34 | dict( 35 | type='Mosaic', 36 | img_scale=img_scale, 37 | pad_val=114.0, 38 | pre_transform=pre_transform), 39 | dict( 40 | type='RandomAffine', 41 | scaling_ratio_range=scaling_ratio_range, 42 | border=(-img_scale[0] // 2, -img_scale[1] // 2)), 43 | dict(type='mmdet.YOLOXHSVRandomAug'), 44 | dict(type='RandomFlip', prob=0.5), 45 | dict( 46 | type='FilterAnnotations', 47 | by_keypoints=True, 48 | min_gt_bbox_wh=(1, 1), 49 | keep_empty=False), 50 | dict( 51 | type='PackDetInputs', 52 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) 53 | ] 54 | 55 | test_pipeline = [ 56 | *pre_transform, 57 | dict(type='Resize', scale=(416, 416), keep_ratio=True), 58 | dict( 59 | type='mmdet.Pad', 60 | pad_to_square=True, 61 | pad_val=dict(img=(114.0, 114.0, 114.0))), 62 | dict( 63 | type='PackDetInputs', 64 | meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', 65 | 'scale_factor', 'flip_indices')) 66 | ] 67 | 68 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline_stage1)) 69 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 70 | test_dataloader = val_dataloader 71 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_l_fast_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_s_fast_8xb8-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.0 5 | widen_factor = 1.0 6 | 7 | # =======================Unmodified in most cases================== 8 | # model settings 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.67 5 | 
widen_factor = 0.75 6 | 7 | # =======================Unmodified in most cases================== 8 | # model settings 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_m_fast_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_s_fast_8xb8-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.67 5 | widen_factor = 0.75 6 | 7 | # =======================Unmodified in most cases================== 8 | # model settings 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.33 5 | widen_factor = 0.25 6 | use_depthwise = True 7 | 8 | # =======================Unmodified in most cases================== 9 | # model settings 10 | model = dict( 11 | backbone=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | use_depthwise=use_depthwise), 15 | neck=dict( 16 | deepen_factor=deepen_factor, 17 | widen_factor=widen_factor, 18 | use_depthwise=use_depthwise), 19 | bbox_head=dict( 20 | head_module=dict( 21 | widen_factor=widen_factor, use_depthwise=use_depthwise))) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_tiny_fast_8xb8-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.33 5 | widen_factor = 0.25 6 | use_depthwise = True 7 | 8 | # =======================Unmodified in most cases================== 9 | # model settings 10 | model = dict( 11 | backbone=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | use_depthwise=use_depthwise), 15 | neck=dict( 16 | deepen_factor=deepen_factor, 17 | widen_factor=widen_factor, 18 | use_depthwise=use_depthwise), 19 | bbox_head=dict( 20 | head_module=dict( 21 | widen_factor=widen_factor, use_depthwise=use_depthwise))) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_p5_tta.py: -------------------------------------------------------------------------------- 1 | # TODO: Need to solve the problem of multiple backend_args parameters 2 | # _backend_args = dict( 3 | # backend='petrel', 4 | # path_mapping=dict({ 5 | # './data/': 's3://openmmlab/datasets/detection/', 6 | # 'data/': 's3://openmmlab/datasets/detection/' 7 | # })) 8 | 9 | _backend_args = None 10 | 11 | tta_model = dict( 12 | type='mmdet.DetTTAModel', 13 | tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), 
max_per_img=300)) 14 | 15 | img_scales = [(640, 640), (320, 320), (960, 960)] 16 | 17 | # LoadImageFromFile 18 | # / | \ 19 | # Resize Resize Resize # noqa 20 | # / \ / \ / \ 21 | # RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip # noqa 22 | # | | | | | | 23 | # LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn 24 | # | | | | | | 25 | # PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa 26 | 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=_backend_args), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='mmdet.Resize', scale=s, keep_ratio=True) 34 | for s in img_scales 35 | ], 36 | [ 37 | # ``RandomFlip`` must be placed before ``Pad``, otherwise 38 | # bounding box coordinates after flipping cannot be 39 | # recovered correctly. 40 | dict(type='mmdet.RandomFlip', prob=1.), 41 | dict(type='mmdet.RandomFlip', prob=0.) 42 | ], 43 | [ 44 | dict( 45 | type='mmdet.Pad', 46 | pad_to_square=True, 47 | pad_val=dict(img=(114.0, 114.0, 114.0))), 48 | ], 49 | [ 50 | dict( 51 | type='mmdet.PackDetInputs', 52 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 53 | 'scale_factor', 'flip', 'flip_direction')) 54 | ] 55 | ]) 56 | ] 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_x_fast_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_s_fast_8xb8-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.33 5 | widen_factor = 1.25 6 | 7 | # =======================Unmodified in most cases================== 8 | # model settings 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /tools/count_num_parameters.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | 5 | def parse_args(): 6 | 7 | parser = argparse.ArgumentParser("Compute the number of parameters of a model") 8 | parser.add_argument('checkpoint', type=str, help='model checkpoint path') 9 | 10 | args = parser.parse_args() 11 | return args 12 | 13 | 14 | if __name__ == '__main__': 15 | args = parse_args() 16 | 17 | # load checkpoint 18 | model = torch.load(args.checkpoint, map_location='cpu') 19 | state_dict = model['state_dict'] 20 | num_parameters = 0 21 | 22 | for k, v in state_dict.items(): 23 | num_parameters += v.numel() 24 | 25 | print(f'num_parameters: {num_parameters} | {num_parameters / 1e6:.2f} M') -------------------------------------------------------------------------------- /tools/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29588} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python3 -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 |
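dist_test.sh wraps `torch.distributed.launch` around `tools/test.py`: the three positional arguments are the config, the checkpoint and the GPU count, everything after them is forwarded to `test.py` through `${@:4}`, and `NNODES`, `NODE_RANK`, `PORT` and `MASTER_ADDR` can be overridden via environment variables. A hedged usage sketch with placeholder paths — substitute a real config and checkpoint; `--work-dir` is the usual mm-style `test.py` flag and is assumed to exist here:

# Single-node evaluation on 8 GPUs; overriding PORT avoids rendezvous
# clashes with other jobs on the same machine.
PORT=29601 bash tools/dist_test.sh \
    path/to/config.py path/to/checkpoint.pth 8 \
    --work-dir work_dirs/eval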
-------------------------------------------------------------------------------- /tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${MASTER_PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python3 -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /tools/evaluate_latency.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-861/developer-guide/index.html#trtexec 3 | 4 | trtexec --onnx=path_to_onnx_file \ 5 | --fp16 \ 6 | --iterations=2000 \ 7 | --verbose \ 8 | --device=0 -------------------------------------------------------------------------------- /tools/generate_image_prompts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tqdm 3 | import argparse 4 | import os.path as osp 5 | import numpy as np 6 | from PIL import Image 7 | from transformers import (AutoTokenizer, AutoProcessor, 8 | CLIPVisionModelWithProjection, 9 | CLIPTextModelWithProjection) 10 | 11 | if __name__ == "__main__": 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | '--model', 16 | type=str, 17 | default='../pretrained_models/open-ai-clip-vit-base-patch32') 18 | parser.add_argument('--image-dir', type=str, default='data/samples.txt') 19 | parser.add_argument('--out-dir', type=str, default='') 20 | parser.add_argument('--out-file', type=str) 21 | 22 | args = parser.parse_args() 23 | 24 | tokenizer = AutoTokenizer.from_pretrained(args.model) 25 | vision_model = CLIPVisionModelWithProjection.from_pretrained(args.model) 26 | text_model = CLIPTextModelWithProjection.from_pretrained(args.model) 27 | processor = AutoProcessor.from_pretrained(args.model) 28 | 29 | # padding prompts 30 | device = 'cuda:0' 31 | text_model.to(device) 32 | texts = tokenizer(text=[' '], return_tensors='pt', padding=True) 33 | texts = texts.to(device) 34 | text_outputs = text_model(**texts) 35 | txt_feats = text_outputs.text_embeds 36 | txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True) 37 | txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1]).cpu().data.numpy() 38 | 39 | images = os.listdir(args.image_dir) 40 | category_embeds = [] 41 | 42 | def _forward_vision_model(image_name): 43 | image_path = osp.join(args.image_dir, image_name) 44 | # category = image_name.split('-')[1] 45 | image = Image.open(image_path).convert("RGB") 46 | inputs = processor(images=image, return_tensors="pt", padding=True) 47 | image_outputs = vision_model(**inputs) 48 | img_feats = image_outputs.image_embeds 49 | # img_feats 50 | img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True) 51 | img_feats = img_feats.reshape( 52 | -1, img_feats.shape[-1])[0].cpu().data.numpy() 53 | category_embeds.append(img_feats) 54 | 55 | for image_ in tqdm.tqdm(images): 56 | _forward_vision_model(image_) 57 | category_embeds.append(txt_feats) 58 | category_embeds = np.stack(category_embeds) 59 | np.save(osp.join(args.out_dir, args.out_file), 
category_embeds) 60 | -------------------------------------------------------------------------------- /tools/generate_text_prompts_dosod.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | import numpy as np 5 | import torch 6 | from mmdet.apis import init_detector 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('config', type=str, help='Path to config file') 11 | parser.add_argument('checkpoint', type=str, help='Path to checkpoint file') 12 | parser.add_argument('--text', 13 | type=str, 14 | default='data/texts/coco_class_texts.json', 15 | help='Path to text file') 16 | parser.add_argument('--out-dir', type=str, help='The dir to save text embeddings npy') 17 | parser.add_argument('--device', 18 | default='cuda:0', 19 | help='Device used for run') 20 | 21 | args = parser.parse_args() 22 | 23 | device = args.device 24 | 25 | with open(args.text) as f: 26 | data = json.load(f) 27 | texts = [x[0] for x in data] 28 | 29 | # generate text embeddings 30 | print('init model......') 31 | model = init_detector(args.config, args.checkpoint, device=device) 32 | model.eval() 33 | 34 | print('start to generate text embeddings......') 35 | with torch.no_grad(): 36 | text_embeddings = model.backbone_text([texts], enable_assertion=False) 37 | text_embeddings = model.bbox_head.head_module.forward_text(text_embeddings) 38 | text_embeddings = text_embeddings.reshape(-1, text_embeddings.shape[-1]) 39 | 40 | print('start to save text embeddings......') 41 | os.makedirs(args.out_dir, exist_ok=True) 42 | text_embeddings = text_embeddings.cpu().data.numpy() 43 | np.save(os.path.join(args.out_dir, 44 | os.path.splitext(os.path.basename(args.text))[0] + '_' + os.path.splitext(os.path.basename(args.checkpoint))[0]) + ".npy", 45 | text_embeddings) 46 | -------------------------------------------------------------------------------- /tools/generate_text_prompts_yoloworld.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import numpy as np 4 | from transformers import (AutoTokenizer, CLIPTextModelWithProjection) 5 | 6 | 7 | if __name__ == "__main__": 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | '--model', 12 | type=str, 13 | default='/horizon-bucket/AIoT-data-bucket/yonghao01.he/pretrain_models/clip-vit-base-patch32') 14 | parser.add_argument('--text', 15 | type=str, 16 | default='/home/users/yonghao01.he/projects/YOLO-World-Workspace/yolo-world-reparameterize-show/open_word.json') 17 | parser.add_argument('--out', type=str, default='/home/users/yonghao01.he/projects/YOLO-World-Workspace/yolo-world-reparameterize-show/open_word.npy') 18 | 19 | args = parser.parse_args() 20 | 21 | tokenizer = AutoTokenizer.from_pretrained(args.model) 22 | model = CLIPTextModelWithProjection.from_pretrained(args.model) 23 | 24 | with open(args.text) as f: 25 | data = json.load(f) 26 | texts = [x[0] for x in data] 27 | device = 'cuda:0' 28 | model.to(device) 29 | texts = tokenizer(text=texts, return_tensors='pt', padding=True) 30 | texts = texts.to(device) 31 | text_outputs = model(**texts) 32 | txt_feats = text_outputs.text_embeds 33 | txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True) 34 | txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1]) 35 | 36 | np.save(args.out, txt_feats.cpu().data.numpy()) 37 |
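This script encodes an offline vocabulary with the plain CLIP text tower: every `[name]` entry of the input JSON (the format written by `tools/generate_vocabulary_json.py` below) is tokenized, embedded and L2-normalized, and the stacked embeddings are saved as an N x D `.npy` matrix. The companion `tools/generate_text_prompts_dosod.py` above does the same through a trained DOSOD checkpoint, so the model's own text adapter is applied; either matrix is the kind of input `tools/reparameterize_dosod.py` consumes via `--text-embed`. A hedged usage sketch with illustrative paths, since the defaults above point at the author's private storage:

# Encode the COCO class texts with a locally downloaded CLIP checkpoint;
# --model may also be a Hugging Face hub id when network access is available.
python3 tools/generate_text_prompts_yoloworld.py \
    --model pretrained_models/clip-vit-base-patch32 \
    --text data/texts/coco_class_texts.json \
    --out coco_class_texts.npy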
-------------------------------------------------------------------------------- /tools/generate_vocabulary_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser(description="Convert Vocabulary Text to JSON File") 6 | parser.add_argument("--text", type=str, default="person,bicycle,car,motorcycle,airplane,bus,train,truck,boat,traffic light,fire hydrant,stop sign,parking meter,bench,bird,cat,dog,horse,sheep,cow,elephant,bear,zebra,giraffe,backpack,umbrella,handbag,tie,suitcase,frisbee,skis,snowboard,sports ball,kite,baseball bat,baseball glove,skateboard,surfboard,tennis racket,bottle,wine glass,cup,fork,knife,spoon,bowl,banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,cake,chair,couch,potted plant,bed,dining table,toilet,tv,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,sink,refrigerator,book,clock,vase,scissors,teddy bear,hair drier,toothbrush", help='Texts') 7 | parser.add_argument("--output", type=str, default='offline_vocabulary.json', help='Output path') 8 | 9 | args = parser.parse_args() 10 | 11 | # Specify the output JSON file name 12 | text = args.text 13 | output_file = args.output 14 | 15 | # Split the text by commas and strip extra whitespace 16 | items = [item.strip() for item in text.split(",")] 17 | 18 | # Wrap each item in its own single-element list 19 | nested_items = [[item] for item in items] 20 | 21 | print("len items:", len(nested_items), nested_items) 22 | 23 | # Save the nested list as a JSON file 24 | with open(output_file, "w", encoding="utf-8") as file: 25 | # indent=4 26 | json.dump(nested_items, file, ensure_ascii=False) 27 | 28 | print(f"Finished. Saved vocabulary file: {output_file}") 29 | 30 | -------------------------------------------------------------------------------- /tools/reparameterize_dosod.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def parse_args(): 9 | 10 | parser = argparse.ArgumentParser("Reparameterize DOSOD") 11 | parser.add_argument('--model', help='model checkpoints to reparameterize') 12 | parser.add_argument('--out-dir', help='output checkpoints') 13 | parser.add_argument( 14 | '--text-embed', 15 | help='text embeddings to be reparameterized') 16 | 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def convert_head(scale, bias, text_embed): 22 | N, D = text_embed.shape 23 | weight = (text_embed * scale.exp()).view(N, D, 1, 1) 24 | bias = torch.ones(N) * bias 25 | return weight, bias 26 | 27 | 28 | def reparameterize_head(state_dict, embeds): 29 | 30 | cls_layers = [ 31 | 'bbox_head.head_module.cls_contrasts.0', 32 | 'bbox_head.head_module.cls_contrasts.1', 33 | 'bbox_head.head_module.cls_contrasts.2' 34 | ] 35 | 36 | for i in range(3): 37 | scale = state_dict[cls_layers[i] + '.logit_scale'] 38 | bias = state_dict[cls_layers[i] + '.bias'] 39 | weight, bias = convert_head(scale, bias, embeds) 40 | state_dict[cls_layers[i] + '.conv.weight'] = weight 41 | state_dict[cls_layers[i] + '.conv.bias'] = bias 42 | del state_dict[cls_layers[i] + '.bias'] 43 | del state_dict[cls_layers[i] + '.logit_scale'] 44 | return state_dict 45 | 46 | 47 | def main(): 48 | 49 | args = parse_args() 50 | 51 | # load checkpoint 52 | model = torch.load(args.model, map_location='cpu') 53 | state_dict = model['state_dict'] 54 | 55 | # load embeddings 56 | embeddings = torch.from_numpy(np.load(args.text_embed)) 57 | 58 | # remove text encoder and text adaptor 59
| keys = list(state_dict.keys()) 60 | keys = [x for x in keys if "backbone_text" not in x and 'text_mlp' not in x] 61 | 62 | state_dict_wo_text = {x: state_dict[x] for x in keys} 63 | print("removing text encoder") 64 | 65 | state_dict_wo_text = reparameterize_head(state_dict_wo_text, embeddings) 66 | print("reparameterizing head") 67 | 68 | model['state_dict'] = state_dict_wo_text 69 | 70 | model_name = os.path.basename(args.model) 71 | model_name = model_name.replace('.pth', f'_rep.pth') 72 | torch.save(model, os.path.join(args.out_dir, model_name)) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /yolo_world/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tencent Inc. All rights reserved. 2 | import importlib.metadata as importlib_metadata 3 | 4 | try: 5 | __version__ = importlib_metadata.version(__package__ or __name__) 6 | except importlib_metadata.PackageNotFoundError: 7 | __version__ = '0.0.0' 8 | 9 | 10 | from .models import * # noqa 11 | from .datasets import * # noqa 12 | from .engine import * # noqa 13 | -------------------------------------------------------------------------------- /yolo_world/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tencent Inc. All rights reserved. 2 | from .mm_dataset import ( 3 | MultiModalDataset, MultiModalMixedDataset) 4 | from .yolov5_obj365v1 import YOLOv5Objects365V1Dataset 5 | from .yolov5_obj365v2 import YOLOv5Objects365V2Dataset 6 | from .yolov5_mixed_grounding import YOLOv5MixedGroundingDataset 7 | from .utils import yolow_collate 8 | from .transformers import * # NOQA 9 | from .yolov5_v3det import YOLOv5V3DetDataset 10 | from .yolov5_lvis import YOLOv5LVISV1Dataset 11 | from .yolov5_cc3m_grounding import YOLOv5GeneralGroundingDataset 12 | 13 | __all__ = [ 14 | 'MultiModalDataset', 'YOLOv5Objects365V1Dataset', 15 | 'YOLOv5Objects365V2Dataset', 'YOLOv5MixedGroundingDataset', 16 | 'YOLOv5V3DetDataset', 'yolow_collate', 17 | 'YOLOv5LVISV1Dataset', 'MultiModalMixedDataset', 18 | 'YOLOv5GeneralGroundingDataset' 19 | ] 20 | -------------------------------------------------------------------------------- /yolo_world/datasets/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tencent Inc. All rights reserved. 2 | from .mm_transforms import RandomLoadText, LoadText 3 | from .mm_mix_img_transforms import ( 4 | MultiModalMosaic, MultiModalMosaic9, YOLOv5MultiModalMixUp, 5 | YOLOXMultiModalMixUp) 6 | 7 | __all__ = ['RandomLoadText', 'LoadText', 'MultiModalMosaic', 8 | 'MultiModalMosaic9', 'YOLOv5MultiModalMixUp', 9 | 'YOLOXMultiModalMixUp'] 10 | -------------------------------------------------------------------------------- /yolo_world/datasets/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Sequence 3 | 4 | import torch 5 | from mmengine.dataset import COLLATE_FUNCTIONS 6 | 7 | 8 | @COLLATE_FUNCTIONS.register_module() 9 | def yolow_collate(data_batch: Sequence, 10 | use_ms_training: bool = False) -> dict: 11 | """Rewrite collate_fn to get faster training speed. 12 | 13 | Args: 14 | data_batch (Sequence): Batch of data. 15 | use_ms_training (bool): Whether to use multi-scale training. 
--------------------------------------------------------------------------------
/yolo_world/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
import importlib.metadata as importlib_metadata

try:
    __version__ = importlib_metadata.version(__package__ or __name__)
except importlib_metadata.PackageNotFoundError:
    __version__ = '0.0.0'


from .models import *  # noqa
from .datasets import *  # noqa
from .engine import *  # noqa
--------------------------------------------------------------------------------
/yolo_world/datasets/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .mm_dataset import (
    MultiModalDataset, MultiModalMixedDataset)
from .yolov5_obj365v1 import YOLOv5Objects365V1Dataset
from .yolov5_obj365v2 import YOLOv5Objects365V2Dataset
from .yolov5_mixed_grounding import YOLOv5MixedGroundingDataset
from .utils import yolow_collate
from .transformers import *  # NOQA
from .yolov5_v3det import YOLOv5V3DetDataset
from .yolov5_lvis import YOLOv5LVISV1Dataset
from .yolov5_cc3m_grounding import YOLOv5GeneralGroundingDataset

__all__ = [
    'MultiModalDataset', 'YOLOv5Objects365V1Dataset',
    'YOLOv5Objects365V2Dataset', 'YOLOv5MixedGroundingDataset',
    'YOLOv5V3DetDataset', 'yolow_collate',
    'YOLOv5LVISV1Dataset', 'MultiModalMixedDataset',
    'YOLOv5GeneralGroundingDataset'
]
--------------------------------------------------------------------------------
/yolo_world/datasets/transformers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .mm_transforms import RandomLoadText, LoadText
from .mm_mix_img_transforms import (
    MultiModalMosaic, MultiModalMosaic9, YOLOv5MultiModalMixUp,
    YOLOXMultiModalMixUp)

__all__ = ['RandomLoadText', 'LoadText', 'MultiModalMosaic',
           'MultiModalMosaic9', 'YOLOv5MultiModalMixUp',
           'YOLOXMultiModalMixUp']
--------------------------------------------------------------------------------
/yolo_world/datasets/utils.py:
--------------------------------------------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence

import torch
from mmengine.dataset import COLLATE_FUNCTIONS


@COLLATE_FUNCTIONS.register_module()
def yolow_collate(data_batch: Sequence,
                  use_ms_training: bool = False) -> dict:
    """Rewrite collate_fn to get faster training speed.

    Args:
        data_batch (Sequence): Batch of data.
        use_ms_training (bool): Whether to use multi-scale training.
    """
    batch_imgs = []
    batch_bboxes_labels = []
    batch_masks = []
    for i in range(len(data_batch)):
        datasamples = data_batch[i]['data_samples']
        inputs = data_batch[i]['inputs']
        batch_imgs.append(inputs)

        gt_bboxes = datasamples.gt_instances.bboxes.tensor
        gt_labels = datasamples.gt_instances.labels
        if 'masks' in datasamples.gt_instances:
            masks = datasamples.gt_instances.masks.to(
                dtype=torch.bool, device=gt_bboxes.device)
            batch_masks.append(masks)
        batch_idx = gt_labels.new_full((len(gt_labels), 1), i)
        bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes),
                                  dim=1)
        batch_bboxes_labels.append(bboxes_labels)

    collated_results = {
        'data_samples': {
            'bboxes_labels': torch.cat(batch_bboxes_labels, 0)
        }
    }
    if len(batch_masks) > 0:
        collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0)

    if use_ms_training:
        collated_results['inputs'] = batch_imgs
    else:
        collated_results['inputs'] = torch.stack(batch_imgs, 0)

    if hasattr(data_batch[0]['data_samples'], 'texts'):
        batch_texts = [meta['data_samples'].texts for meta in data_batch]
        collated_results['data_samples']['texts'] = batch_texts

    if hasattr(data_batch[0]['data_samples'], 'is_detection'):
        # detection flag
        batch_detection = [meta['data_samples'].is_detection
                           for meta in data_batch]
        collated_results['data_samples']['is_detection'] = torch.tensor(
            batch_detection)

    return collated_results
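For orientation, a minimal sketch (plain tensors, no mmengine data structures, made-up values) of the bboxes_labels layout built above: each row is [batch_idx, class_label, x1, y1, x2, y2], so boxes from the whole batch live in one flat tensor.

    import torch

    # two boxes from sample i=1 of a batch
    i = 1
    gt_bboxes = torch.tensor([[10., 20., 50., 80.],
                              [30., 40., 60., 90.]])
    gt_labels = torch.tensor([3, 7])

    batch_idx = gt_labels.new_full((len(gt_labels), 1), i)
    bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes), dim=1)
    print(bboxes_labels)
    # tensor([[ 1.,  3., 10., 20., 50., 80.],
    #         [ 1.,  7., 30., 40., 60., 90.]])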
--------------------------------------------------------------------------------
/yolo_world/datasets/yolov5_lvis.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from mmdet.datasets import LVISV1Dataset

from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS


@DATASETS.register_module()
class YOLOv5LVISV1Dataset(BatchShapePolicyDataset, LVISV1Dataset):
    """Dataset for YOLOv5 LVIS Dataset.

    We only add `BatchShapePolicy` function compared with LVISV1Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
--------------------------------------------------------------------------------
/yolo_world/datasets/yolov5_obj365v1.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from mmdet.datasets import Objects365V1Dataset

from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS


@DATASETS.register_module()
class YOLOv5Objects365V1Dataset(BatchShapePolicyDataset, Objects365V1Dataset):
    """Dataset for YOLOv5 Objects365 V1 Dataset.

    We only add `BatchShapePolicy` function compared with Objects365V1Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
--------------------------------------------------------------------------------
/yolo_world/datasets/yolov5_obj365v2.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from mmdet.datasets import Objects365V2Dataset

from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS


@DATASETS.register_module()
class YOLOv5Objects365V2Dataset(BatchShapePolicyDataset, Objects365V2Dataset):
    """Dataset for YOLOv5 Objects365 V2 Dataset.

    We only add `BatchShapePolicy` function compared with Objects365V2Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
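A minimal sketch (hypothetical config values; the annotation path mirrors data/coco/lvis/ from this repo's layout) of how these registered dataset classes are built from a config dict through the mmyolo registry:

    from mmyolo.registry import DATASETS

    dataset = DATASETS.build(dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        pipeline=[],     # real configs attach the full training pipeline here
        lazy_init=True,  # skip annotation loading for this sketch
    ))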
--------------------------------------------------------------------------------
/yolo_world/engine/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .optimizers import *  # noqa
--------------------------------------------------------------------------------
/yolo_world/engine/optimizers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .yolow_v5_optim_constructor import YOLOWv5OptimizerConstructor

__all__ = ['YOLOWv5OptimizerConstructor']
--------------------------------------------------------------------------------
/yolo_world/models/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .backbones import *  # noqa
from .layers import *  # noqa
from .detectors import *  # noqa
from .losses import *  # noqa
from .data_preprocessors import *  # noqa
from .dense_heads import *  # noqa
from .necks import *  # noqa
from .assigner import *  # noqa
--------------------------------------------------------------------------------
/yolo_world/models/assigner/__init__.py:
--------------------------------------------------------------------------------
from .task_aligned_assigner import YOLOWorldSegAssigner

__all__ = ['YOLOWorldSegAssigner']
--------------------------------------------------------------------------------
/yolo_world/models/backbones/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
# YOLO Multi-Modal Backbone (Vision Language)
# Vision: YOLOv8 CSPDarknet
# Language: CLIP Text Encoder (12-layer transformer)
from .mm_backbone import (
    MultiModalYOLOBackbone,
    HuggingVisionBackbone,
    HuggingCLIPLanguageBackbone,
    PseudoLanguageBackbone)

__all__ = [
    'MultiModalYOLOBackbone',
    'HuggingVisionBackbone',
    'HuggingCLIPLanguageBackbone',
    'PseudoLanguageBackbone'
]
--------------------------------------------------------------------------------
/yolo_world/models/data_preprocessors/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .data_preprocessor import YOLOWDetDataPreprocessor

__all__ = ['YOLOWDetDataPreprocessor']
--------------------------------------------------------------------------------
/yolo_world/models/dense_heads/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .yolo_world_head import (YOLOWorldHead, YOLOWorldHeadModule,
                              RepYOLOWorldHeadModule, RepYOLOWorldHeadModuleV1)
from .yolo_world_seg_head import YOLOWorldSegHead, YOLOWorldSegHeadModule
from .dosod_head import (DOSODYOLOv8Head,
                         DOSODYOLOv8dHeadModule,
                         DOSODContrastiveHead,
                         RepDOSODYOLOv8Head,
                         RepDOSODYOLOv8dHeadModuleDRobotics,
                         RepDOSODYOLOv8dHeadModule,
                         RepDOSODContrastiveHead)

__all__ = [
    'YOLOWorldHead', 'YOLOWorldHeadModule', 'YOLOWorldSegHead', 'RepYOLOWorldHeadModuleV1',
    'YOLOWorldSegHeadModule', 'RepYOLOWorldHeadModule',
    'DOSODYOLOv8Head', 'DOSODYOLOv8dHeadModule', 'DOSODContrastiveHead',
    'RepDOSODYOLOv8dHeadModuleDRobotics', 'RepDOSODYOLOv8Head', 'RepDOSODYOLOv8dHeadModule', 'RepDOSODContrastiveHead',
]
--------------------------------------------------------------------------------
/yolo_world/models/detectors/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .yolo_world import YOLOWorldDetector, SimpleYOLOWorldDetector
from .dosod import DOSODDetector, RepDOSODDetector

__all__ = ['YOLOWorldDetector', 'SimpleYOLOWorldDetector',
           'DOSODDetector', 'RepDOSODDetector']
--------------------------------------------------------------------------------
/yolo_world/models/layers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
# Basic brick modules for PAFPN based on CSPLayers

from .yolo_bricks import (
    CSPLayerWithTwoConv,
    MaxSigmoidAttnBlock,
    MaxSigmoidCSPLayerWithTwoConv,
    ImagePoolingAttentionModule,
    RepConvMaxSigmoidCSPLayerWithTwoConv,
    RepMaxSigmoidCSPLayerWithTwoConv
)

__all__ = ['CSPLayerWithTwoConv',
           'MaxSigmoidAttnBlock',
           'MaxSigmoidCSPLayerWithTwoConv',
           'RepConvMaxSigmoidCSPLayerWithTwoConv',
           'RepMaxSigmoidCSPLayerWithTwoConv',
           'ImagePoolingAttentionModule']
--------------------------------------------------------------------------------
/yolo_world/models/losses/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .dynamic_loss import CoVMSELoss

__all__ = ['CoVMSELoss']
--------------------------------------------------------------------------------
/yolo_world/models/losses/dynamic_loss.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor
from mmdet.models.losses.mse_loss import mse_loss
from mmyolo.registry import MODELS


@MODELS.register_module()
class CoVMSELoss(nn.Module):

    def __init__(self,
                 dim: int = 0,
                 reduction: str = 'mean',
                 loss_weight: float = 1.0,
                 eps: float = 1e-6) -> None:
        super().__init__()
        self.dim = dim
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.eps = eps

    def forward(self,
                pred: Tensor,
                weight: Optional[Tensor] = None,
                avg_factor: Optional[int] = None,
                reduction_override: Optional[str] = None) -> Tensor:
        """Forward function of loss."""
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        cov = pred.std(self.dim) / pred.mean(self.dim).clamp(min=self.eps)
        target = torch.zeros_like(cov)
        loss = self.loss_weight * mse_loss(
            cov, target, weight, reduction=reduction, avg_factor=avg_factor)
        return loss
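A quick numeric sketch of what CoVMSELoss penalizes: the coefficient of variation (std / mean) of pred along dim, regressed toward zero with MSE (defaults: dim=0, reduction='mean', loss_weight=1.0).

    import torch

    pred = torch.tensor([[1.0, 2.0],
                         [3.0, 4.0]])
    cov = pred.std(0) / pred.mean(0).clamp(min=1e-6)  # dim=0, as in the default
    # std -> [1.4142, 1.4142]; mean -> [2.0, 3.0]; cov -> [0.7071, 0.4714]
    loss = (cov ** 2).mean()  # == mse_loss(cov, zeros) ~= 0.3611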
--------------------------------------------------------------------------------
/yolo_world/models/necks/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .yolo_world_pafpn import YOLOWorldPAFPN, YOLOWorldDualPAFPN

__all__ = ['YOLOWorldPAFPN', 'YOLOWorldDualPAFPN']
--------------------------------------------------------------------------------
/yolo_world/version.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from yolo_world import __version__

def __version_info() -> tuple:
    """Parse a version string into a tuple.

    Returns:
        tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
            (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
    """
    version_info = []
    for x in __version__.split('.'):
        if x.isdigit():
            version_info.append(int(x))
        elif x.find('rc') != -1:
            patch_version = x.split('rc')
            version_info.append(int(patch_version[0]))
            version_info.append(f'rc{patch_version[1]}')
    return tuple(version_info)


version_info = __version_info()

__all__ = ['__version__', 'version_info']
--------------------------------------------------------------------------------
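A minimal usage sketch of the resulting tuple (the actual value depends on the installed package version):

    from yolo_world.version import version_info

    # e.g. (1, 3, 0) for "1.3.0", or (2, 0, 0, 'rc1') for "2.0.0rc1";
    # tuples compare lexicographically, which is handy for feature gates:
    if version_info >= (1, 3, 0):
        pass  # rely on behavior introduced in 1.3.0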