├── .dockerignore ├── .gitattributes ├── .gitignore ├── Dockerfile ├── LICENSE ├── README-YW.md ├── README.md ├── assets ├── DOSOD_LOGO.png ├── dosod-l-4090.md ├── dosod-m-4090.md ├── dosod-s-4090.md ├── dosod_architecture.png ├── finetune_yoloworld.png ├── render_dosod.jpeg ├── reparameterize.png ├── yolo-worldv1-l-4090.md ├── yolo-worldv1-m-4090.md ├── yolo-worldv1-s-4090.md ├── yolo-worldv2-l-4090.md ├── yolo-worldv2-m-4090.md ├── yolo-worldv2-s-4090.md ├── yolo_arch.png └── yolo_logo.png ├── configs ├── dosod │ ├── dosod_mlp0x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp1x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp2x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp3x_l_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp3x_m_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp3x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp4x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── dosod_mlp5x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── rep_dosod_mlp3x_l_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── rep_dosod_mlp3x_l_d-robotics.py │ ├── rep_dosod_mlp3x_m_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── rep_dosod_mlp3x_m_d-robotics.py │ ├── rep_dosod_mlp3x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── rep_dosod_mlp3x_s_d-robotics.py │ ├── zero-shot-on-coco_dosod_mlp3x_l.py │ ├── zero-shot-on-coco_dosod_mlp3x_m.py │ └── zero-shot-on-coco_dosod_mlp3x_s.py ├── finetune_coco │ ├── README.md │ ├── yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py │ ├── yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py │ ├── yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ ├── yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py │ └── yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ├── pretrain │ ├── rep_yolo_world_v2_l.py │ ├── rep_yolo_world_v2_m.py │ ├── rep_yolo_world_v2_s.py │ ├── yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py │ ├── yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py │ ├── yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py │ ├── yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py │ ├── yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py │ ├── yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ 
├── yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py │ ├── yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py │ ├── yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ └── yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py ├── pretrain_v1 │ ├── README.md │ ├── yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py │ ├── yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ ├── yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py │ └── yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py ├── prompt_tuning_coco │ ├── READEME.md │ ├── yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py │ ├── yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_prompt_tuning_coco.py │ └── yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_all_finetuning_coco.py └── segmentation │ ├── README.md │ ├── yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py │ ├── yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py │ ├── yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py │ ├── yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py │ ├── yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py │ └── yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py ├── data ├── coco │ └── lvis │ │ └── lvis_v1_minival_inserted_image_name.json └── texts │ ├── coco_class_texts.json │ ├── lvis_v1_base_class_captions.json │ ├── lvis_v1_class_texts.json │ └── obj365v1_class_texts.json ├── demo ├── README.md ├── gradio_demo.py ├── image_demo.py ├── inference.ipynb ├── sample_images │ ├── bus.jpg │ └── zidane.jpg ├── simple_demo.py └── video_demo.py ├── deploy ├── __init__.py ├── easydeploy │ ├── README.md │ ├── README_zh-CN.md │ ├── backbone │ │ ├── __init__.py │ │ ├── common.py │ │ └── focus.py │ ├── bbox_code │ │ ├── __init__.py │ │ └── bbox_coder.py │ ├── deepstream │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── README_zh-CN.md │ │ ├── coco_labels.txt │ │ ├── configs │ │ │ ├── config_infer_rtmdet.txt │ │ │ ├── config_infer_yolov5.txt │ │ │ └── config_infer_yolov8.txt │ │ ├── custom_mmyolo_bbox_parser │ │ │ └── nvdsparsebbox_mmyolo.cpp │ │ └── deepstream_app_config.txt │ ├── docs │ │ └── model_convert.md │ ├── examples │ │ ├── config.py │ │ ├── cv2_nms.py │ │ ├── main_onnxruntime.py │ │ ├── numpy_coder.py │ │ ├── preprocess.py │ │ └── requirements.txt │ ├── model │ │ ├── __init__.py │ │ ├── backend.py │ │ ├── backendwrapper.py │ │ └── model.py │ ├── nms │ │ ├── __init__.py │ │ ├── ort_nms.py │ │ └── trt_nms.py │ ├── onnx_demo.py │ └── tools │ │ ├── build_engine.py │ │ ├── export_onnx.py │ │ └── image-demo.py ├── export_onnx.py ├── onnx_demo.py └── tflite_demo.py ├── docs ├── data.md ├── deploy.md ├── faq.md ├── finetuning.md ├── installation.md ├── prompt_yolo_world.md ├── reparameterize.md ├── tflite_deploy.md └── updates.md ├── pyproject.toml ├── requirements ├── basic_requirements.txt ├── demo_requirements.txt └── onnx_requirements.txt ├── third_party └── mmyolo │ └── configs │ ├── _base_ │ ├── default_runtime.py │ ├── det_p5_tta.py │ └── pose │ │ └── coco.py │ ├── deploy │ ├── base_dynamic.py │ ├── base_static.py │ ├── detection_onnxruntime_dynamic.py │ ├── 
detection_onnxruntime_static.py │ ├── detection_rknn-fp16_static-320x320.py │ ├── detection_rknn-int8_static-320x320.py │ ├── detection_tensorrt-fp16_dynamic-192x192-960x960.py │ ├── detection_tensorrt-fp16_dynamic-64x64-1344x1344.py │ ├── detection_tensorrt-fp16_static-640x640.py │ ├── detection_tensorrt-int8_dynamic-192x192-960x960.py │ ├── detection_tensorrt-int8_static-640x640.py │ ├── detection_tensorrt_dynamic-192x192-960x960.py │ ├── detection_tensorrt_static-640x640.py │ └── model │ │ ├── yolov5_s-static.py │ │ └── yolov6_s-static.py │ ├── ppyoloe │ ├── README.md │ ├── metafile.yml │ ├── ppyoloe_l_fast_8xb20-300e_coco.py │ ├── ppyoloe_m_fast_8xb28-300e_coco.py │ ├── ppyoloe_plus_l_fast_8xb8-80e_coco.py │ ├── ppyoloe_plus_m_fast_8xb8-80e_coco.py │ ├── ppyoloe_plus_s_fast_1xb12-40e_cat.py │ ├── ppyoloe_plus_s_fast_8xb8-80e_coco.py │ ├── ppyoloe_plus_x_fast_8xb8-80e_coco.py │ ├── ppyoloe_s_fast_8xb32-300e_coco.py │ ├── ppyoloe_s_fast_8xb32-400e_coco.py │ └── ppyoloe_x_fast_8xb16-300e_coco.py │ ├── razor │ └── subnets │ │ ├── README.md │ │ ├── rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py │ │ ├── yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py │ │ └── yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet │ ├── README.md │ ├── cspnext_imagenet_pretrain │ │ ├── README.md │ │ ├── cspnext-s_8xb256-rsb-a1-600e_in1k.py │ │ └── cspnext-tiny_8xb256-rsb-a1-600e_in1k.py │ ├── distillation │ │ ├── README.md │ │ ├── kd_l_rtmdet_x_neck_300e_coco.py │ │ ├── kd_m_rtmdet_l_neck_300e_coco.py │ │ ├── kd_s_rtmdet_m_neck_300e_coco.py │ │ └── kd_tiny_rtmdet_s_neck_300e_coco.py │ ├── metafile.yml │ ├── rotated │ │ ├── rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py │ │ ├── rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py │ │ ├── rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py │ │ ├── rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py │ │ ├── rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py │ │ ├── rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py │ │ ├── rtmdet-r_s_fast_1xb8-36e_dota-ms.py │ │ ├── rtmdet-r_s_fast_1xb8-36e_dota.py │ │ ├── rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py │ │ └── rtmdet-r_tiny_fast_1xb8-36e_dota.py │ ├── rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet_l_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet_m_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet_s_syncbn_fast_8xb32-300e_coco.py │ ├── rtmdet_tiny_fast_1xb12-40e_cat.py │ ├── rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py │ └── rtmdet_x_syncbn_fast_8xb32-300e_coco.py │ ├── yolov5 │ ├── README.md │ ├── crowdhuman │ │ ├── yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py │ │ └── yolov5_s-v61_fast_8xb16-300e_crowdhuman.py │ ├── ins_seg │ │ ├── yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py │ │ ├── yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py │ │ ├── yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py │ │ ├── yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py │ │ ├── yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py │ │ ├── yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py │ │ └── yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py │ ├── mask_refine │ │ ├── yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ │ └── yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py │ ├── metafile.yml │ ├── voc │ │ ├── yolov5_l-v61_fast_1xb32-50e_voc.py │ │ ├── yolov5_m-v61_fast_1xb64-50e_voc.py │ │ 
├── yolov5_n-v61_fast_1xb64-50e_voc.py │ │ ├── yolov5_s-v61_fast_1xb64-50e_voc.py │ │ └── yolov5_x-v61_fast_1xb32-50e_voc.py │ ├── yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_s-v61_fast_1xb12-40e_608x352_cat.py │ ├── yolov5_s-v61_fast_1xb12-40e_cat.py │ ├── yolov5_s-v61_fast_1xb12-ms-40e_cat.py │ ├── yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py │ ├── yolov5_s-v61_syncbn_8xb16-300e_coco.py │ ├── yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py │ ├── yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py │ ├── yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py │ └── yolov5u │ │ ├── yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_l_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_m_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_n_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_s_syncbn_fast_8xb16-300e_coco.py │ │ ├── yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py │ │ └── yolov5u_x_syncbn_fast_8xb16-300e_coco.py │ ├── yolov6 │ ├── README.md │ ├── metafile.yml │ ├── yolov6_l_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_m_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_n_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_n_syncbn_fast_8xb32-400e_coco.py │ ├── yolov6_s_fast_1xb12-40e_cat.py │ ├── yolov6_s_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_s_syncbn_fast_8xb32-400e_coco.py │ ├── yolov6_t_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_t_syncbn_fast_8xb32-400e_coco.py │ ├── yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py │ ├── yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py │ └── yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py │ ├── yolov7 │ ├── README.md │ ├── metafile.yml │ ├── yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_l_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_tiny_fast_1xb12-40e_cat.py │ ├── yolov7_tiny_syncbn_fast_8x16b-300e_coco.py │ ├── yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py │ └── yolov7_x_syncbn_fast_8x16b-300e_coco.py │ ├── yolov8 │ ├── README.md │ ├── metafile.yml │ ├── yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_l_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_m_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_n_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_s_fast_1xb12-40e_cat.py │ ├── yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_s_syncbn_fast_8xb16-500e_coco.py │ ├── yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py │ └── yolov8_x_syncbn_fast_8xb16-500e_coco.py │ └── yolox │ ├── README.md │ ├── metafile.yml │ ├── pose │ ├── yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py │ └── yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_l_fast_8xb8-300e_coco.py │ ├── 
yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_m_fast_8xb8-300e_coco.py │ ├── yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_nano_fast_8xb8-300e_coco.py │ ├── yolox_p5_tta.py │ ├── yolox_s_fast_1xb12-40e-rtmdet-hyp_cat.py │ ├── yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_s_fast_8xb8-300e_coco.py │ ├── yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py │ ├── yolox_tiny_fast_8xb8-300e_coco.py │ └── yolox_x_fast_8xb8-300e_coco.py ├── tools ├── count_num_parameters.py ├── dist_test.sh ├── dist_train.sh ├── evaluate_latency.sh ├── generate_image_prompts.py ├── generate_text_prompts_dosod.py ├── generate_text_prompts_yoloworld.py ├── generate_vocabulary_json.py ├── reparameterize_dosod.py ├── reparameterize_yoloworld.py ├── test.py └── train.py └── yolo_world ├── __init__.py ├── datasets ├── __init__.py ├── mm_dataset.py ├── transformers │ ├── __init__.py │ ├── mm_mix_img_transforms.py │ └── mm_transforms.py ├── utils.py ├── yolov5_cc3m_grounding.py ├── yolov5_lvis.py ├── yolov5_mixed_grounding.py ├── yolov5_obj365v1.py ├── yolov5_obj365v2.py └── yolov5_v3det.py ├── engine ├── __init__.py └── optimizers │ ├── __init__.py │ └── yolow_v5_optim_constructor.py ├── models ├── __init__.py ├── assigner │ ├── __init__.py │ └── task_aligned_assigner.py ├── backbones │ ├── __init__.py │ └── mm_backbone.py ├── data_preprocessors │ ├── __init__.py │ └── data_preprocessor.py ├── dense_heads │ ├── __init__.py │ ├── dosod_head.py │ ├── yolo_world_head.py │ └── yolo_world_seg_head.py ├── detectors │ ├── __init__.py │ ├── dosod.py │ └── yolo_world.py ├── layers │ ├── __init__.py │ └── yolo_bricks.py ├── losses │ ├── __init__.py │ └── dynamic_loss.py └── necks │ ├── __init__.py │ └── yolo_world_pafpn.py └── version.py /.dockerignore: -------------------------------------------------------------------------------- 1 | docs 2 | Dockerfile -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Basic .gitattributes for a python repo. 2 | 3 | # Source files 4 | # ============ 5 | *.pxd text diff=python 6 | *.py text diff=python 7 | *.py3 text diff=python 8 | *.pyw text diff=python 9 | *.pyx text diff=python 10 | *.pyz text diff=python 11 | *.pyi text diff=python 12 | 13 | # Binary files 14 | # ============ 15 | *.db binary 16 | *.p binary 17 | *.pkl binary 18 | *.pickle binary 19 | *.pyc binary export-ignore 20 | *.pyo binary export-ignore 21 | *.pyd binary 22 | 23 | # Jupyter notebook 24 | *.ipynb text eol=lf 25 | 26 | # Others 27 | * text=auto 28 | *.txt text 29 | *.sh text eol=lf 30 | 31 | # Note: .db, .p, and .pkl files are associated 32 | # with the python modules ``pickle``, ``dbm.*``, 33 | # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 34 | # (among others). 
35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/en/_build/ 68 | docs/zh_cn/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | # data/ 107 | # data 108 | .vscode 109 | .idea 110 | .DS_Store 111 | 112 | # custom 113 | *.pkl 114 | *.pkl.json 115 | *.log.json 116 | docs/modelzoo_statistics.md 117 | mmdet/.mim 118 | work_dirs 119 | 120 | # Pytorch 121 | *.pth 122 | *.py~ 123 | *.sh~ 124 | 125 | # venus 126 | venus_run.sh 127 | 128 | /local_test 129 | /dosod_models 130 | data/coco/annotations 131 | data/coco/val2017 132 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 2 | 3 | ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" 4 | ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth" 5 | 6 | ENV FORCE_CUDA="1" 7 | ENV MMCV_WITH_OPS=1 8 | 9 | RUN apt-get update && apt-get install -y --no-install-recommends \ 10 | python3-pip \ 11 | libgl1-mesa-glx \ 12 | libsm6 \ 13 | libxext6 \ 14 | libxrender-dev \ 15 | libglib2.0-0 \ 16 | git \ 17 | python3-dev \ 18 | python3-wheel 19 | 20 | RUN pip3 install --upgrade pip \ 21 | && pip3 install \ 22 | gradio \ 23 | opencv-python \ 24 | supervision \ 25 | mmengine \ 26 | setuptools \ 27 | openmim \ 28 | && mim install mmcv==2.0.0 \ 29 | && pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cu118 \ 30 | wheel \ 31 | torch \ 32 | torchvision \ 33 | torchaudio 34 | 35 | COPY . /yolo 36 | WORKDIR /yolo 37 | 38 | RUN pip3 install -e . 
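# Example usage (mirrors demo/README.md; assumes the gradio demo serves on port 8080):
#   docker build --build-arg="MODEL=<config>.py" --build-arg="WEIGHT=<weights>.pth" -t yolo_demo .
#   docker run --runtime nvidia -p 8080:8080 yolo_demo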
39 | 40 | RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates && mkdir -p weights && curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT 41 | 42 | ENV MODEL=${MODEL} WEIGHT=${WEIGHT} 43 | ENTRYPOINT python3 demo/gradio_demo.py configs/pretrain/${MODEL} weights/${WEIGHT} -------------------------------------------------------------------------------- /assets/DOSOD_LOGO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/DOSOD_LOGO.png -------------------------------------------------------------------------------- /assets/dosod-l-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-18:04:40] [I] === Performance summary === 3 | [09/02/2024-18:04:40] [I] Throughput: 630.463 qps 4 | [09/02/2024-18:04:40] [I] Latency: min = 1.8894 ms, max = 4.01685 ms, mean = 1.8984 ms, median = 1.896 ms, percentile(90%) = 1.90039 ms, percentile(95%) = 1.90186 ms, percentile(99%) = 1.90576 ms 5 | [09/02/2024-18:04:40] [I] Enqueue Time: min = 0.90802 ms, max = 5.06421 ms, mean = 1.00555 ms, median = 0.993896 ms, percentile(90%) = 1.01172 ms, percentile(95%) = 1.01953 ms, percentile(99%) = 1.15723 ms 6 | [09/02/2024-18:04:40] [I] H2D Latency: min = 0.200684 ms, max = 0.240601 ms, mean = 0.201544 ms, median = 0.201355 ms, percentile(90%) = 0.202026 ms, percentile(95%) = 0.202271 ms, percentile(99%) = 0.20459 ms 7 | [09/02/2024-18:04:40] [I] GPU Compute Time: min = 1.57288 ms, max = 3.65894 ms, mean = 1.58208 ms, median = 1.58002 ms, percentile(90%) = 1.58398 ms, percentile(95%) = 1.58521 ms, percentile(99%) = 1.58813 ms 8 | [09/02/2024-18:04:40] [I] D2H Latency: min = 0.112549 ms, max = 0.167236 ms, mean = 0.114774 ms, median = 0.114502 ms, percentile(90%) = 0.115967 ms, percentile(95%) = 0.116211 ms, percentile(99%) = 0.11792 ms 9 | [09/02/2024-18:04:40] [I] Total Host Walltime: 3.17227 s 10 | [09/02/2024-18:04:40] [I] Total GPU Compute Time: 3.16416 s 11 | [09/02/2024-18:04:40] [W] * GPU compute time is unstable, with coefficient of variance = 3.28975%. 12 | [09/02/2024-18:04:40] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 13 | [09/02/2024-18:04:40] [I] Explanations of the performance metrics are printed in the verbose logs.
14 | ``` -------------------------------------------------------------------------------- /assets/dosod-m-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:57:01] [I] === Performance summary === 3 | [09/02/2024-17:57:01] [I] Throughput: 919.495 qps 4 | [09/02/2024-17:57:01] [I] Latency: min = 1.39429 ms, max = 2.81079 ms, mean = 1.40298 ms, median = 1.40112 ms, percentile(90%) = 1.40356 ms, percentile(95%) = 1.40424 ms, percentile(99%) = 1.40601 ms 5 | [09/02/2024-17:57:01] [I] Enqueue Time: min = 0.626221 ms, max = 3.35059 ms, mean = 0.659003 ms, median = 0.644882 ms, percentile(90%) = 0.675415 ms, percentile(95%) = 0.765869 ms, percentile(99%) = 0.790466 ms 6 | [09/02/2024-17:57:01] [I] H2D Latency: min = 0.201172 ms, max = 0.26123 ms, mean = 0.201796 ms, median = 0.20166 ms, percentile(90%) = 0.201904 ms, percentile(95%) = 0.202148 ms, percentile(99%) = 0.20459 ms 7 | [09/02/2024-17:57:01] [I] GPU Compute Time: min = 1.07715 ms, max = 2.45239 ms, mean = 1.08407 ms, median = 1.0824 ms, percentile(90%) = 1.08441 ms, percentile(95%) = 1.08447 ms, percentile(99%) = 1.08545 ms 8 | [09/02/2024-17:57:01] [I] D2H Latency: min = 0.114746 ms, max = 0.156982 ms, mean = 0.117125 ms, median = 0.117126 ms, percentile(90%) = 0.118286 ms, percentile(95%) = 0.11853 ms, percentile(99%) = 0.119141 ms 9 | [09/02/2024-17:57:01] [I] Total Host Walltime: 3.00382 s 10 | [09/02/2024-17:57:01] [I] Total GPU Compute Time: 2.9942 s 11 | [09/02/2024-17:57:01] [W] * GPU compute time is unstable, with coefficient of variance = 3.89596%. 12 | [09/02/2024-17:57:01] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 13 | [09/02/2024-17:57:01] [I] Explanations of the performance metrics are printed in the verbose logs. 14 | ``` -------------------------------------------------------------------------------- /assets/dosod-s-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:47:04] [I] === Performance summary === 3 | [09/02/2024-17:47:04] [I] Throughput: 1574.34 qps 4 | [09/02/2024-17:47:04] [I] Latency: min = 0.94249 ms, max = 1.51343 ms, mean = 0.949318 ms, median = 0.948975 ms, percentile(90%) = 0.951172 ms, percentile(95%) = 0.951843 ms, percentile(99%) = 0.953735 ms 5 | [09/02/2024-17:47:04] [I] Enqueue Time: min = 0.539093 ms, max = 1.61792 ms, mean = 0.582978 ms, median = 0.587158 ms, percentile(90%) = 0.598022 ms, percentile(95%) = 0.601318 ms, percentile(99%) = 0.614258 ms 6 | [09/02/2024-17:47:04] [I] H2D Latency: min = 0.201904 ms, max = 0.237549 ms, mean = 0.202861 ms, median = 0.202698 ms, percentile(90%) = 0.203125 ms, percentile(95%) = 0.203369 ms, percentile(99%) = 0.205811 ms 7 | [09/02/2024-17:47:04] [I] GPU Compute Time: min = 0.626678 ms, max = 1.15527 ms, mean = 0.632278 ms, median = 0.631836 ms, percentile(90%) = 0.633789 ms, percentile(95%) = 0.633911 ms, percentile(99%) = 0.634888 ms 8 | [09/02/2024-17:47:04] [I] D2H Latency: min = 0.112061 ms, max = 0.156982 ms, mean = 0.114173 ms, median = 0.113556 ms, percentile(90%) = 0.115723 ms, percentile(95%) = 0.115967 ms, percentile(99%) = 0.116699 ms 9 | [09/02/2024-17:47:04] [I] Total Host Walltime: 3.0019 s 10 | [09/02/2024-17:47:04] [I] Total GPU Compute Time: 2.98815 s 11 | [09/02/2024-17:47:04] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 
12 | [09/02/2024-17:47:04] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 13 | [09/02/2024-17:47:04] [W] * GPU compute time is unstable, with coefficient of variance = 1.22015%. 14 | [09/02/2024-17:47:04] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/02/2024-17:47:04] [I] Explanations of the performance metrics are printed in the verbose logs. 16 | ``` -------------------------------------------------------------------------------- /assets/dosod_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/dosod_architecture.png -------------------------------------------------------------------------------- /assets/finetune_yoloworld.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/finetune_yoloworld.png -------------------------------------------------------------------------------- /assets/render_dosod.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/render_dosod.jpeg -------------------------------------------------------------------------------- /assets/reparameterize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/reparameterize.png -------------------------------------------------------------------------------- /assets/yolo-worldv1-l-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/07/2024-21:35:18] [I] === Performance summary === 3 | [09/07/2024-21:35:18] [I] Throughput: 481.898 qps 4 | [09/07/2024-21:35:18] [I] Latency: min = 2.21338 ms, max = 21.5757 ms, mean = 2.36123 ms, median = 2.22137 ms, percentile(90%) = 2.2915 ms, percentile(95%) = 2.64233 ms, percentile(99%) = 5.23608 ms 5 | [09/07/2024-21:35:18] [I] Enqueue Time: min = 1.29114 ms, max = 21.397 ms, mean = 1.80525 ms, median = 1.59399 ms, percentile(90%) = 2.1189 ms, percentile(95%) = 2.59033 ms, percentile(99%) = 5.50952 ms 6 | [09/07/2024-21:35:18] [I] H2D Latency: min = 0.200684 ms, max = 0.259521 ms, mean = 0.206849 ms, median = 0.202393 ms, percentile(90%) = 0.231079 ms, percentile(95%) = 0.235962 ms, percentile(99%) = 0.244141 ms 7 | [09/07/2024-21:35:18] [I] GPU Compute Time: min = 1.89233 ms, max = 21.2131 ms, mean = 2.02429 ms, median = 1.89844 ms, percentile(90%) = 1.90262 ms, percentile(95%) = 2.25281 ms, percentile(99%) = 4.84247 ms 8 | [09/07/2024-21:35:18] [I] D2H Latency: min = 0.116577 ms, max = 10.974 ms, mean = 0.130076 ms, median = 0.120117 ms, percentile(90%) = 0.152832 ms, percentile(95%) = 0.157898 ms, percentile(99%) = 0.166748 ms 9 | [09/07/2024-21:35:18] [I] Total Host Walltime: 4.15026 s 10 | [09/07/2024-21:35:18] [I] Total GPU Compute Time: 4.04858 s 11 | [09/07/2024-21:35:18] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 12 | [09/07/2024-21:35:18] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 
13 | [09/07/2024-21:35:18] [W] * GPU compute time is unstable, with coefficient of variance = 44.7285%. 14 | [09/07/2024-21:35:18] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/07/2024-21:35:18] [I] Explanations of the performance metrics are printed in the verbose logs. 16 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv1-m-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/07/2024-21:22:23] [I] === Performance summary === 3 | [09/07/2024-21:22:23] [I] Throughput: 687.676 qps 4 | [09/07/2024-21:22:23] [I] Latency: min = 1.68785 ms, max = 10.8534 ms, mean = 1.75215 ms, median = 1.69934 ms, percentile(90%) = 1.74597 ms, percentile(95%) = 1.78418 ms, percentile(99%) = 2.92371 ms 5 | [09/07/2024-21:22:23] [I] Enqueue Time: min = 1.00952 ms, max = 10.7031 ms, mean = 1.29381 ms, median = 1.20312 ms, percentile(90%) = 1.4812 ms, percentile(95%) = 1.64844 ms, percentile(99%) = 2.86414 ms 6 | [09/07/2024-21:22:23] [I] H2D Latency: min = 0.201416 ms, max = 0.246094 ms, mean = 0.20623 ms, median = 0.202393 ms, percentile(90%) = 0.221924 ms, percentile(95%) = 0.234863 ms, percentile(99%) = 0.24231 ms 7 | [09/07/2024-21:22:23] [I] GPU Compute Time: min = 1.37114 ms, max = 10.4899 ms, mean = 1.42454 ms, median = 1.38147 ms, percentile(90%) = 1.3855 ms, percentile(95%) = 1.3894 ms, percentile(99%) = 2.47192 ms 8 | [09/07/2024-21:22:23] [I] D2H Latency: min = 0.112793 ms, max = 4.37329 ms, mean = 0.121365 ms, median = 0.114746 ms, percentile(90%) = 0.139648 ms, percentile(95%) = 0.154541 ms, percentile(99%) = 0.166626 ms 9 | [09/07/2024-21:22:23] [I] Total Host Walltime: 3.00432 s 10 | [09/07/2024-21:22:23] [I] Total GPU Compute Time: 2.94311 s 11 | [09/07/2024-21:22:23] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 12 | [09/07/2024-21:22:23] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 13 | [09/07/2024-21:22:23] [W] * GPU compute time is unstable, with coefficient of variance = 26.399%. 14 | [09/07/2024-21:22:23] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/07/2024-21:22:23] [I] Explanations of the performance metrics are printed in the verbose logs. 
16 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv1-s-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/07/2024-21:04:10] [I] === Performance summary === 3 | [09/07/2024-21:04:10] [I] Throughput: 802.713 qps 4 | [09/07/2024-21:04:10] [I] Latency: min = 1.26953 ms, max = 6.30933 ms, mean = 1.40011 ms, median = 1.29671 ms, percentile(90%) = 1.5542 ms, percentile(95%) = 1.68701 ms, percentile(99%) = 2.52319 ms 5 | [09/07/2024-21:04:10] [I] Enqueue Time: min = 0.995972 ms, max = 6.14551 ms, mean = 1.20034 ms, median = 1.1221 ms, percentile(90%) = 1.39355 ms, percentile(95%) = 1.51538 ms, percentile(99%) = 2.34814 ms 6 | [09/07/2024-21:04:10] [I] H2D Latency: min = 0.203857 ms, max = 0.341919 ms, mean = 0.250004 ms, median = 0.242584 ms, percentile(90%) = 0.280029 ms, percentile(95%) = 0.292236 ms, percentile(99%) = 0.311523 ms 7 | [09/07/2024-21:04:10] [I] GPU Compute Time: min = 0.881592 ms, max = 5.87183 ms, mean = 0.99288 ms, median = 0.887817 ms, percentile(90%) = 1.151 ms, percentile(95%) = 1.27808 ms, percentile(99%) = 2.11157 ms 8 | [09/07/2024-21:04:10] [I] D2H Latency: min = 0.115723 ms, max = 3.60303 ms, mean = 0.157228 ms, median = 0.155029 ms, percentile(90%) = 0.166626 ms, percentile(95%) = 0.167114 ms, percentile(99%) = 0.168457 ms 9 | [09/07/2024-21:04:10] [I] Total Host Walltime: 3.00232 s 10 | [09/07/2024-21:04:10] [I] Total GPU Compute Time: 2.39284 s 11 | [09/07/2024-21:04:10] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 12 | [09/07/2024-21:04:10] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 13 | [09/07/2024-21:04:10] [W] * GPU compute time is unstable, with coefficient of variance = 31.6956%. 14 | [09/07/2024-21:04:10] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/07/2024-21:04:10] [I] Explanations of the performance metrics are printed in the verbose logs. 
16 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv2-l-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:39:48] [I] === Performance summary === 3 | [09/02/2024-17:39:48] [I] Throughput: 551.925 qps 4 | [09/02/2024-17:39:48] [I] Latency: min = 2.12134 ms, max = 3.32349 ms, mean = 2.12996 ms, median = 2.12842 ms, percentile(90%) = 2.1311 ms, percentile(95%) = 2.13202 ms, percentile(99%) = 2.13391 ms 5 | [09/02/2024-17:39:48] [I] Enqueue Time: min = 1.11572 ms, max = 4.59204 ms, mean = 1.14546 ms, median = 1.13232 ms, percentile(90%) = 1.14771 ms, percentile(95%) = 1.15222 ms, percentile(99%) = 1.18799 ms 6 | [09/02/2024-17:39:48] [I] H2D Latency: min = 0.200684 ms, max = 0.236572 ms, mean = 0.201542 ms, median = 0.201416 ms, percentile(90%) = 0.202087 ms, percentile(95%) = 0.202332 ms, percentile(99%) = 0.203857 ms 7 | [09/02/2024-17:39:48] [I] GPU Compute Time: min = 1.8002 ms, max = 2.9646 ms, mean = 1.80803 ms, median = 1.8064 ms, percentile(90%) = 1.80859 ms, percentile(95%) = 1.80945 ms, percentile(99%) = 1.8114 ms 8 | [09/02/2024-17:39:48] [I] D2H Latency: min = 0.116699 ms, max = 0.157471 ms, mean = 0.120401 ms, median = 0.120117 ms, percentile(90%) = 0.121582 ms, percentile(95%) = 0.122192 ms, percentile(99%) = 0.123047 ms 9 | [09/02/2024-17:39:48] [I] Total Host Walltime: 3.62368 s 10 | [09/02/2024-17:39:48] [I] Total GPU Compute Time: 3.61605 s 11 | [09/02/2024-17:39:48] [W] * GPU compute time is unstable, with coefficient of variance = 1.75854%. 12 | [09/02/2024-17:39:48] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 13 | [09/02/2024-17:39:48] [I] Explanations of the performance metrics are printed in the verbose logs. 14 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv2-m-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:28:49] [I] === Performance summary === 3 | [09/02/2024-17:28:49] [I] Throughput: 768.53 qps 4 | [09/02/2024-17:28:49] [I] Latency: min = 1.6106 ms, max = 3.10254 ms, mean = 1.61813 ms, median = 1.61597 ms, percentile(90%) = 1.61835 ms, percentile(95%) = 1.61905 ms, percentile(99%) = 1.62074 ms 5 | [09/02/2024-17:28:49] [I] Enqueue Time: min = 0.873291 ms, max = 3.86377 ms, mean = 0.895041 ms, median = 0.886475 ms, percentile(90%) = 0.893494 ms, percentile(95%) = 0.898438 ms, percentile(99%) = 0.942871 ms 6 | [09/02/2024-17:28:49] [I] H2D Latency: min = 0.201172 ms, max = 0.240723 ms, mean = 0.20176 ms, median = 0.20166 ms, percentile(90%) = 0.201904 ms, percentile(95%) = 0.202026 ms, percentile(99%) = 0.204102 ms 7 | [09/02/2024-17:28:49] [I] GPU Compute Time: min = 1.29126 ms, max = 2.74023 ms, mean = 1.2974 ms, median = 1.29541 ms, percentile(90%) = 1.29739 ms, percentile(95%) = 1.29834 ms, percentile(99%) = 1.29932 ms 8 | [09/02/2024-17:28:49] [I] D2H Latency: min = 0.116455 ms, max = 0.168213 ms, mean = 0.118966 ms, median = 0.118652 ms, percentile(90%) = 0.120117 ms, percentile(95%) = 0.120361 ms, percentile(99%) = 0.121094 ms 9 | [09/02/2024-17:28:49] [I] Total Host Walltime: 3.00313 s 10 | [09/02/2024-17:28:49] [I] Total GPU Compute Time: 2.99441 s 11 | [09/02/2024-17:28:49] [W] * GPU compute time is unstable, with coefficient of variance = 3.23952%. 
12 | [09/02/2024-17:28:49] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 13 | [09/02/2024-17:28:49] [I] Explanations of the performance metrics are printed in the verbose logs. 14 | ``` -------------------------------------------------------------------------------- /assets/yolo-worldv2-s-4090.md: -------------------------------------------------------------------------------- 1 | ``` 2 | [09/02/2024-17:14:34] [I] === Performance summary === 3 | [09/02/2024-17:14:34] [I] Throughput: 1099.65 qps 4 | [09/02/2024-17:14:34] [I] Latency: min = 1.12085 ms, max = 5.30969 ms, mean = 1.17602 ms, median = 1.17322 ms, percentile(90%) = 1.18164 ms, percentile(95%) = 1.18457 ms, percentile(99%) = 1.20386 ms 5 | [09/02/2024-17:14:34] [I] Enqueue Time: min = 0.697998 ms, max = 5.14844 ms, mean = 0.865324 ms, median = 0.88208 ms, percentile(90%) = 0.89624 ms, percentile(95%) = 0.901215 ms, percentile(99%) = 0.947876 ms 6 | [09/02/2024-17:14:34] [I] H2D Latency: min = 0.202148 ms, max = 0.24292 ms, mean = 0.219057 ms, median = 0.221542 ms, percentile(90%) = 0.225708 ms, percentile(95%) = 0.227783 ms, percentile(99%) = 0.236572 ms 7 | [09/02/2024-17:14:34] [I] GPU Compute Time: min = 0.803711 ms, max = 4.93372 ms, mean = 0.818735 ms, median = 0.809937 ms, percentile(90%) = 0.811035 ms, percentile(95%) = 0.811035 ms, percentile(99%) = 0.812012 ms 8 | [09/02/2024-17:14:34] [I] D2H Latency: min = 0.112061 ms, max = 0.168457 ms, mean = 0.138235 ms, median = 0.142334 ms, percentile(90%) = 0.147095 ms, percentile(95%) = 0.149506 ms, percentile(99%) = 0.15715 ms 9 | [09/02/2024-17:14:34] [I] Total Host Walltime: 3.00186 s 10 | [09/02/2024-17:14:34] [I] Total GPU Compute Time: 2.70264 s 11 | [09/02/2024-17:14:34] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized. 12 | [09/02/2024-17:14:34] [W] If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput. 13 | [09/02/2024-17:14:34] [W] * GPU compute time is unstable, with coefficient of variance = 20.4616%. 14 | [09/02/2024-17:14:34] [W] If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability. 15 | [09/02/2024-17:14:34] [I] Explanations of the performance metrics are printed in the verbose logs. 
16 | ``` -------------------------------------------------------------------------------- /assets/yolo_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/yolo_arch.png -------------------------------------------------------------------------------- /assets/yolo_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/assets/yolo_logo.png -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_l_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModule', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_l_d-robotics.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 # lvis 1202, coco 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModuleDRobotics', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_m_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModule', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_m_d-robotics.py: -------------------------------------------------------------------------------- 1 | _base_ = 
'../../third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 # lvis 1202, coco 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModuleDRobotics', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_s_100e_1x8gpus_obj365v1_goldg_train_lvis_minival.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModule', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/dosod/rep_dosod_mlp3x_s_d-robotics.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py' 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_training_classes = 80 # lvis 1202, coco 80 6 | text_channels = 512 7 | joint_space_dims = 512 8 | 9 | # model settings 10 | model = dict( 11 | type='RepDOSODDetector', 12 | data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), 13 | bbox_head=dict(type='RepDOSODYOLOv8Head', 14 | head_module=dict(type='RepDOSODYOLOv8dHeadModuleDRobotics', 15 | text_embed_dims=text_channels, 16 | joint_space_dims=joint_space_dims, 17 | num_classes=num_training_classes))) 18 | -------------------------------------------------------------------------------- /configs/pretrain/rep_yolo_world_v2_l.py: -------------------------------------------------------------------------------- 1 | _base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py') 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_classes = 80 6 | num_training_classes = 80 7 | text_channels = 512 8 | neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] 9 | neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] 10 | 11 | # model settings 12 | model = dict(type='SimpleYOLOWorldDetector', 13 | mm_neck=True, 14 | num_train_classes=num_classes, 15 | num_test_classes=num_classes, 16 | reparameterized=True, 17 | data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), 18 | backbone=dict(_delete_=True, 19 | type='MultiModalYOLOBackbone', 20 | text_model=None, 21 | image_model={{_base_.model.backbone}}, 22 | with_text_model=False), 23 | neck=dict(type='YOLOWorldPAFPN', 24 | guide_channels=num_classes, 25 | 
embed_channels=neck_embed_channels, 26 | num_heads=neck_num_heads, 27 | block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv', 28 | guide_channels=num_classes)), 29 | bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', 30 | embed_dims=text_channels, 31 | num_guide=num_classes, 32 | num_classes=num_classes)), 33 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 34 | -------------------------------------------------------------------------------- /configs/pretrain/rep_yolo_world_v2_m.py: -------------------------------------------------------------------------------- 1 | _base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py') 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_classes = 80 6 | num_training_classes = 80 7 | text_channels = 512 8 | neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] 9 | neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] 10 | 11 | # model settings 12 | model = dict(type='SimpleYOLOWorldDetector', 13 | mm_neck=True, 14 | num_train_classes=num_classes, 15 | num_test_classes=num_classes, 16 | reparameterized=True, 17 | data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), 18 | backbone=dict(_delete_=True, 19 | type='MultiModalYOLOBackbone', 20 | text_model=None, 21 | image_model={{_base_.model.backbone}}, 22 | with_text_model=False), 23 | neck=dict(type='YOLOWorldPAFPN', 24 | guide_channels=num_classes, 25 | embed_channels=neck_embed_channels, 26 | num_heads=neck_num_heads, 27 | block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv', 28 | guide_channels=num_classes)), 29 | bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', 30 | embed_dims=text_channels, 31 | num_guide=num_classes, 32 | num_classes=num_classes)), 33 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 34 | -------------------------------------------------------------------------------- /configs/pretrain/rep_yolo_world_v2_s.py: -------------------------------------------------------------------------------- 1 | _base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') 2 | custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) 3 | 4 | # hyper-parameters 5 | num_classes = 80 6 | num_training_classes = 80 7 | text_channels = 512 8 | neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] 9 | neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] 10 | 11 | # model settings 12 | model = dict(type='SimpleYOLOWorldDetector', 13 | mm_neck=True, 14 | num_train_classes=num_classes, 15 | num_test_classes=num_classes, 16 | reparameterized=True, 17 | data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), 18 | backbone=dict(_delete_=True, 19 | type='MultiModalYOLOBackbone', 20 | text_model=None, 21 | image_model={{_base_.model.backbone}}, 22 | with_text_model=False), 23 | neck=dict(type='YOLOWorldPAFPN', 24 | guide_channels=num_classes, 25 | embed_channels=neck_embed_channels, 26 | num_heads=neck_num_heads, 27 | block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv', 28 | guide_channels=num_classes)), 29 | bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', 30 | embed_dims=text_channels, 31 | num_guide=num_classes, 32 | num_classes=num_classes)), 33 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 34 | -------------------------------------------------------------------------------- 
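The `rep_*` configs above are deploy-time variants: the text branch is dropped (`text_model=None`, `with_text_model=False`) and the vocabulary is frozen to `num_classes`. A minimal sketch of the idea behind `tools/reparameterize_yoloworld.py` and `tools/reparameterize_dosod.py`, assuming the usual contrastive head where class logits are inner products between image features and L2-normalized class-text embeddings, so the embeddings can be folded into a fixed 1x1 convolution (the helper below is a hypothetical illustration, not the project's API):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def fold_text_embeds_into_conv(text_embeds: torch.Tensor) -> nn.Conv2d:
    """text_embeds: (num_classes, embed_dims) embeddings computed offline."""
    num_classes, embed_dims = text_embeds.shape
    conv = nn.Conv2d(embed_dims, num_classes, kernel_size=1, bias=False)
    with torch.no_grad():
        conv.weight.copy_(text_embeds[:, :, None, None])  # one 1x1 filter per class
    return conv

embeds = F.normalize(torch.randn(80, 512), dim=-1)  # stand-in for CLIP text features
feats = torch.randn(1, 512, 20, 20)                 # image features from the neck
logits = fold_text_embeds_into_conv(embeds)(feats)  # (1, 80, 20, 20) class-score map
```

After this folding, inference needs no text encoder, which is why the reparameterized models deploy like a plain fixed-vocabulary YOLO.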
/configs/prompt_tuning_coco/READEME.md: -------------------------------------------------------------------------------- 1 | ## Prompt Tuning for YOLO-World 2 | 3 | ### NOTE: 4 | 5 | This folder contains many experimental config files, which will be removed later! 6 | 7 | ### Experimental Results 8 | 9 | | Model | Config | AP | AP50 | AP75 | APS | APM | APL | 10 | | :---- | :----: | :--: | :--: | :---: | :-: | :-: | :-: | 11 | | YOLO-World-v2-L | Zero-shot | 45.7 | 61.6 | 49.8 | 29.9 | 50.0 | 60.8 | 12 | | [YOLO-World-v2-L](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py) | Prompt tuning | 47.9 | 64.3 | 52.5 | 31.9 | 52.6 | 61.3 | 13 | -------------------------------------------------------------------------------- /data/texts/coco_class_texts.json: -------------------------------------------------------------------------------- 1 | [["person"], ["bicycle"], ["car"], ["motorcycle"], ["airplane"], ["bus"], ["train"], ["truck"], ["boat"], ["traffic light"], ["fire hydrant"], ["stop sign"], ["parking meter"], ["bench"], ["bird"], ["cat"], ["dog"], ["horse"], ["sheep"], ["cow"], ["elephant"], ["bear"], ["zebra"], ["giraffe"], ["backpack"], ["umbrella"], ["handbag"], ["tie"], ["suitcase"], ["frisbee"], ["skis"], ["snowboard"], ["sports ball"], ["kite"], ["baseball bat"], ["baseball glove"], ["skateboard"], ["surfboard"], ["tennis racket"], ["bottle"], ["wine glass"], ["cup"], ["fork"], ["knife"], ["spoon"], ["bowl"], ["banana"], ["apple"], ["sandwich"], ["orange"], ["broccoli"], ["carrot"], ["hot dog"], ["pizza"], ["donut"], ["cake"], ["chair"], ["couch"], ["potted plant"], ["bed"], ["dining table"], ["toilet"], ["tv"], ["laptop"], ["mouse"], ["remote"], ["keyboard"], ["cell phone"], ["microwave"], ["oven"], ["toaster"], ["sink"], ["refrigerator"], ["book"], ["clock"], ["vase"], ["scissors"], ["teddy bear"], ["hair drier"], ["toothbrush"]] 2 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | ## YOLO-World Demo 2 | 3 | ### Getting Started 4 | 5 | Set `PYTHONPATH` to the `YOLO-World` root directory and run: 6 | 7 | ```bash 8 | PYTHONPATH=/xxxx/YOLO-World python demo/yyyy_demo.py 9 | # or directly 10 | PYTHONPATH=./ python demo/yyyy_demo.py 11 | ``` 12 | 13 | #### Gradio Demo 14 | 15 | We provide the [Gradio](https://www.gradio.app/) demo for local devices: 16 | 17 | ```bash 18 | pip install gradio==4.16.0 19 | python demo/gradio_demo.py path/to/config path/to/weights 20 | ``` 21 | 22 | Additionally, you can use the Dockerfile to build an image with gradio. As a prerequisite, make sure you have the respective NVIDIA drivers installed alongside [nvidia-container-runtime](https://stackoverflow.com/questions/59691207/docker-build-with-nvidia-runtime). Replace MODEL_NAME and WEIGHT_NAME with the desired values, or omit these build args to use the default values from the [Dockerfile](Dockerfile#3). 23 | 24 | ```bash 25 | docker build --build-arg="MODEL=MODEL_NAME" --build-arg="WEIGHT=WEIGHT_NAME" -t yolo_demo . 26 | docker run --runtime nvidia -p 8080:8080 yolo_demo 27 | ``` 28 | 29 | #### Image Demo 30 | 31 | We provide a simple image demo that runs inference on images and saves the visualized outputs. 32 | 33 | ```bash 34 | python demo/image_demo.py path/to/config path/to/weights image/path/directory 'person,dog,cat' --topk 100 --threshold 0.005 --output-dir demo_outputs 35 | ``` 36 | 37 | **Notes:** 38 | * The `image` can be a directory or a single image. 39 | * The `texts` argument is a comma-separated string of categories (noun phrases). A `txt` file in which each line contains one category (noun phrase) is also supported; see the sketch below. 40 | * The `topk` and `threshold` arguments control the maximum number of predictions and the confidence threshold.
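How the comma-separated `texts` argument maps to the nested-list vocabulary the demos feed the model, as a minimal sketch (the helper name is hypothetical; the trailing `[' ']` padding entry matches the usage in `demo/simple_demo.py`):

```python
def parse_texts(arg: str) -> list:
    """'person,dog,cat' -> [['person'], ['dog'], ['cat'], [' ']]"""
    if arg.endswith('.txt'):
        with open(arg) as f:
            names = [line.strip() for line in f if line.strip()]
    else:
        names = [t.strip() for t in arg.split(',')]
    return [[n] for n in names] + [[' ']]
```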
41 | 42 | 43 | #### Video Demo 44 | 45 | The `video_demo` shares its hyper-parameters with `image_demo`. 46 | 47 | ```bash 48 | python demo/video_demo.py path/to/config path/to/weights video_path 'person,dog' --out out_video_path 49 | ``` 50 | 51 | ### FAQ 52 | 53 | > 1. `Failed to custom import!` 54 | ```bash 55 | File "simple_demo.py", line 37, in <module> 56 | cfg = Config.fromfile(config_file) 57 | File "/data/miniconda3/envs/det/lib/python3.8/site-packages/mmengine/config/config.py", line 183, in fromfile 58 | raise ImportError('Failed to custom import!') from e 59 | ImportError: Failed to custom import! 60 | ``` 61 | **Solution:** 62 | 63 | ```bash 64 | PYTHONPATH=/xxxx/YOLO-World python demo/simple_demo.py 65 | ``` -------------------------------------------------------------------------------- /demo/sample_images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/demo/sample_images/bus.jpg -------------------------------------------------------------------------------- /demo/sample_images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/demo/sample_images/zidane.jpg -------------------------------------------------------------------------------- /demo/simple_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tencent Inc. All rights reserved.
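# What this script does: build a YOLO-World detector from a pretrain config and
# checkpoint, swap the first test-pipeline transform so in-memory arrays are
# accepted, then run open-vocabulary detection for the noun phrases in `texts`
# (note the trailing [' '] padding entry, matching the vocabulary format used
# throughout the demos).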
2 | import os.path as osp 3 | 4 | import cv2 5 | import torch 6 | from mmengine.config import Config 7 | from mmengine.dataset import Compose 8 | from mmdet.apis import init_detector 9 | from mmdet.utils import get_test_pipeline_cfg 10 | 11 | 12 | def inference(model, image, texts, test_pipeline, score_thr=0.3, max_dets=100): 13 | image = cv2.imread(image) 14 | image = image[:, :, [2, 1, 0]]  # BGR to RGB 15 | data_info = dict(img=image, img_id=0, texts=texts) 16 | data_info = test_pipeline(data_info) 17 | data_batch = dict(inputs=data_info['inputs'].unsqueeze(0), 18 | data_samples=[data_info['data_samples']]) 19 | with torch.no_grad(): 20 | output = model.test_step(data_batch)[0] 21 | pred_instances = output.pred_instances 22 | # score thresholding 23 | pred_instances = pred_instances[pred_instances.scores.float() > score_thr] 24 | # max detections 25 | if len(pred_instances.scores) > max_dets: 26 | indices = pred_instances.scores.float().topk(max_dets)[1] 27 | pred_instances = pred_instances[indices] 28 | 29 | pred_instances = pred_instances.cpu().numpy() 30 | boxes = pred_instances['bboxes'] 31 | labels = pred_instances['labels'] 32 | scores = pred_instances['scores'] 33 | label_texts = [texts[x][0] for x in labels] 34 | return boxes, labels, label_texts, scores 35 | 36 | 37 | if __name__ == "__main__": 38 | 39 | config_file = "configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py" 40 | checkpoint = "weights/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth" 41 | 42 | cfg = Config.fromfile(config_file) 43 | cfg.work_dir = osp.join('./work_dirs') 44 | # init model 45 | cfg.load_from = checkpoint 46 | model = init_detector(cfg, checkpoint=checkpoint, device='cuda:0') 47 | test_pipeline_cfg = get_test_pipeline_cfg(cfg=cfg) 48 | test_pipeline_cfg[0].type = 'mmdet.LoadImageFromNDArray' 49 | test_pipeline = Compose(test_pipeline_cfg) 50 | 51 | texts = [['person'], ['bus'], [' ']] 52 | image = "demo/sample_images/bus.jpg" 53 | print(f"starting to detect: {image}") 54 | results = inference(model, image, texts, test_pipeline) 55 | format_str = [ 56 | f"obj-{idx}: {box}, label-{lbl}, class-{lbl_text}, score-{score}" 57 | for idx, (box, lbl, lbl_text, score) in enumerate(zip(*results)) 58 | ] 59 | print("detecting results:") 60 | for q in format_str: 61 | print(q) 62 | -------------------------------------------------------------------------------- /deploy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/deploy/__init__.py -------------------------------------------------------------------------------- /deploy/easydeploy/README.md: -------------------------------------------------------------------------------- 1 | # MMYOLO Model Easy-Deployment 2 | 3 | ## Introduction 4 | 5 | This project makes it easy to convert your MMYOLO models to other inference backends without the need for MMDeploy, saving the time and effort of getting familiar with it. 6 | 7 | Currently we support converting to the `ONNX` and `TensorRT` formats; other inference backends such as `ncnn` will be added to this project as well.
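For orientation, the core of such a conversion is tracing the model with a dummy input via `torch.onnx.export`. A generic, minimal sketch follows (the network below is a stand-in, not this project's `export_onnx.py`; see [Model Convert](docs/model_convert.md) for the supported workflow):

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.SiLU())  # placeholder for a real detector
model.eval()
dummy = torch.randn(1, 3, 640, 640)  # NCHW input at the deployment resolution
torch.onnx.export(
    model, dummy, 'model.onnx',
    opset_version=11,
    input_names=['images'],
    output_names=['feats'],
    dynamic_axes={'images': {0: 'batch'}})  # optional: dynamic batch dimension
```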
8 | 9 | ## Supported Backends 10 | 11 | - [Model Convert](docs/model_convert.md) 12 | -------------------------------------------------------------------------------- /deploy/easydeploy/README_zh-CN.md: -------------------------------------------------------------------------------- 1 | # MMYOLO Model Conversion 2 | 3 | ## Introduction 4 | 5 | This project exists as a standalone deployment project for MMYOLO. It is intended to be decoupled from the current MMDeploy system and to support, on its own, model conversion and deployment after training, reducing users' learning and engineering costs. 6 | 7 | Conversion to the ONNX and TensorRT formats is currently supported; other inference platforms will be supported later. 8 | 9 | ## Conversion Tutorial 10 | 11 | - [Model Convert](docs/model_convert.md) 12 | -------------------------------------------------------------------------------- /deploy/easydeploy/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .common import DeployC2f 3 | from .focus import DeployFocus, GConvFocus, NcnnFocus 4 | 5 | __all__ = ['DeployFocus', 'NcnnFocus', 'GConvFocus', 'DeployC2f'] 6 | -------------------------------------------------------------------------------- /deploy/easydeploy/backbone/common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch import Tensor 4 | 5 | 6 | class DeployC2f(nn.Module):  # export-friendly C2f: swapped in for a trained C2f module, reusing its main_conv/mid_channels/blocks/final_conv attributes 7 | 8 | def __init__(self, *args, **kwargs): 9 | super().__init__() 10 | 11 | def forward(self, x: Tensor) -> Tensor: 12 | x_main = self.main_conv(x) 13 | x_main = [x_main, x_main[:, self.mid_channels:, ...]] 14 | x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) 15 | x_main.pop(1)  # drop the duplicated split before concatenation 16 | return self.final_conv(torch.cat(x_main, 1)) 17 | -------------------------------------------------------------------------------- /deploy/easydeploy/bbox_code/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .bbox_coder import (rtmdet_bbox_decoder, yolov5_bbox_decoder, 3 | yolox_bbox_decoder) 4 | 5 | __all__ = ['yolov5_bbox_decoder', 'rtmdet_bbox_decoder', 'yolox_bbox_decoder'] 6 | -------------------------------------------------------------------------------- /deploy/easydeploy/bbox_code/bbox_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Optional 3 | 4 | import torch 5 | from torch import Tensor 6 | 7 | 8 | def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, 9 | stride: Tensor) -> Tensor:  # decodes to (cx, cy, w, h) 10 | bbox_preds = bbox_preds.sigmoid() 11 | 12 | x_center = (priors[..., 0] + priors[..., 2]) * 0.5 13 | y_center = (priors[..., 1] + priors[..., 3]) * 0.5 14 | w = priors[..., 2] - priors[..., 0] 15 | h = priors[..., 3] - priors[..., 1] 16 | 17 | x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center 18 | y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center 19 | w_pred = (bbox_preds[..., 2] * 2)**2 * w 20 | h_pred = (bbox_preds[..., 3] * 2)**2 * h 21 | 22 | decoded_bboxes = torch.stack( 23 | [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) 24 | 25 | return decoded_bboxes 26 | 27 | 28 | def rtmdet_bbox_decoder(priors: Tensor, bbox_preds: Tensor, 29 | stride: Optional[Tensor]) -> Tensor:  # decodes to (x1, y1, x2, y2) 30 | stride = stride[None, :, None] 31 | bbox_preds *= stride 32 | tl_x = (priors[..., 0] - bbox_preds[..., 0]) 33 | tl_y = (priors[..., 1] - bbox_preds[..., 1]) 34 | br_x = (priors[..., 0] + bbox_preds[..., 2]) 35 | br_y = (priors[..., 1] + bbox_preds[..., 3]) 36 | decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) 37 | return decoded_bboxes 38 | 39 | 40 | def yolox_bbox_decoder(priors: Tensor, bbox_preds: Tensor, 41 | stride: Optional[Tensor]) -> Tensor:  # decodes to (cx, cy, w, h) 42 | stride = stride[None, :, None] 43 | xys = (bbox_preds[..., :2] * stride) + priors 44 | whs = bbox_preds[..., 2:].exp() * stride 45 | decoded_bboxes = torch.cat([xys, whs], -1) 46 | return decoded_bboxes 47 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | 3 | set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 72 75 86) 4 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 5 | 6 | project(nvdsparsebbox_mmyolo LANGUAGES CXX) 7 | 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O3 -g -Wall -Werror -shared -fPIC") 9 | set(CMAKE_CXX_STANDARD 14) 10 | set(CMAKE_BUILD_TYPE Release) 11 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 12 | 13 | # CUDA 14 | find_package(CUDA REQUIRED) 15 | 16 | # TensorRT 17 | set(TensorRT_INCLUDE_DIRS "/usr/include/x86_64-linux-gnu" CACHE STRING "TensorRT headers path") 18 | set(TensorRT_LIBRARIES "/usr/lib/x86_64-linux-gnu" CACHE STRING "TensorRT libs path") 19 | 20 | # DeepStream 21 | set(DEEPSTREAM "/opt/nvidia/deepstream/deepstream" CACHE STRING "DeepStream root path") 22 | set(DS_LIBRARIES ${DEEPSTREAM}/lib) 23 | set(DS_INCLUDE_DIRS ${DEEPSTREAM}/sources/includes) 24 | 25 | include_directories( 26 | ${CUDA_INCLUDE_DIRS} 27 | ${TensorRT_INCLUDE_DIRS} 28 | ${DS_INCLUDE_DIRS}) 29 | 30 | add_library( 31 | ${PROJECT_NAME} 32 | SHARED 33 | custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp) 34 | 35 | target_link_libraries(${PROJECT_NAME} PRIVATE nvinfer nvinfer_plugin) 36 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/README.md: -------------------------------------------------------------------------------- 1 | # Inference MMYOLO Models with DeepStream 2 | 3 | This project demonstrates how to run inference on MMYOLO models with customized parsers in [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk). 4 | 5 | ## Pre-requisites 6 | 7 | ### 1.
Install Nvidia Driver and CUDA 8 | 9 | First, please follow the official documents and instructions to install the dedicated NVIDIA graphics driver and a CUDA version matched to your GPU and target NVIDIA AIoT devices. 10 | 11 | ### 2. Install DeepStream SDK 12 | 13 | Second, please follow the official instructions to download and install the DeepStream SDK. The current stable version of DeepStream is v6.2. 14 | 15 | ### 3. Generate TensorRT Engine 16 | 17 | As DeepStream builds on top of several NVIDIA libraries, you need to first convert your trained MMYOLO models to TensorRT engine files. We strongly recommend trying the supported TensorRT deployment solution in [EasyDeploy](../../easydeploy/). 18 | 19 | ## Build and Run 20 | 21 | Please make sure that your converted TensorRT engine is already located in the `deepstream` folder as the config shows. Create your own model config files and change the `config-file` parameter in [deepstream_app_config.txt](deepstream_app_config.txt) to the model you want to run with. 22 | 23 | ```bash 24 | mkdir build && cd build 25 | cmake .. 26 | make -j$(nproc) && make install 27 | ``` 28 | 29 | Then you can run inference with this command. 30 | 31 | ```bash 32 | deepstream-app -c deepstream_app_config.txt 33 | ``` 34 | 35 | ## Code Structure 36 | 37 | ```bash 38 | ├── deepstream 39 | │ ├── configs # config files for MMYOLO models 40 | │ │ └── config_infer_rtmdet.txt 41 | │ ├── custom_mmyolo_bbox_parser # customized parser for MMYOLO models to DeepStream formats 42 | │ │ └── nvdsparsebbox_mmyolo.cpp 43 | │ ├── CMakeLists.txt 44 | │ ├── coco_labels.txt # labels for coco detection 45 | │ ├── deepstream_app_config.txt # DeepStream reference app configs for MMYOLO models 46 | │ ├── README_zh-CN.md 47 | │ └── README.md 48 | ``` 49 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/README_zh-CN.md: -------------------------------------------------------------------------------- 1 | # Inference MMYOLO Models with DeepStream SDK 2 | 3 | This project demonstrates how to use the [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk) together with a customized parser to run inference on MMYOLO models. 4 | 5 | ## Pre-requisites 6 | 7 | ### 1. Install Nvidia Driver and CUDA 8 | 9 | First, please install the graphics driver and CUDA according to your current GPU and the target device. 10 | 11 | ### 2. Install DeepStream SDK 12 | 13 | The stable version of the DeepStream SDK is currently v6.2, which is the officially recommended version. 14 | 15 | ### 3. Convert MMYOLO Models to TensorRT Engines 16 | 17 | We recommend using the TensorRT solution in EasyDeploy to convert and deploy the target model; see [this document](../../easydeploy/docs/model_convert.md) for details. 18 | 19 | ## Build and Run 20 | 21 | This project currently uses MMYOLO's rtmdet model. To use other models, please adapt the config files in this directory. Then place the converted TensorRT engine in the current directory and run the following commands: 22 | 23 | ```bash 24 | mkdir build && cd build 25 | cmake ..
26 | make -j$(nproc) && make install 27 | ``` 28 | 29 | After building, you can run inference with the following command: 30 | 31 | ```bash 32 | deepstream-app -c deepstream_app_config.txt 33 | ``` 34 | 35 | ## Code Structure 36 | 37 | ```bash 38 | ├── deepstream 39 | │ ├── configs # DeepStream configs for MMYOLO models 40 | │ │ └── config_infer_rtmdet.txt 41 | │ ├── custom_mmyolo_bbox_parser # parser adapted to DeepStream formats 42 | │ │ └── nvdsparsebbox_mmyolo.cpp 43 | │ ├── CMakeLists.txt 44 | │ ├── coco_labels.txt # coco labels 45 | │ ├── deepstream_app_config.txt # DeepStream app config 46 | │ ├── README_zh-CN.md 47 | │ └── README.md 48 | ``` 49 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/coco_labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/configs/config_infer_rtmdet.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.01735207357279195 4 | offsets=57.375;57.12;58.395 5 | model-color-format=1 6 | model-engine-file=../end2end.engine 7 | labelfile-path=../coco_labels.txt 8 | batch-size=1 9 | network-mode=0 10 | num-detected-classes=80 11 | interval=0 12 | gie-unique-id=1 13 | process-mode=1 14 | network-type=0 15 | cluster-mode=2 16 | maintain-aspect-ratio=1 17 | parse-bbox-func-name=NvDsInferParseCustomMMYOLO 18 | custom-lib-path=../build/libnvdsparsebbox_mmyolo.so 19 | 20 | [class-attrs-all] 21 | pre-cluster-threshold=0.45 22 | topk=100 23 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/configs/config_infer_yolov5.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | model-engine-file=../end2end.engine 6 | labelfile-path=../coco_labels.txt 7 | batch-size=1 8 | network-mode=0 9 | num-detected-classes=80 10 | interval=0 11 | gie-unique-id=1 12 | process-mode=1 13 | network-type=0 14 | cluster-mode=2 15 | maintain-aspect-ratio=1 16 | parse-bbox-func-name=NvDsInferParseCustomMMYOLO 17 | custom-lib-path=../build/libnvdsparsebbox_mmyolo.so 18 | 19 | [class-attrs-all] 20 | pre-cluster-threshold=0.45 21 | topk=100 22 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/configs/config_infer_yolov8.txt:
-------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | model-engine-file=../end2end.engine 6 | labelfile-path=../coco_labels.txt 7 | batch-size=1 8 | network-mode=0 9 | num-detected-classes=80 10 | interval=0 11 | gie-unique-id=1 12 | process-mode=1 13 | network-type=0 14 | cluster-mode=2 15 | maintain-aspect-ratio=1 16 | parse-bbox-func-name=NvDsInferParseCustomMMYOLO 17 | custom-lib-path=../build/libnvdsparsebbox_mmyolo.so 18 | 19 | [class-attrs-all] 20 | pre-cluster-threshold=0.45 21 | topk=100 22 | -------------------------------------------------------------------------------- /deploy/easydeploy/deepstream/deepstream_app_config.txt: -------------------------------------------------------------------------------- 1 | [application] 2 | enable-perf-measurement=1 3 | perf-measurement-interval-sec=5 4 | 5 | [tiled-display] 6 | enable=1 7 | rows=1 8 | columns=1 9 | width=1280 10 | height=720 11 | gpu-id=0 12 | nvbuf-memory-type=0 13 | 14 | [source0] 15 | enable=1 16 | type=3 17 | uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4 18 | num-sources=1 19 | gpu-id=0 20 | cudadec-memtype=0 21 | 22 | [sink0] 23 | enable=1 24 | type=2 25 | sync=0 26 | gpu-id=0 27 | nvbuf-memory-type=0 28 | 29 | [osd] 30 | enable=1 31 | gpu-id=0 32 | border-width=5 33 | text-size=15 34 | text-color=1;1;1;1; 35 | text-bg-color=0.3;0.3;0.3;1 36 | font=Serif 37 | show-clock=0 38 | clock-x-offset=800 39 | clock-y-offset=820 40 | clock-text-size=12 41 | clock-color=1;0;0;0 42 | nvbuf-memory-type=0 43 | 44 | [streammux] 45 | gpu-id=0 46 | live-source=0 47 | batch-size=1 48 | batched-push-timeout=40000 49 | width=1920 50 | height=1080 51 | enable-padding=0 52 | nvbuf-memory-type=0 53 | 54 | [primary-gie] 55 | enable=1 56 | gpu-id=0 57 | gie-unique-id=1 58 | nvbuf-memory-type=0 59 | config-file=configs/config_infer_rtmdet.txt 60 | 61 | [tests] 62 | file-loop=0 63 | -------------------------------------------------------------------------------- /deploy/easydeploy/examples/cv2_nms.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union 2 | 3 | import cv2 4 | from numpy import ndarray 5 | 6 | MAJOR, MINOR = map(int, cv2.__version__.split('.')[:2]) 7 | assert MAJOR == 4 8 | 9 | 10 | def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]], 11 | scores: Union[List[float], Tuple[float]], 12 | labels: Union[List[int], Tuple[int]], 13 | conf_thres: float = 0.25, 14 | iou_thres: float = 0.65) -> Tuple[List, List, List]: 15 | if MINOR >= 7: 16 | indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres, 17 | iou_thres) 18 | elif MINOR == 6: 19 | indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres) 20 | else: 21 | indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, 22 | iou_thres).flatten() 23 | 24 | nmsd_boxes = [] 25 | nmsd_scores = [] 26 | nmsd_labels = [] 27 | for idx in indices: 28 | box = boxes[idx] 29 | # x0y0wh -> x0y0x1y1 30 | box[2:] = box[:2] + box[2:] 31 | score = scores[idx] 32 | label = labels[idx] 33 | nmsd_boxes.append(box) 34 | nmsd_scores.append(score) 35 | nmsd_labels.append(label) 36 | return nmsd_boxes, nmsd_scores, nmsd_labels 37 | -------------------------------------------------------------------------------- /deploy/easydeploy/examples/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing 
import List, Tuple, Union 2 | 3 | import cv2 4 | import numpy as np 5 | from config import ModelType 6 | from numpy import ndarray 7 | 8 | 9 | class Preprocess: 10 | 11 | def __init__(self, model_type: ModelType): 12 | if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7, 13 | ModelType.YOLOV8): 14 | mean = np.array([0, 0, 0], dtype=np.float32) 15 | std = np.array([255, 255, 255], dtype=np.float32) 16 | is_rgb = True 17 | elif model_type == ModelType.YOLOX: 18 | mean = np.array([0, 0, 0], dtype=np.float32) 19 | std = np.array([1, 1, 1], dtype=np.float32) 20 | is_rgb = False 21 | elif model_type == ModelType.PPYOLOE: 22 | mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) 23 | std = np.array([58.395, 57.12, 57.375], dtype=np.float32) 24 | is_rgb = True 25 | 26 | elif model_type == ModelType.PPYOLOEP: 27 | mean = np.array([0, 0, 0], dtype=np.float32) 28 | std = np.array([255, 255, 255], dtype=np.float32) 29 | is_rgb = True 30 | elif model_type == ModelType.RTMDET: 31 | mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)  # ImageNet mean/std in BGR order 32 | std = np.array([57.375, 57.12, 58.395], dtype=np.float32) 33 | is_rgb = False 34 | else: 35 | raise NotImplementedError 36 | 37 | self.mean = mean.reshape((3, 1, 1)) 38 | self.std = std.reshape((3, 1, 1)) 39 | self.is_rgb = is_rgb 40 | 41 | def __call__(self, 42 | image: ndarray, 43 | new_size: Union[List[int], Tuple[int]] = (640, 640), 44 | **kwargs) -> Tuple[ndarray, Tuple[float, float]]: 45 | # new_size: (height, width) 46 | height, width = image.shape[:2] 47 | ratio_h, ratio_w = new_size[0] / height, new_size[1] / width 48 | image = cv2.resize( 49 | image, (0, 0), 50 | fx=ratio_w, 51 | fy=ratio_h, 52 | interpolation=cv2.INTER_LINEAR) 53 | image = np.ascontiguousarray(image.transpose(2, 0, 1))  # HWC -> CHW 54 | image = image.astype(np.float32) 55 | image -= self.mean 56 | image /= self.std 57 | return image[np.newaxis], (ratio_w, ratio_h) 58 | -------------------------------------------------------------------------------- /deploy/easydeploy/examples/requirements.txt: -------------------------------------------------------------------------------- 1 | onnxruntime 2 | opencv-python==4.7.0.72 3 | -------------------------------------------------------------------------------- /deploy/easydeploy/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .backend import MMYOLOBackend 3 | from .backendwrapper import ORTWrapper, TRTWrapper 4 | from .model import DeployModel 5 | 6 | __all__ = ['DeployModel', 'TRTWrapper', 'ORTWrapper', 'MMYOLOBackend'] 7 | -------------------------------------------------------------------------------- /deploy/easydeploy/model/backend.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | class MMYOLOBackend(Enum): 8 | AX620A = 'ax620a' 9 | COREML = 'coreml' 10 | HORIZONX3 = 'horizonx3' 11 | NCNN = 'ncnn' 12 | ONNXRUNTIME = 'onnxruntime' 13 | OPENVINO = 'openvino' 14 | PPLNN = 'pplnn' 15 | RKNN = 'rknn' 16 | TENSORRT8 = 'tensorrt8' 17 | TENSORRT7 = 'tensorrt7' 18 | TORCHSCRIPT = 'torchscript' 19 | TVM = 'tvm' 20 | 21 | 22 | def HSigmoid__forward(self, x: torch.Tensor) -> torch.Tensor: 23 | return F.hardsigmoid(x, inplace=True)  # export-friendly replacement forward for HSigmoid 24 | -------------------------------------------------------------------------------- /deploy/easydeploy/nms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .ort_nms import onnx_nms 3 | from .trt_nms import batched_nms, efficient_nms 4 | 5 | __all__ = ['efficient_nms', 'batched_nms', 'onnx_nms'] 6 | -------------------------------------------------------------------------------- /deploy/easydeploy/onnx_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-Robotics-AI-Lab/DOSOD/2b133c3a9f7a9b685635090d219efa84a6951e75/deploy/easydeploy/onnx_demo.py -------------------------------------------------------------------------------- /docs/deploy.md: -------------------------------------------------------------------------------- 1 | ## Deploy YOLO-World 2 | 3 | - [x] ONNX export 4 | - [x] ONNX demo 5 | - [ ] TensorRT 6 | - [ ] TFLite 7 | 8 | We provide several ways to deploy YOLO-World with ONNX or TensorRT. 9 | 10 | ### Preliminaries 11 | 12 | ```bash 13 | pip install supervision onnx onnxruntime onnxsim 14 | ``` 15 | 16 | ### Export ONNX on Gradio Demo 17 | 18 | Start `demo.py`; you can modify the texts in the demo and export the ONNX model. 19 | 20 | ```bash 21 | python demo.py path/to/config path/to/weights 22 | ``` 23 | 24 | ### Export YOLO-World to ONNX models 25 | 26 | You can also use [`export_onnx.py`](../deploy/export_onnx.py) to obtain the ONNX model. You can specify `--custom-text` with your own `Text JSON` for your custom prompts. The format of `Text JSON` can be found in [`docs/data`](../docs/data.md); a minimal example is sketched below.
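A minimal sketch of generating such a `Text JSON` (the categories here are only placeholders; the nested-list format mirrors the `texts` variable in `demo/simple_demo.py`, where each category is a list of noun phrases):

```python
import json

# Each entry is one category; each category is a list of noun phrases.
custom_texts = [["person"], ["bus"], ["red traffic light"]]
with open("custom_texts.json", "w") as f:
    json.dump(custom_texts, f)
```

Pass the resulting file to `--custom-text` in the commands below.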
27 | 28 | ```bash 29 | PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 30 | ``` 31 | 32 | If you don't want to include `NMS` or "post-processing" in the ONNX model, you can add `--without-nms`: 33 | ```bash 34 | PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-nms 35 | ``` 36 | 37 | If you want to quantize the YOLO-World ONNX model, you should remove the `NMS` and `bbox_decoder` by adding `--without-bbox-decoder`: 38 | 39 | ```bash 40 | PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-bbox-decoder 41 | ``` 42 | 43 | **Running ONNX demo** 44 | 45 | ```bash 46 | python deploy/onnx_demo.py path/to/model.onnx path/to/images path/to/texts 47 | ``` 48 | 49 | 50 | ### Export YOLO-World to TensorRT models 51 | 52 | Coming soon. 53 | 54 | ### FAQ 55 | 56 | **Q1**. `RuntimeError: Exporting the operator einsum to ONNX opset version 11 is not supported. Support for this operator was added in version 12, try exporting with this version.` 57 | 58 | **A:** This error arises because YOLO-World adopts `einsum` for matrix multiplication, which is not supported by `opset 11`. You can raise `--opset` from `11` to `12` if your device supports it, or replace the `einsum` with normal `permute/reshape/multiplication` operations by setting `use_einsum=False` in the `MaxSigmoidCSPLayerWithTwoConv` and `YOLOWorldHeadModule`. You can refer to the [sample config](../configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) without einsum. 59 | 60 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | ## Frequently Asked Questions (FAQ) 2 | 3 | 4 | 1. `Incorrect path_or_model_id` 5 | ```bash 6 | OSError: class `YOLOWorldDetector` in yolo_world/models/detectors/yolo_world.py: class `MultiModalYOLOBackbone` in yolo_world/models/backbones/mm_backbone.py: class `HuggingCLIPLanguageBackbone` in yolo_world/models/backbones/mm_backbone.py: Incorrect path_or_model_id: '../pretrained_models/clip-vit-base-patch32-projection'. Please provide either the path to a local folder or the repo_id of a model on the Hub. 7 | ``` 8 | 9 | **Solution:** the config points to a CLIP text encoder that cannot be found. Download the CLIP model (e.g., `openai/clip-vit-base-patch32`) from the HuggingFace Hub in advance and set the text backbone's `model_name` to the local folder, or set it directly to the Hub repo id. -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | ## Installation Guide 2 | 3 | YOLO-World is built on `pytorch=1.11.0` and `mmcv=2.0.0`. 4 | 5 | We provide the `requirements` files in [./requirements](./../requirements/): 6 | 7 | * `basic_requirements`: training, finetuning, evaluation. 8 | * `demo_requirements`: running YOLO-World [demos](./../demo/). 9 | * `onnx_requirements`: converting YOLO-World to ONNX or TFLite models (TFLite is coming soon). 10 | 11 | #### Install `MMCV` 12 | 13 | YOLO-World adopts `mmcv>=2.0.0`. There are several ways to install `mmcv`: 14 | 15 | **1. using `openmim`**: 16 | 17 | see more in [official guide](https://github.com/open-mmlab/mmcv/tree/master?tab=readme-ov-file#install-mmcv-full). 18 | 19 | ```bash 20 | pip install openmim 21 | mim install mmcv==2.0.0 22 | ``` 23 | 24 | **2.
using `pip`**: 25 | 26 | go to [install-with-pip](https://mmcv.readthedocs.io/en/latest/get_started/installation.html#install-with-pip) to select the pip index. 27 | 28 | ```bash 29 | # cuda=11.3, torch=1.11 30 | pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html 31 | # cuda=11.7, torch=1.13 32 | pip install mmcv==2.2.0 -f https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html 33 | # cuda=12.1, torch=2.1 34 | pip install mmcv==2.1.0 -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html 35 | ``` 36 | 37 | **3. using `whl`** 38 | 39 | go to [index packages](https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html) to find a suitable version and download. 40 | 41 | ```bash 42 | pip install mmcv-2.0.1-cp38-cp38-manylinux1_x86_64.whl 43 | ``` -------------------------------------------------------------------------------- /docs/updates.md: -------------------------------------------------------------------------------- 1 | ## Update Notes 2 | 3 | We provide the details for important updates of YOLO-World in this note. 4 | 5 | ### Model Architecture 6 | 7 | **[2024-2-29]:** YOLO-World-v2: 8 | 9 | 1. We remove the `I-PoolingAttention`: though it improves the performance for zero-shot LVIS evaluation, it affects the inference speeds after exporting YOLO-World to ONNX or TensorRT. Considering the trade-off, we remove the `I-PoolingAttention` in the newest version. 10 | 2. We replace the `L2-Norm` in the contrastive head with the `BatchNorm`. The `L2-Norm` contains complex operations, such as `reduce`, which is time-consuming for deployment. However, the `BatchNorm` can be fused into the convolution, which is much more efficient and also improves the zero-shot performance. 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools","wheel","torch"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "yolo_world" 7 | version = "0.1.0" 8 | description = "YOLO-World: Real-time Open Vocabulary Object Detection" 9 | readme = "README.md" 10 | keywords = ["object detection"] 11 | authors = [ 12 | { name = "Tencent AILab", email = "ronnysong@tencent.com" }, 13 | ] 14 | license = {text = "Apache License 2.0"} 15 | 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "License :: OSI Approved :: Apache Software License", 19 | "Operating System :: OS Independent", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 27 | ] 28 | requires-python = ">= 3.7" 29 | 30 | dependencies = [ 31 | "wheel", 32 | "torch>=1.11.0", 33 | "torchvision>=0.16.2", 34 | "transformers", 35 | "tokenizers", 36 | "numpy", 37 | "opencv-python", 38 | "supervision==0.19.0", 39 | "openmim", 40 | "mmcv-lite>=2.0.0rc4", 41 | "mmdet==3.0.0", 42 | "mmengine>=0.7.1", 43 | "openmim", 44 | "mmcv", 45 | 46 | ] 47 | 48 | [tool.setuptools] 49 | package-dir = {"yolo_world" = "yolo_world"} 50 | include-package-data = false 51 | license-files = ["LICENSE"] 52 | zip-safe = true 53 | 54 | [tool.setuptools.packages.find] 55 | include = ["yolo_world*"] 56 | exclude = 
["docs*", "tests*","third_party*","assets*"] -------------------------------------------------------------------------------- /requirements/basic_requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.9.0.80 2 | opencv-python-headless==4.2.0.34 3 | mmcv==2.0.0 4 | mmdet==3.0.0 5 | mmengine==0.10.3 6 | mmyolo==0.6.0 7 | timm==0.6.13 8 | transformers==4.36.2 9 | albumentations -------------------------------------------------------------------------------- /requirements/demo_requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==4.16.0 2 | supervision -------------------------------------------------------------------------------- /requirements/onnx_requirements.txt: -------------------------------------------------------------------------------- 1 | supervision 2 | onnx 3 | onnxruntime 4 | onnxsim -------------------------------------------------------------------------------- /third_party/mmyolo/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmyolo' 2 | 3 | default_hooks = dict( 4 | timer=dict(type='IterTimerHook'), 5 | logger=dict(type='LoggerHook', interval=50), 6 | param_scheduler=dict(type='ParamSchedulerHook'), 7 | checkpoint=dict(type='CheckpointHook', interval=1), 8 | sampler_seed=dict(type='DistSamplerSeedHook'), 9 | visualization=dict(type='mmdet.DetVisualizationHook')) 10 | 11 | env_cfg = dict( 12 | cudnn_benchmark=False, 13 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 14 | dist_cfg=dict(backend='nccl'), 15 | ) 16 | 17 | vis_backends = [dict(type='LocalVisBackend')] 18 | visualizer = dict( 19 | type='mmdet.DetLocalVisualizer', 20 | vis_backends=vis_backends, 21 | name='visualizer') 22 | log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) 23 | 24 | log_level = 'INFO' 25 | load_from = None 26 | resume = False 27 | 28 | # Example to use different file client 29 | # Method 1: simply set the data root and let the file I/O module 30 | # automatically infer from prefix (not support LMDB and Memcache yet) 31 | 32 | # data_root = 's3://openmmlab/datasets/detection/coco/' 33 | 34 | # Method 2: Use `backend_args`, `file_client_args` in versions 35 | # before MMDet 3.0.0rc6 36 | # backend_args = dict( 37 | # backend='petrel', 38 | # path_mapping=dict({ 39 | # './data/': 's3://openmmlab/datasets/detection/', 40 | # 'data/': 's3://openmmlab/datasets/detection/' 41 | # })) 42 | 43 | backend_args = None 44 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/_base_/det_p5_tta.py: -------------------------------------------------------------------------------- 1 | # TODO: Need to solve the problem of multiple backend_args parameters 2 | # _backend_args = dict( 3 | # backend='petrel', 4 | # path_mapping=dict({ 5 | # './data/': 's3://openmmlab/datasets/detection/', 6 | # 'data/': 's3://openmmlab/datasets/detection/' 7 | # })) 8 | 9 | _backend_args = None 10 | 11 | tta_model = dict( 12 | type='mmdet.DetTTAModel', 13 | tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=300)) 14 | 15 | img_scales = [(640, 640), (320, 320), (960, 960)] 16 | 17 | # LoadImageFromFile 18 | # / | \ 19 | # (RatioResize,LetterResize) (RatioResize,LetterResize) (RatioResize,LetterResize) # noqa 20 | # / \ / \ / \ 21 | # RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip # noqa 22 | # | | | | | | 
23 | # LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn 24 | # | | | | | | 25 | # PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa 26 | 27 | _multiscale_resize_transforms = [ 28 | dict( 29 | type='Compose', 30 | transforms=[ 31 | dict(type='YOLOv5KeepRatioResize', scale=s), 32 | dict( 33 | type='LetterResize', 34 | scale=s, 35 | allow_scale_up=False, 36 | pad_val=dict(img=114)) 37 | ]) for s in img_scales 38 | ] 39 | 40 | tta_pipeline = [ 41 | dict(type='LoadImageFromFile', backend_args=_backend_args), 42 | dict( 43 | type='TestTimeAug', 44 | transforms=[ 45 | _multiscale_resize_transforms, 46 | [ 47 | dict(type='mmdet.RandomFlip', prob=1.), 48 | dict(type='mmdet.RandomFlip', prob=0.) 49 | ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], 50 | [ 51 | dict( 52 | type='mmdet.PackDetInputs', 53 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 54 | 'scale_factor', 'pad_param', 'flip', 55 | 'flip_direction')) 56 | ] 57 | ]) 58 | ] 59 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/base_dynamic.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict( 3 | dynamic_axes={ 4 | 'input': { 5 | 0: 'batch', 6 | 2: 'height', 7 | 3: 'width' 8 | }, 9 | 'dets': { 10 | 0: 'batch', 11 | 1: 'num_dets' 12 | }, 13 | 'labels': { 14 | 0: 'batch', 15 | 1: 'num_dets' 16 | } 17 | }) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/base_static.py: -------------------------------------------------------------------------------- 1 | onnx_config = dict( 2 | type='onnx', 3 | export_params=True, 4 | keep_initializers_as_inputs=False, 5 | opset_version=11, 6 | save_file='end2end.onnx', 7 | input_names=['input'], 8 | output_names=['dets', 'labels'], 9 | input_shape=None, 10 | optimize=True) 11 | codebase_config = dict( 12 | type='mmyolo', 13 | task='ObjectDetection', 14 | model_type='end2end', 15 | post_processing=dict( 16 | score_threshold=0.05, 17 | confidence_threshold=0.005, 18 | iou_threshold=0.5, 19 | max_output_boxes_per_class=200, 20 | pre_top_k=5000, 21 | keep_top_k=100, 22 | background_label_id=-1), 23 | module=['mmyolo.deploy']) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_onnxruntime_dynamic.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | codebase_config = dict( 3 | type='mmyolo', 4 | task='ObjectDetection', 5 | model_type='end2end', 6 | post_processing=dict( 7 | score_threshold=0.05, 8 | confidence_threshold=0.005, 9 | iou_threshold=0.5, 10 | max_output_boxes_per_class=200, 11 | pre_top_k=5000, 12 | keep_top_k=100, 13 | background_label_id=-1), 14 | module=['mmyolo.deploy']) 15 | backend_config = dict(type='onnxruntime') 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_onnxruntime_static.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | codebase_config = dict( 3 | type='mmyolo', 4 | task='ObjectDetection', 5 | model_type='end2end', 6 | post_processing=dict( 7 | score_threshold=0.05, 8 | confidence_threshold=0.005, 9 | iou_threshold=0.5, 10 | max_output_boxes_per_class=200, 11 | pre_top_k=5000, 12 | keep_top_k=100, 13 | 
background_label_id=-1), 14 | module=['mmyolo.deploy']) 15 | backend_config = dict(type='onnxruntime') 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_rknn-fp16_static-320x320.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict( 3 | input_shape=[320, 320], output_names=['feat0', 'feat1', 'feat2']) 4 | codebase_config = dict(model_type='rknn') 5 | backend_config = dict( 6 | type='rknn', 7 | common_config=dict(target_platform='rv1126', optimization_level=1), 8 | quantization_config=dict(do_quantization=False, dataset=None), 9 | input_size_list=[[3, 320, 320]]) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_rknn-int8_static-320x320.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict( 3 | input_shape=[320, 320], output_names=['feat0', 'feat1', 'feat2']) 4 | codebase_config = dict(model_type='rknn') 5 | backend_config = dict( 6 | type='rknn', 7 | common_config=dict(target_platform='rv1126', optimization_level=1), 8 | quantization_config=dict(do_quantization=True, dataset=None), 9 | input_size_list=[[3, 320, 320]]) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | backend_config = dict( 3 | type='tensorrt', 4 | common_config=dict(fp16_mode=True, max_workspace_size=1 << 30), 5 | model_inputs=[ 6 | dict( 7 | input_shapes=dict( 8 | input=dict( 9 | min_shape=[1, 3, 192, 192], 10 | opt_shape=[1, 3, 640, 640], 11 | max_shape=[1, 3, 960, 960]))) 12 | ]) 13 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | backend_config = dict( 3 | type='tensorrt', 4 | common_config=dict(fp16_mode=True, max_workspace_size=1 << 32), 5 | model_inputs=[ 6 | dict( 7 | input_shapes=dict( 8 | input=dict( 9 | min_shape=[1, 3, 64, 64], 10 | opt_shape=[1, 3, 640, 640], 11 | max_shape=[1, 3, 1344, 1344]))) 12 | ]) 13 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_static-640x640.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict(input_shape=(640, 640)) 3 | backend_config = dict( 4 | type='tensorrt', 5 | common_config=dict(fp16_mode=True, max_workspace_size=1 << 30), 6 | model_inputs=[ 7 | dict( 8 | input_shapes=dict( 9 | input=dict( 10 | min_shape=[1, 3, 640, 640], 11 | opt_shape=[1, 3, 640, 640], 12 | max_shape=[1, 3, 640, 640]))) 13 | ]) 14 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 15 | 
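For readers mapping these config fields onto code: a hedged sketch of how `fp16_mode`, `max_workspace_size`, and the `min/opt/max` input shapes above translate to the TensorRT Python builder API (assumes TensorRT 8.x and an ONNX file whose input is named `input`, as in the deploy configs):

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
with open("end2end.onnx", "rb") as f:
    assert parser.parse(f.read())

config = builder.create_builder_config()
config.max_workspace_size = 1 << 30    # common_config.max_workspace_size
config.set_flag(trt.BuilderFlag.FP16)  # common_config.fp16_mode=True

# Dynamic-shape profile mirroring min/opt/max from the dynamic configs.
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 192, 192), (1, 3, 640, 640), (1, 3, 960, 960))
config.add_optimization_profile(profile)

engine = builder.build_engine(network, config)
with open("end2end.engine", "wb") as f:
    f.write(engine.serialize())
```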
-------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | backend_config = dict( 3 | type='tensorrt', 4 | common_config=dict( 5 | fp16_mode=True, max_workspace_size=1 << 30, int8_mode=True), 6 | model_inputs=[ 7 | dict( 8 | input_shapes=dict( 9 | input=dict( 10 | min_shape=[1, 3, 192, 192], 11 | opt_shape=[1, 3, 640, 640], 12 | max_shape=[1, 3, 960, 960]))) 13 | ]) 14 | calib_config = dict(create_calib=True, calib_file='calib_data.h5') 15 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt-int8_static-640x640.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict(input_shape=(640, 640)) 3 | backend_config = dict( 4 | type='tensorrt', 5 | common_config=dict( 6 | fp16_mode=True, max_workspace_size=1 << 30, int8_mode=True), 7 | model_inputs=[ 8 | dict( 9 | input_shapes=dict( 10 | input=dict( 11 | min_shape=[1, 3, 640, 640], 12 | opt_shape=[1, 3, 640, 640], 13 | max_shape=[1, 3, 640, 640]))) 14 | ]) 15 | calib_config = dict(create_calib=True, calib_file='calib_data.h5') 16 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 17 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_dynamic.py'] 2 | backend_config = dict( 3 | type='tensorrt', 4 | common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), 5 | model_inputs=[ 6 | dict( 7 | input_shapes=dict( 8 | input=dict( 9 | min_shape=[1, 3, 192, 192], 10 | opt_shape=[1, 3, 640, 640], 11 | max_shape=[1, 3, 960, 960]))) 12 | ]) 13 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/detection_tensorrt_static-640x640.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./base_static.py'] 2 | onnx_config = dict(input_shape=(640, 640)) 3 | backend_config = dict( 4 | type='tensorrt', 5 | common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), 6 | model_inputs=[ 7 | dict( 8 | input_shapes=dict( 9 | input=dict( 10 | min_shape=[1, 3, 640, 640], 11 | opt_shape=[1, 3, 640, 640], 12 | max_shape=[1, 3, 640, 640]))) 13 | ]) 14 | use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/model/yolov5_s-static.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' 2 | 3 | test_pipeline = [ 4 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 5 | dict( 6 | type='LetterResize', 7 | scale=_base_.img_scale, 8 | allow_scale_up=False, 9 | use_mini_pad=False, 10 | ), 11 | 
dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), 12 | dict( 13 | type='mmdet.PackDetInputs', 14 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 15 | 'scale_factor', 'pad_param')) 16 | ] 17 | 18 | test_dataloader = dict( 19 | dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) 20 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/deploy/model/yolov6_s-static.py: -------------------------------------------------------------------------------- 1 | _base_ = '../../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | test_pipeline = [ 4 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 5 | dict( 6 | type='LetterResize', 7 | scale=_base_.img_scale, 8 | allow_scale_up=False, 9 | use_mini_pad=False, 10 | ), 11 | dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), 12 | dict( 13 | type='mmdet.PackDetInputs', 14 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 15 | 'scale_factor', 'pad_param')) 16 | ] 17 | 18 | test_dataloader = dict( 19 | dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) 20 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_l_fast_8xb20-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE. 4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_l_imagenet1k_pretrained-c0010e6c.pth' # noqa 6 | 7 | deepen_factor = 1.0 8 | widen_factor = 1.0 9 | 10 | train_batch_size_per_gpu = 20 11 | 12 | model = dict( 13 | backbone=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | init_cfg=dict(checkpoint=checkpoint)), 17 | neck=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | ), 21 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 22 | 23 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_m_fast_8xb28-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE.
4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_m_imagenet1k_pretrained-09f1eba2.pth' # noqa 6 | 7 | deepen_factor = 0.67 8 | widen_factor = 0.75 9 | 10 | train_batch_size_per_gpu = 28 11 | 12 | model = dict( 13 | backbone=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | init_cfg=dict(checkpoint=checkpoint)), 17 | neck=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | ), 21 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 22 | 23 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE. 4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_l_obj365_pretrained-3dd89562.pth' # noqa 6 | 7 | deepen_factor = 1.0 8 | widen_factor = 1.0 9 | 10 | model = dict( 11 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | neck=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 17 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE.
4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_m_ojb365_pretrained-03206892.pth' # noqa 6 | 7 | deepen_factor = 0.67 8 | widen_factor = 0.75 9 | 10 | model = dict( 11 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | neck=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 17 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | # Compared to other same-scale models, this configuration consumes too much 2 | # GPU memory and is not validated for now 3 | _base_ = 'ppyoloe_plus_s_fast_8xb8-80e_coco.py' 4 | 5 | data_root = './data/cat/' 6 | class_name = ('cat', ) 7 | num_classes = len(class_name) 8 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 9 | 10 | num_last_epochs = 5 11 | 12 | max_epochs = 40 13 | train_batch_size_per_gpu = 12 14 | train_num_workers = 2 15 | 16 | load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth' # noqa 17 | 18 | model = dict( 19 | backbone=dict(frozen_stages=4), 20 | bbox_head=dict(head_module=dict(num_classes=num_classes)), 21 | train_cfg=dict( 22 | initial_assigner=dict(num_classes=num_classes), 23 | assigner=dict(num_classes=num_classes))) 24 | 25 | train_dataloader = dict( 26 | batch_size=train_batch_size_per_gpu, 27 | num_workers=train_num_workers, 28 | dataset=dict( 29 | data_root=data_root, 30 | metainfo=metainfo, 31 | ann_file='annotations/trainval.json', 32 | data_prefix=dict(img='images/'))) 33 | 34 | val_dataloader = dict( 35 | dataset=dict( 36 | metainfo=metainfo, 37 | data_root=data_root, 38 | ann_file='annotations/test.json', 39 | data_prefix=dict(img='images/'))) 40 | 41 | test_dataloader = val_dataloader 42 | 43 | default_hooks = dict( 44 | param_scheduler=dict( 45 | warmup_min_iter=10, 46 | warmup_epochs=3, 47 | total_epochs=int(max_epochs * 1.2))) 48 | 49 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 50 | test_evaluator = val_evaluator 51 | 52 | default_hooks = dict( 53 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 54 | logger=dict(type='LoggerHook', interval=5)) 55 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 56 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE.
4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_x_obj365_pretrained-43a8000d.pth' # noqa 6 | 7 | deepen_factor = 1.33 8 | widen_factor = 1.25 9 | 10 | model = dict( 11 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | neck=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 17 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE. 4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_s_imagenet1k_pretrained-2be81763.pth' # noqa 6 | 7 | train_batch_size_per_gpu = 32 8 | max_epochs = 300 9 | 10 | # Base learning rate for optim_wrapper 11 | base_lr = 0.01 12 | 13 | model = dict( 14 | data_preprocessor=dict( 15 | mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], 16 | std=[0.229 * 255., 0.224 * 255., 0.225 * 255.]), 17 | backbone=dict( 18 | block_cfg=dict(use_alpha=False), 19 | init_cfg=dict( 20 | type='Pretrained', 21 | prefix='backbone.', 22 | checkpoint=checkpoint, 23 | map_location='cpu')), 24 | train_cfg=dict(initial_epoch=100)) 25 | 26 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 27 | 28 | optim_wrapper = dict(optimizer=dict(lr=base_lr)) 29 | 30 | default_hooks = dict(param_scheduler=dict(total_epochs=int(max_epochs * 1.2))) 31 | 32 | train_cfg = dict(max_epochs=max_epochs) 33 | 34 | # PPYOLOE plus uses the obj365 pretrained model, but PPYOLOE does not, 35 | # so `load_from` needs to be set to None. 36 | load_from = None 37 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-400e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' 2 | 3 | max_epochs = 400 4 | 5 | model = dict(train_cfg=dict(initial_epoch=133)) 6 | 7 | default_hooks = dict(param_scheduler=dict(total_epochs=int(max_epochs * 1.2))) 8 | 9 | train_cfg = dict(max_epochs=max_epochs) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/ppyoloe/ppyoloe_x_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' 2 | 3 | # The pretrained model is obtained and converted from official PPYOLOE.
4 | # https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md 5 | checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_x_imagenet1k_pretrained-81c33ccb.pth' # noqa 6 | 7 | deepen_factor = 1.33 8 | widen_factor = 1.25 9 | 10 | train_batch_size_per_gpu = 16 11 | 12 | model = dict( 13 | backbone=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | init_cfg=dict(checkpoint=checkpoint)), 17 | neck=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | ), 21 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 22 | 23 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmrazor::_base_/nas_backbones/spos_shufflenet_supernet.py', 3 | '../../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 4 | ] 5 | 6 | checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/spos/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_v3.pth' # noqa 7 | fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/spos/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_subnet_cfg_v3.yaml' # noqa 8 | widen_factor = 1.0 9 | channels = [160, 320, 640] 10 | 11 | _base_.nas_backbone.out_indices = (1, 2, 3) 12 | _base_.nas_backbone.init_cfg = dict( 13 | type='Pretrained', 14 | checkpoint=checkpoint_file, 15 | prefix='architecture.backbone.') 16 | nas_backbone = dict( 17 | type='mmrazor.sub_model', 18 | fix_subnet=fix_subnet, 19 | cfg=_base_.nas_backbone, 20 | extra_prefix='architecture.backbone.') 21 | 22 | _base_.model.backbone = nas_backbone 23 | _base_.model.neck.widen_factor = widen_factor 24 | _base_.model.neck.in_channels = channels 25 | _base_.model.neck.out_channels = channels 26 | _base_.model.bbox_head.head_module.in_channels = channels 27 | _base_.model.bbox_head.head_module.widen_factor = widen_factor 28 | 29 | find_unused_parameters = True 30 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmrazor::_base_/nas_backbones/attentive_mobilenetv3_supernet.py', 3 | '../../yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py' 4 | ] 5 | 6 | checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/bignas/attentive_mobilenet_subnet_8xb256_in1k_flops-0.93G_acc-80.81_20221229_200440-73d92cc6.pth' # noqa 7 | fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/bignas/ATTENTIVE_SUBNET_A6.yaml' # noqa 8 | deepen_factor = 1.2 9 | widen_factor = 1 10 | channels = [40, 128, 224] 11 | mid_channels = [40, 128, 224] 12 | 13 | _base_.train_dataloader.batch_size = 16 14 | _base_.nas_backbone.out_indices = (2, 4, 6) 15 | _base_.nas_backbone.conv_cfg = dict(type='mmrazor.BigNasConv2d') 16 | _base_.nas_backbone.norm_cfg = dict(type='mmrazor.DynamicBatchNorm2d') 17 | _base_.nas_backbone.init_cfg = dict( 18 | type='Pretrained', 19 | checkpoint=checkpoint_file, 20 | prefix='architecture.backbone.') 21 | nas_backbone = dict( 22 | type='mmrazor.sub_model', 23 | fix_subnet=fix_subnet, 24 | cfg=_base_.nas_backbone, 25 | 
extra_prefix='backbone.') 26 | 27 | _base_.model.backbone = nas_backbone 28 | _base_.model.neck.widen_factor = widen_factor 29 | _base_.model.neck.deepen_factor = deepen_factor 30 | _base_.model.neck.in_channels = channels 31 | _base_.model.neck.out_channels = mid_channels 32 | _base_.model.bbox_head.head_module.in_channels = mid_channels 33 | _base_.model.bbox_head.head_module.widen_factor = widen_factor 34 | 35 | find_unused_parameters = True 36 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/README.md: -------------------------------------------------------------------------------- 1 | # CSPNeXt ImageNet Pre-training 2 | 3 | In this folder, we provide the ImageNet pre-training configs of RTMDet's backbone CSPNeXt. 4 | 5 | ## Requirements 6 | 7 | To train with these configs, please install [MMClassification 1.x](https://github.com/open-mmlab/mmclassification/tree/1.x) first. 8 | 9 | Install by MIM: 10 | 11 | ```shell 12 | mim install "mmcls>=1.0.0rc0" 13 | ``` 14 | 15 | or install by pip: 16 | 17 | ```shell 18 | pip install "mmcls>=1.0.0rc0" 19 | ``` 20 | 21 | ## Prepare Dataset 22 | 23 | To pre-train on ImageNet, you need to prepare the dataset first. Please refer to the [guide](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#imagenet). 24 | 25 | ## How to Train 26 | 27 | You can use the classification config in the same way as the detection config. 28 | 29 | For single-GPU training, run: 30 | 31 | ```shell 32 | python tools/train.py \ 33 | ${CONFIG_FILE} \ 34 | [optional arguments] 35 | ``` 36 | 37 | For multi-GPU training, run: 38 | 39 | ```shell 40 | bash ./tools/dist_train.sh \ 41 | ${CONFIG_FILE} \ 42 | ${GPU_NUM} \ 43 | [optional arguments] 44 | ``` 45 | 46 | More details can be found in [user guides](https://mmdetection.readthedocs.io/en/3.x/user_guides/train.html).
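A hedged sketch of how a pre-trained CSPNeXt checkpoint is then consumed by a detection config (the URL is taken from the results table below; the `init_cfg`/`prefix` pattern follows the other configs in this repo, and the snippet itself is hypothetical):

```python
# Hypothetical detection-config snippet: initialize the backbone from the
# ImageNet-pretrained CSPNeXt-s checkpoint listed in the results table.
checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth'  # noqa

model = dict(
    backbone=dict(
        init_cfg=dict(
            type='Pretrained',
            prefix='backbone.',  # keep only the backbone weights from the classifier
            checkpoint=checkpoint,
            map_location='cpu')))
```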
47 | 48 | ## Results and Models 49 | 50 | | Model | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | 51 | | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------------------------------------------------------: | 52 | | CSPNeXt-tiny | 224x224 | 2.73 | 0.339 | 69.44 | 89.45 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth) | 53 | | CSPNeXt-s | 224x224 | 4.89 | 0.664 | 74.41 | 92.23 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth) | 54 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmcls::_base_/datasets/imagenet_bs256_rsb_a12.py', 3 | 'mmcls::_base_/schedules/imagenet_bs2048_rsb.py', 4 | 'mmcls::_base_/default_runtime.py' 5 | ] 6 | 7 | custom_imports = dict( 8 | imports=['mmdet.models', 'mmyolo.models'], allow_failed_imports=False) 9 | 10 | model = dict( 11 | type='ImageClassifier', 12 | backbone=dict( 13 | type='mmyolo.CSPNeXt', 14 | arch='P5', 15 | out_indices=(4, ), 16 | expand_ratio=0.5, 17 | deepen_factor=0.33, 18 | widen_factor=0.5, 19 | channel_attention=True, 20 | norm_cfg=dict(type='BN'), 21 | act_cfg=dict(type='mmyolo.SiLU')), 22 | neck=dict(type='GlobalAveragePooling'), 23 | head=dict( 24 | type='LinearClsHead', 25 | num_classes=1000, 26 | in_channels=512, 27 | loss=dict( 28 | type='LabelSmoothLoss', 29 | label_smooth_val=0.1, 30 | mode='original', 31 | loss_weight=1.0), 32 | topk=(1, 5)), 33 | train_cfg=dict(augments=[ 34 | dict(type='Mixup', alpha=0.2, num_classes=1000), 35 | dict(type='CutMix', alpha=1.0, num_classes=1000) 36 | ])) 37 | 38 | # dataset settings 39 | train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True)) 40 | 41 | # schedule settings 42 | optim_wrapper = dict( 43 | optimizer=dict(weight_decay=0.01), 44 | paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.), 45 | ) 46 | 47 | param_scheduler = [ 48 | # warm up learning rate scheduler 49 | dict( 50 | type='LinearLR', 51 | start_factor=0.0001, 52 | by_epoch=True, 53 | begin=0, 54 | end=5, 55 | # update by iter 56 | convert_to_iter_based=True), 57 | # main learning rate scheduler 58 | dict( 59 | type='CosineAnnealingLR', 60 | T_max=595, 61 | eta_min=1.0e-6, 62 | by_epoch=True, 63 | begin=5, 64 | end=600) 65 | ] 66 | 67 | train_cfg = dict(by_epoch=True, max_epochs=600) 68 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py: -------------------------------------------------------------------------------- 1 | _base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py' 2 | 3 | model = dict( 4 | backbone=dict(deepen_factor=0.167, widen_factor=0.375), 5 | head=dict(in_channels=384)) 6 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' 2 | 3 | # ========================modified parameters====================== 4 | data_root = 'data/split_ms_dota/' 5 | # 
Path of test images folder 6 | test_data_prefix = 'test/images/' 7 | # Submission dir for result submit 8 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 9 | 10 | # =======================Unmodified in most cases================== 11 | train_dataloader = dict(dataset=dict(data_root=data_root)) 12 | 13 | val_dataloader = dict(dataset=dict(data_root=data_root)) 14 | 15 | # Inference on val dataset 16 | test_dataloader = val_dataloader 17 | 18 | # Inference on test dataset and format the output results 19 | # for submission. Note: the test set has no annotation. 20 | # test_dataloader = dict( 21 | # dataset=dict( 22 | # data_root=data_root, 23 | # ann_file='', # test set has no annotation 24 | # data_prefix=dict(img_path=test_data_prefix), 25 | # pipeline=_base_.test_pipeline)) 26 | # test_evaluator = dict( 27 | # type='mmrotate.DOTAMetric', 28 | # format_only=True, 29 | # merge_patches=True, 30 | # outfile_prefix=submission_dir) 31 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' 2 | 3 | load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa 4 | 5 | # Submission dir for result submit 6 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 7 | 8 | # Inference on test dataset and format the output results 9 | # for submission. Note: the test set has no annotation. 10 | # test_dataloader = dict( 11 | # dataset=dict( 12 | # data_root=_base_.data_root, 13 | # ann_file='', # test set has no annotation 14 | # data_prefix=dict(img_path=_base_.test_data_prefix), 15 | # pipeline=_base_.test_pipeline)) 16 | # test_evaluator = dict( 17 | # type='mmrotate.DOTAMetric', 18 | # format_only=True, 19 | # merge_patches=True, 20 | # outfile_prefix=submission_dir) 21 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.67 7 | widen_factor = 0.75 8 | 9 | # Submission dir for result submit 10 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 11 | 12 | # =======================Unmodified in most cases================== 13 | model = dict( 14 | backbone=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | init_cfg=dict(checkpoint=checkpoint)), 18 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 19 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 20 | 21 | # Inference on test dataset and format the output results 22 | # for submission. Note: the test set has no annotation. 
23 | # test_dataloader = dict( 24 | # dataset=dict( 25 | # data_root=_base_.data_root, 26 | # ann_file='', # test set has no annotation 27 | # data_prefix=dict(img_path=_base_.test_data_prefix), 28 | # pipeline=_base_.test_pipeline)) 29 | # test_evaluator = dict( 30 | # type='mmrotate.DOTAMetric', 31 | # format_only=True, 32 | # merge_patches=True, 33 | # outfile_prefix=submission_dir) 34 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.67 7 | widen_factor = 0.75 8 | 9 | # Submission dir for result submit 10 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 11 | 12 | # =======================Unmodified in most cases================== 13 | model = dict( 14 | backbone=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | init_cfg=dict(checkpoint=checkpoint)), 18 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 19 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 20 | 21 | # Inference on test dataset and format the output results 22 | # for submission. Note: the test set has no annotation. 23 | # test_dataloader = dict( 24 | # dataset=dict( 25 | # data_root=_base_.data_root, 26 | # ann_file='', # test set has no annotation 27 | # data_prefix=dict(img_path=_base_.test_data_prefix), 28 | # pipeline=_base_.test_pipeline)) 29 | # test_evaluator = dict( 30 | # type='mmrotate.DOTAMetric', 31 | # format_only=True, 32 | # merge_patches=True, 33 | # outfile_prefix=submission_dir) 34 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.33 7 | widen_factor = 0.5 8 | 9 | # Batch size of a single GPU during training 10 | train_batch_size_per_gpu = 8 11 | 12 | # Submission dir for result submit 13 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 14 | 15 | # =======================Unmodified in most cases================== 16 | model = dict( 17 | backbone=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | init_cfg=dict(checkpoint=checkpoint)), 21 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 22 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 23 | 24 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 25 | 26 | # Inference on test dataset and format the output results 27 | # for submission. Note: the test set has no annotation. 
28 | # test_dataloader = dict( 29 | # dataset=dict( 30 | # data_root=_base_.data_root, 31 | # ann_file='', # test set has no annotation 32 | # data_prefix=dict(img_path=_base_.test_data_prefix), 33 | # pipeline=_base_.test_pipeline)) 34 | # test_evaluator = dict( 35 | # type='mmrotate.DOTAMetric', 36 | # format_only=True, 37 | # merge_patches=True, 38 | # outfile_prefix=submission_dir) 39 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.33 7 | widen_factor = 0.5 8 | 9 | # Batch size of a single GPU during training 10 | train_batch_size_per_gpu = 8 11 | 12 | # Submission dir for result submit 13 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 14 | 15 | # =======================Unmodified in most cases================== 16 | model = dict( 17 | backbone=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | init_cfg=dict(checkpoint=checkpoint)), 21 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 22 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 23 | 24 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 25 | 26 | # Inference on test dataset and format the output results 27 | # for submission. Note: the test set has no annotation. 28 | # test_dataloader = dict( 29 | # dataset=dict( 30 | # data_root=_base_.data_root, 31 | # ann_file='', # test set has no annotation 32 | # data_prefix=dict(img_path=_base_.test_data_prefix), 33 | # pipeline=_base_.test_pipeline)) 34 | # test_evaluator = dict( 35 | # type='mmrotate.DOTAMetric', 36 | # format_only=True, 37 | # merge_patches=True, 38 | # outfile_prefix=submission_dir) 39 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.167 7 | widen_factor = 0.375 8 | 9 | # Batch size of a single GPU during training 10 | train_batch_size_per_gpu = 8 11 | 12 | # Submission dir for result submit 13 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 14 | 15 | # =======================Unmodified in most cases================== 16 | model = dict( 17 | backbone=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | init_cfg=dict(checkpoint=checkpoint)), 21 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 22 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 23 | 24 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 25 | 26 | # Inference on test dataset and format the output results 27 | # for submission. Note: the test set has no annotation. 
28 | # test_dataloader = dict( 29 | # dataset=dict( 30 | # data_root=_base_.data_root, 31 | # ann_file='', # test set has no annotation 32 | # data_prefix=dict(img_path=_base_.test_data_prefix), 33 | # pipeline=_base_.test_pipeline)) 34 | # test_evaluator = dict( 35 | # type='mmrotate.DOTAMetric', 36 | # format_only=True, 37 | # merge_patches=True, 38 | # outfile_prefix=submission_dir) 39 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' 2 | 3 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.167 7 | widen_factor = 0.375 8 | 9 | # Batch size of a single GPU during training 10 | train_batch_size_per_gpu = 8 11 | 12 | # Submission dir for result submit 13 | submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' 14 | 15 | # =======================Unmodified in most cases================== 16 | model = dict( 17 | backbone=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | init_cfg=dict(checkpoint=checkpoint)), 21 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 22 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 23 | 24 | train_dataloader = dict(batch_size=train_batch_size_per_gpu) 25 | 26 | # Inference on test dataset and format the output results 27 | # for submission. Note: the test set has no annotation. 28 | # test_dataloader = dict( 29 | # dataset=dict( 30 | # data_root=_base_.data_root, 31 | # ann_file='', # test set has no annotation 32 | # data_prefix=dict(img_path=_base_.test_data_prefix), 33 | # pipeline=_base_.test_pipeline)) 34 | # test_evaluator = dict( 35 | # type='mmrotate.DOTAMetric', 36 | # format_only=True, 37 | # merge_patches=True, 38 | # outfile_prefix=submission_dir) 39 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | widen_factor = 0.5 4 | 5 | model = dict( 6 | bbox_head=dict( 7 | type='RTMDetInsSepBNHead', 8 | head_module=dict( 9 | type='RTMDetInsSepBNHeadModule', 10 | use_sigmoid_cls=True, 11 | widen_factor=widen_factor), 12 | loss_mask=dict( 13 | type='mmdet.DiceLoss', loss_weight=2.0, eps=5e-6, 14 | reduction='mean')), 15 | test_cfg=dict( 16 | multi_label=True, 17 | nms_pre=1000, 18 | min_bbox_size=0, 19 | score_thr=0.05, 20 | nms=dict(type='nms', iou_threshold=0.6), 21 | max_per_img=100, 22 | mask_thr_binary=0.5)) 23 | 24 | _base_.test_pipeline[-2] = dict( 25 | type='LoadAnnotations', with_bbox=True, with_mask=True, _scope_='mmdet') 26 | 27 | val_dataloader = dict(dataset=dict(pipeline=_base_.test_pipeline)) 28 | test_dataloader = val_dataloader 29 | 30 | val_evaluator = dict(metric=['bbox', 'segm']) 31 | test_evaluator = val_evaluator 32 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = 
'./rtmdet_l_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.67 5 | widen_factor = 0.75 6 | 7 | # =======================Unmodified in most cases================== 8 | model = dict( 9 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 10 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 12 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet_tiny_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | num_epochs_stage2 = 5 9 | 10 | max_epochs = 40 11 | train_batch_size_per_gpu = 12 12 | train_num_workers = 4 13 | val_batch_size_per_gpu = 1 14 | val_num_workers = 2 15 | 16 | load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth' # noqa 17 | 18 | model = dict( 19 | backbone=dict(frozen_stages=4), 20 | bbox_head=dict(head_module=dict(num_classes=num_classes)), 21 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 22 | 23 | train_dataloader = dict( 24 | batch_size=train_batch_size_per_gpu, 25 | num_workers=train_num_workers, 26 | dataset=dict( 27 | data_root=data_root, 28 | metainfo=metainfo, 29 | ann_file='annotations/trainval.json', 30 | data_prefix=dict(img='images/'))) 31 | 32 | val_dataloader = dict( 33 | batch_size=val_batch_size_per_gpu, 34 | num_workers=val_num_workers, 35 | dataset=dict( 36 | metainfo=metainfo, 37 | data_root=data_root, 38 | ann_file='annotations/test.json', 39 | data_prefix=dict(img='images/'))) 40 | 41 | test_dataloader = val_dataloader 42 | 43 | param_scheduler = [ 44 | dict( 45 | type='LinearLR', 46 | start_factor=_base_.lr_start_factor, 47 | by_epoch=False, 48 | begin=0, 49 | end=30), 50 | dict( 51 | # use cosine lr from epoch 20 to 40 (max_epochs // 2 to max_epochs) 52 | type='CosineAnnealingLR', 53 | eta_min=_base_.base_lr * 0.05, 54 | begin=max_epochs // 2, 55 | end=max_epochs, 56 | T_max=max_epochs // 2, 57 | by_epoch=True, 58 | convert_to_iter_based=True), 59 | ] 60 | 61 | _base_.custom_hooks[1].switch_epoch = max_epochs - num_epochs_stage2 62 | 63 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 64 | test_evaluator = val_evaluator 65 | 66 | default_hooks = dict( 67 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 68 | logger=dict(type='LoggerHook', interval=5)) 69 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 70 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 71 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet_s_syncbn_fast_8xb32-300e_coco.py' 2 | checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa 3 | 4 | # ========================modified parameters====================== 5 | deepen_factor = 0.167 6 | widen_factor = 0.375 7 | img_scale = 
_base_.img_scale 8 | 9 | # ratio range for random resize 10 | random_resize_ratio_range = (0.5, 2.0) 11 | # Number of cached images in mosaic 12 | mosaic_max_cached_images = 20 13 | # Number of cached images in mixup 14 | mixup_max_cached_images = 10 15 | 16 | # =======================Unmodified in most cases================== 17 | model = dict( 18 | backbone=dict( 19 | deepen_factor=deepen_factor, 20 | widen_factor=widen_factor, 21 | init_cfg=dict(checkpoint=checkpoint)), 22 | neck=dict( 23 | deepen_factor=deepen_factor, 24 | widen_factor=widen_factor, 25 | ), 26 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 27 | 28 | train_pipeline = [ 29 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 30 | dict(type='LoadAnnotations', with_bbox=True), 31 | dict( 32 | type='Mosaic', 33 | img_scale=img_scale, 34 | use_cached=True, 35 | max_cached_images=mosaic_max_cached_images, # note 36 | random_pop=False, # note 37 | pad_val=114.0), 38 | dict( 39 | type='mmdet.RandomResize', 40 | # img_scale is (width, height) 41 | scale=(img_scale[0] * 2, img_scale[1] * 2), 42 | ratio_range=random_resize_ratio_range, 43 | resize_type='mmdet.Resize', 44 | keep_ratio=True), 45 | dict(type='mmdet.RandomCrop', crop_size=img_scale), 46 | dict(type='mmdet.YOLOXHSVRandomAug'), 47 | dict(type='mmdet.RandomFlip', prob=0.5), 48 | dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), 49 | dict( 50 | type='YOLOv5MixUp', 51 | use_cached=True, 52 | random_pop=False, 53 | max_cached_images=mixup_max_cached_images, 54 | prob=0.5), 55 | dict(type='mmdet.PackDetInputs') 56 | ] 57 | 58 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 59 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './rtmdet_l_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.33 5 | widen_factor = 1.25 6 | 7 | # =======================Unmodified in most cases================== 8 | model = dict( 9 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 10 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 12 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_fast_8xb16-300e_crowdhuman.py' 2 | 3 | model = dict( 4 | data_preprocessor=dict( 5 | _delete_=True, 6 | type='mmdet.DetDataPreprocessor', 7 | mean=[0., 0., 0.], 8 | std=[255., 255., 255.], 9 | bgr_to_rgb=True), 10 | bbox_head=dict(ignore_iof_thr=0.5)) 11 | 12 | img_scale = _base_.img_scale 13 | 14 | albu_train_transforms = [ 15 | dict(type='Blur', p=0.01), 16 | dict(type='MedianBlur', p=0.01), 17 | dict(type='ToGray', p=0.01), 18 | dict(type='CLAHE', p=0.01) 19 | ] 20 | 21 | pre_transform = [ 22 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 23 | # only change this 24 | dict(type='mmdet.LoadAnnotations', with_bbox=True) 25 | ] 26 | 27 | train_pipeline = [ 28 | *pre_transform, 29 | dict( 30 | type='Mosaic', 31 | img_scale=img_scale, 32 | pad_val=114.0, 33 | pre_transform=pre_transform), 34 | dict( 35 | 
type='YOLOv5RandomAffine', 36 | max_rotate_degree=0.0, 37 | max_shear_degree=0.0, 38 | scaling_ratio_range=(0.5, 1.5), 39 | # img_scale is (width, height) 40 | border=(-img_scale[0] // 2, -img_scale[1] // 2), 41 | border_val=(114, 114, 114)), 42 | dict( 43 | type='mmdet.Albu', 44 | transforms=albu_train_transforms, 45 | bbox_params=dict( 46 | type='BboxParams', 47 | format='pascal_voc', 48 | label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), 49 | keymap={ 50 | 'img': 'image', 51 | 'gt_bboxes': 'bboxes' 52 | }), 53 | dict(type='YOLOv5HSVRandomAug'), 54 | dict(type='mmdet.RandomFlip', prob=0.5), 55 | dict( 56 | type='mmdet.PackDetInputs', 57 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 58 | 'flip_direction')) 59 | ] 60 | 61 | train_dataloader = dict( 62 | collate_fn=dict(type='pseudo_collate'), 63 | dataset=dict(pipeline=train_pipeline)) 64 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py: -------------------------------------------------------------------------------- 1 | _base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # Use the model trained on COCO as the pretrained model 4 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa 5 | 6 | # dataset settings 7 | data_root = 'data/CrowdHuman/' 8 | dataset_type = 'YOLOv5CrowdHumanDataset' 9 | 10 | # parameters that often need to be modified 11 | num_classes = 1 12 | 13 | anchors = [ 14 | [(6, 14), (12, 28), (19, 48)], # P3/8 15 | [(29, 79), (46, 124), (142, 54)], # P4/16 16 | [(73, 198), (124, 330), (255, 504)] # P5/32 17 | ] 18 | 19 | model = dict( 20 | bbox_head=dict( 21 | head_module=dict(num_classes=num_classes), 22 | prior_generator=dict(base_sizes=anchors))) 23 | 24 | train_dataloader = dict( 25 | dataset=dict( 26 | type=dataset_type, 27 | data_root=data_root, 28 | ann_file='annotation_train.odgt', 29 | data_prefix=dict(img='Images/'))) 30 | 31 | val_dataloader = dict( 32 | dataset=dict( 33 | type=dataset_type, 34 | data_root=data_root, 35 | ann_file='annotation_val.odgt', 36 | data_prefix=dict(img='Images/'), 37 | # CrowdHumanMetric does not support out-of-order output images 38 | # for the time being, so batch_shapes_cfg is disabled here.
39 | batch_shapes_cfg=None)) 40 | test_dataloader = val_dataloader 41 | 42 | val_evaluator = dict( 43 | _delete_=True, 44 | type='mmdet.CrowdHumanMetric', 45 | ann_file=data_root + 'annotation_val.odgt', 46 | metric=['AP', 'MR', 'JI']) 47 | test_evaluator = val_evaluator 48 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa 2 | 3 | data_root = 'data/balloon/' 4 | # Path of train annotation file 5 | train_ann_file = 'train.json' 6 | train_data_prefix = 'train/' # Prefix of train image path 7 | # Path of val annotation file 8 | val_ann_file = 'val.json' 9 | val_data_prefix = 'val/' # Prefix of val image path 10 | metainfo = { 11 | 'classes': ('balloon', ), 12 | 'palette': [ 13 | (220, 20, 60), 14 | ] 15 | } 16 | num_classes = 1 17 | 18 | train_batch_size_per_gpu = 4 19 | train_num_workers = 2 20 | log_interval = 1 21 | ##################### 22 | train_dataloader = dict( 23 | batch_size=train_batch_size_per_gpu, 24 | num_workers=train_num_workers, 25 | dataset=dict( 26 | data_root=data_root, 27 | metainfo=metainfo, 28 | data_prefix=dict(img=train_data_prefix), 29 | ann_file=train_ann_file)) 30 | val_dataloader = dict( 31 | dataset=dict( 32 | data_root=data_root, 33 | metainfo=metainfo, 34 | data_prefix=dict(img=val_data_prefix), 35 | ann_file=val_ann_file)) 36 | test_dataloader = val_dataloader 37 | val_evaluator = dict(ann_file=data_root + val_ann_file) 38 | test_evaluator = val_evaluator 39 | default_hooks = dict(logger=dict(interval=log_interval)) 40 | ##################### 41 | 42 | model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) 43 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa 2 | 3 | # ========================modified parameters====================== 4 | mask_overlap = False # Polygon2Mask 5 | 6 | # ===============================Unmodified in most cases==================== 7 | model = dict(bbox_head=dict(mask_overlap=mask_overlap)) 8 | 9 | train_pipeline = [ 10 | *_base_.pre_transform, 11 | dict( 12 | type='Mosaic', 13 | img_scale=_base_.img_scale, 14 | pad_val=114.0, 15 | pre_transform=_base_.pre_transform), 16 | dict( 17 | type='YOLOv5RandomAffine', 18 | max_rotate_degree=0.0, 19 | max_shear_degree=0.0, 20 | scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), 21 | 
border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 22 | border_val=(114, 114, 114), 23 | min_area_ratio=_base_.min_area_ratio, 24 | max_aspect_ratio=_base_.max_aspect_ratio, 25 | use_mask_refine=True), 26 | dict( 27 | type='mmdet.Albu', 28 | transforms=_base_.albu_train_transforms, 29 | bbox_params=dict( 30 | type='BboxParams', 31 | format='pascal_voc', 32 | label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), 33 | keymap={ 34 | 'img': 'image', 35 | 'gt_bboxes': 'bboxes', 36 | }), 37 | dict(type='YOLOv5HSVRandomAug'), 38 | dict(type='mmdet.RandomFlip', prob=0.5), 39 | dict( 40 | type='Polygon2Mask', 41 | downsample_ratio=_base_.downsample_ratio, 42 | mask_overlap=mask_overlap), 43 | dict( 44 | type='PackDetInputs', 45 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 46 | 'flip_direction')) 47 | ] 48 | 49 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 50 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa 2 | 3 | deepen_factor = 1.33 4 | widen_factor = 1.25 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | # ========================modified parameters====================== 7 | deepen_factor = 0.33 8 | widen_factor = 0.25 9 | 10 | # ===============================Unmodified in most cases==================== 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | neck=dict( 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | ), 20 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 21 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | # ========================modified parameters====================== 7 | use_mask2refine = True 8 | min_area_ratio = 0.01 # YOLOv5RandomAffine 9 | 10 | # ===============================Unmodified in most cases==================== 11 | pre_transform = [ 12 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 13 | dict( 14 | type='LoadAnnotations', 15 | with_bbox=True, 16 | with_mask=True, 17 | mask2bbox=use_mask2refine) 18 | ] 19 | 20 | last_transform = [ 21 | # Delete gt_masks to avoid more computation 22 | 
dict(type='RemoveDataElement', keys=['gt_masks']), 23 | dict( 24 | type='mmdet.Albu', 25 | transforms=_base_.albu_train_transforms, 26 | bbox_params=dict( 27 | type='BboxParams', 28 | format='pascal_voc', 29 | label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), 30 | keymap={ 31 | 'img': 'image', 32 | 'gt_bboxes': 'bboxes' 33 | }), 34 | dict(type='YOLOv5HSVRandomAug'), 35 | dict(type='mmdet.RandomFlip', prob=0.5), 36 | dict( 37 | type='mmdet.PackDetInputs', 38 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 39 | 'flip_direction')) 40 | ] 41 | 42 | train_pipeline = [ 43 | *pre_transform, 44 | dict( 45 | type='Mosaic', 46 | img_scale=_base_.img_scale, 47 | pad_val=114.0, 48 | pre_transform=pre_transform), 49 | dict( 50 | type='YOLOv5RandomAffine', 51 | max_rotate_degree=0.0, 52 | max_shear_degree=0.0, 53 | scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), 54 | # img_scale is (width, height) 55 | border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 56 | border_val=(114, 114, 114), 57 | min_area_ratio=min_area_ratio, 58 | use_mask_refine=use_mask2refine), 59 | *last_transform 60 | ] 61 | 62 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 63 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config use refining bbox and `YOLOv5CopyPaste`. 4 | # Refining bbox means refining bbox by mask while loading annotations and 5 | # transforming after `YOLOv5RandomAffine` 6 | 7 | # ========================modified parameters====================== 8 | deepen_factor = 1.33 9 | widen_factor = 1.25 10 | 11 | # ===============================Unmodified in most cases==================== 12 | model = dict( 13 | backbone=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | neck=dict( 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor, 20 | ), 21 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' 2 | 3 | deepen_factor = 1.0 4 | widen_factor = 1.0 5 | train_batch_size_per_gpu = 32 6 | train_num_workers = 8 7 | 8 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth' # noqa 9 | 10 | model = dict( 11 | backbone=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | neck=dict( 16 | deepen_factor=deepen_factor, 17 | widen_factor=widen_factor, 18 | ), 19 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 20 | 21 | train_dataloader = dict( 22 | batch_size=train_batch_size_per_gpu, num_workers=train_num_workers) 23 | 24 | optim_wrapper = dict( 25 | optimizer=dict(batch_size_per_gpu=train_batch_size_per_gpu)) 26 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py: -------------------------------------------------------------------------------- 1 | 
_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' 2 | 3 | deepen_factor = 0.67 4 | widen_factor = 0.75 5 | 6 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth' # noqa 7 | 8 | model = dict( 9 | backbone=dict( 10 | deepen_factor=deepen_factor, 11 | widen_factor=widen_factor, 12 | ), 13 | neck=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth' # noqa 7 | 8 | model = dict( 9 | backbone=dict( 10 | deepen_factor=deepen_factor, 11 | widen_factor=widen_factor, 12 | ), 13 | neck=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/voc/yolov5_x-v61_fast_1xb32-50e_voc.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' 2 | 3 | deepen_factor = 1.33 4 | widen_factor = 1.25 5 | train_batch_size_per_gpu = 32 6 | train_num_workers = 8 7 | 8 | # TODO: need to add pretrained_model 9 | load_from = None 10 | 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | neck=dict( 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | ), 20 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 21 | 22 | train_dataloader = dict( 23 | batch_size=train_batch_size_per_gpu, num_workers=train_num_workers) 24 | 25 | optim_wrapper = dict( 26 | optimizer=dict(batch_size_per_gpu=train_batch_size_per_gpu)) 27 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | deepen_factor = 1.0 4 | widen_factor = 1.0 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | deepen_factor = 1.0 4 | widen_factor = 1.0 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | 
bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | model = dict( 7 | backbone=dict( 8 | deepen_factor=deepen_factor, 9 | widen_factor=widen_factor, 10 | ), 11 | neck=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | ), 15 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | anchors = [ 9 | [(68, 69), (154, 91), (143, 162)], # P3/8 10 | [(242, 160), (189, 287), (391, 207)], # P4/16 11 | [(353, 337), (539, 341), (443, 432)] # P5/32 12 | ] 13 | 14 | max_epochs = 40 15 | train_batch_size_per_gpu = 12 16 | train_num_workers = 4 17 | 18 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa 19 | 20 | model = dict( 21 | backbone=dict(frozen_stages=4), 22 | bbox_head=dict( 23 | head_module=dict(num_classes=num_classes), 24 | prior_generator=dict(base_sizes=anchors))) 25 | 26 | train_dataloader = dict( 27 | batch_size=train_batch_size_per_gpu, 28 | num_workers=train_num_workers, 29 | dataset=dict( 30 | data_root=data_root, 31 | metainfo=metainfo, 32 | ann_file='annotations/trainval.json', 33 | data_prefix=dict(img='images/'))) 34 | 35 | val_dataloader = dict( 36 | dataset=dict( 37 | metainfo=metainfo, 38 | data_root=data_root, 39 | ann_file='annotations/test.json', 40 | data_prefix=dict(img='images/'))) 41 | 42 | test_dataloader = val_dataloader 43 | 44 | _base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu 45 | 46 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 47 | test_evaluator = val_evaluator 48 | 49 | default_hooks = dict( 50 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 51 | # The warmup_mim_iter parameter is critical. 52 | # The default value is 1000 which is not suitable for cat datasets. 
53 | param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), 54 | logger=dict(type='LoggerHook', interval=5)) 55 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 56 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' 2 | 3 | model = dict( 4 | data_preprocessor=dict( 5 | type='YOLOv5DetDataPreprocessor', 6 | pad_size_divisor=32, 7 | batch_augments=[ 8 | dict( 9 | type='YOLOXBatchSyncRandomResize', 10 | random_size_range=(480, 800), 11 | size_divisor=32, 12 | interval=1) 13 | ])) 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' 2 | 3 | test_pipeline = [ 4 | dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 5 | dict( 6 | type='LetterResize', 7 | scale=_base_.img_scale, 8 | allow_scale_up=True, 9 | use_mini_pad=True), 10 | dict(type='LoadAnnotations', with_bbox=True), 11 | dict( 12 | type='mmdet.PackDetInputs', 13 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 14 | 'scale_factor', 'pad_param')) 15 | ] 16 | 17 | val_dataloader = dict( 18 | dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) 19 | test_dataloader = val_dataloader 20 | 21 | model = dict( 22 | test_cfg=dict( 23 | multi_label=False, score_thr=0.25, nms=dict(iou_threshold=0.45))) 24 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | data_root = 'data/balloon/' 5 | # Path of train annotation file 6 | train_ann_file = 'train.json' 7 | train_data_prefix = 'train/' # Prefix of train image path 8 | # Path of val annotation file 9 | val_ann_file = 'val.json' 10 | val_data_prefix = 'val/' # Prefix of val image path 11 | metainfo = { 12 | 'classes': ('balloon', ), 13 | 'palette': [ 14 | (220, 20, 60), 15 | ] 16 | } 17 | num_classes = 1 18 | 19 | train_batch_size_per_gpu = 4 20 | train_num_workers = 2 21 | log_interval = 1 22 | 23 | # =======================Unmodified in most cases================== 24 | train_dataloader = dict( 25 | batch_size=train_batch_size_per_gpu, 26 | num_workers=train_num_workers, 27 | dataset=dict( 28 | data_root=data_root, 29 | metainfo=metainfo, 30 | data_prefix=dict(img=train_data_prefix), 31 | ann_file=train_ann_file)) 32 | val_dataloader = dict( 33 | dataset=dict( 34 | data_root=data_root, 35 | metainfo=metainfo, 36 | data_prefix=dict(img=val_data_prefix), 37 | ann_file=val_ann_file)) 38 | test_dataloader = val_dataloader 39 | val_evaluator = dict(ann_file=data_root + val_ann_file) 40 | test_evaluator = val_evaluator 41 | model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) 42 | default_hooks = dict(logger=dict(interval=log_interval)) 43 | -------------------------------------------------------------------------------- 
/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' 2 | 3 | # fast means faster training speed, 4 | # but less flexibility for multitasking 5 | model = dict( 6 | data_preprocessor=dict( 7 | type='YOLOv5DetDataPreprocessor', 8 | mean=[0., 0., 0.], 9 | std=[255., 255., 255.], 10 | bgr_to_rgb=True)) 11 | 12 | train_dataloader = dict(collate_fn=dict(type='yolov5_collate')) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py' 2 | deepen_factor = 1.33 3 | widen_factor = 1.25 4 | 5 | model = dict( 6 | backbone=dict( 7 | deepen_factor=deepen_factor, 8 | widen_factor=widen_factor, 9 | ), 10 | neck=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' 2 | deepen_factor = 1.33 3 | widen_factor = 1.25 4 | 5 | model = dict( 6 | backbone=dict( 7 | deepen_factor=deepen_factor, 8 | widen_factor=widen_factor, 9 | ), 10 | neck=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | # ========================modified parameters====================== 7 | deepen_factor = 1.00 8 | widen_factor = 1.00 9 | 10 | mixup_prob = 0.15 11 | copypaste_prob = 0.3 12 | 13 | # =======================Unmodified in most cases================== 14 | img_scale = _base_.img_scale 15 | pre_transform = _base_.pre_transform 16 | last_transform = _base_.last_transform 17 | affine_scale = _base_.affine_scale 18 | 19 | model = dict( 20 | backbone=dict( 21 | deepen_factor=deepen_factor, 22 | widen_factor=widen_factor, 23 | ), 24 | neck=dict( 25 | deepen_factor=deepen_factor, 26 | widen_factor=widen_factor, 27 | ), 28 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 29 | 30 | mosaic_affine_transform = [ 31 | dict( 32 | type='Mosaic', 33 | img_scale=img_scale, 34 | pad_val=114.0, 35 | pre_transform=pre_transform), 36 | dict(type='YOLOv5CopyPaste', prob=copypaste_prob), 37 | dict( 38 | type='YOLOv5RandomAffine', 39 | max_rotate_degree=0.0, 40 | max_shear_degree=0.0, 41 | max_aspect_ratio=100., 42 | scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), 43 | # img_scale is (width, height) 44 | border=(-img_scale[0] // 2, -img_scale[1] // 2), 45 | border_val=(114, 114, 114), 46 | min_area_ratio=_base_.min_area_ratio, 47 | use_mask_refine=_base_.use_mask2refine) 48 | ] 49 
| 50 | train_pipeline = [ 51 | *pre_transform, *mosaic_affine_transform, 52 | dict( 53 | type='YOLOv5MixUp', 54 | prob=mixup_prob, 55 | pre_transform=[*pre_transform, *mosaic_affine_transform]), 56 | *last_transform 57 | ] 58 | 59 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 60 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | # TODO: Update the training hyperparameters 5 | deepen_factor = 1.0 6 | widen_factor = 1.0 7 | 8 | # =======================Unmodified in most cases================== 9 | model = dict( 10 | backbone=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | neck=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | ), 18 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 19 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | # TODO: Update the training hyperparameters 5 | deepen_factor = 0.67 6 | widen_factor = 0.75 7 | 8 | # =======================Unmodified in most cases================== 9 | model = dict( 10 | backbone=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | neck=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | ), 18 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 19 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | # ========================modified parameters====================== 7 | deepen_factor = 0.33 8 | widen_factor = 0.25 9 | 10 | # ===============================Unmodified in most cases==================== 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | ), 16 | neck=dict( 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | ), 20 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 21 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.33 5 | widen_factor = 0.25 6 | 7 | # =======================Unmodified in most cases================== 8 | model = dict( 9 | backbone=dict( 10 | deepen_factor=deepen_factor, 11 | widen_factor=widen_factor, 12 | ), 13 | neck=dict( 14 | 
deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.33 5 | widen_factor = 1.25 6 | 7 | # =======================Unmodified in most cases================== 8 | model = dict( 9 | backbone=dict( 10 | deepen_factor=deepen_factor, 11 | widen_factor=widen_factor, 12 | ), 13 | neck=dict( 14 | deepen_factor=deepen_factor, 15 | widen_factor=widen_factor, 16 | ), 17 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov5u_l_syncbn_fast_8xb16-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | # TODO: Update the training hyperparameters 5 | deepen_factor = 1.33 6 | widen_factor = 1.25 7 | 8 | # =======================Unmodified in most cases================== 9 | model = dict( 10 | backbone=dict( 11 | deepen_factor=deepen_factor, 12 | widen_factor=widen_factor, 13 | ), 14 | neck=dict( 15 | deepen_factor=deepen_factor, 16 | widen_factor=widen_factor, 17 | ), 18 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 19 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_m_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 1 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 1 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | hidden_ratio=1. / 2, 16 | block_cfg=dict( 17 | type='ConvWrapper', 18 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), 19 | act_cfg=dict(type='SiLU', inplace=True)), 20 | neck=dict( 21 | deepen_factor=deepen_factor, 22 | widen_factor=widen_factor, 23 | hidden_ratio=1. 
/ 2, 24 | block_cfg=dict( 25 | type='ConvWrapper', 26 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), 27 | block_act_cfg=dict(type='SiLU', inplace=True)), 28 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 29 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.6 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.75 9 | 10 | # -----train val related----- 11 | affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict( 16 | type='YOLOv6CSPBep', 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | hidden_ratio=2. / 3, 20 | block_cfg=dict(type='RepVGGBlock'), 21 | act_cfg=dict(type='ReLU', inplace=True)), 22 | neck=dict( 23 | type='YOLOv6CSPRepPAFPN', 24 | deepen_factor=deepen_factor, 25 | widen_factor=widen_factor, 26 | block_cfg=dict(type='RepVGGBlock'), 27 | hidden_ratio=2. / 3, 28 | block_act_cfg=dict(type='ReLU', inplace=True)), 29 | bbox_head=dict( 30 | type='YOLOv6Head', head_module=dict(widen_factor=widen_factor))) 31 | 32 | mosaic_affine_pipeline = [ 33 | dict( 34 | type='Mosaic', 35 | img_scale=_base_.img_scale, 36 | pad_val=114.0, 37 | pre_transform=_base_.pre_transform), 38 | dict( 39 | type='YOLOv5RandomAffine', 40 | max_rotate_degree=0.0, 41 | max_shear_degree=0.0, 42 | scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), 43 | # img_scale is (width, height) 44 | border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 45 | border_val=(114, 114, 114)) 46 | ] 47 | 48 | train_pipeline = [ 49 | *_base_.pre_transform, *mosaic_affine_pipeline, 50 | dict( 51 | type='YOLOv5MixUp', 52 | prob=0.1, 53 | pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), 54 | dict(type='YOLOv5HSVRandomAug'), 55 | dict(type='mmdet.RandomFlip', prob=0.5), 56 | dict( 57 | type='mmdet.PackDetInputs', 58 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 59 | 'flip_direction')) 60 | ] 61 | 62 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 63 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.25 9 | 10 | # -----train val related----- 11 | lr_factor = 0.02 # Learning rate scaling factor 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 16 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 17 | bbox_head=dict( 
18 | head_module=dict(widen_factor=widen_factor), 19 | loss_bbox=dict(iou_mode='siou'))) 20 | 21 | default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.25 9 | 10 | # -----train val related----- 11 | lr_factor = 0.02 # Learning rate scaling factor 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 16 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 17 | bbox_head=dict( 18 | head_module=dict(widen_factor=widen_factor), 19 | loss_bbox=dict(iou_mode='siou'))) 20 | 21 | default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_s_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | max_epochs = 40 9 | train_batch_size_per_gpu = 12 10 | train_num_workers = 4 11 | num_last_epochs = 5 12 | 13 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa 14 | 15 | model = dict( 16 | backbone=dict(frozen_stages=4), 17 | bbox_head=dict(head_module=dict(num_classes=num_classes)), 18 | train_cfg=dict( 19 | initial_assigner=dict(num_classes=num_classes), 20 | assigner=dict(num_classes=num_classes))) 21 | 22 | train_dataloader = dict( 23 | batch_size=train_batch_size_per_gpu, 24 | num_workers=train_num_workers, 25 | dataset=dict( 26 | data_root=data_root, 27 | metainfo=metainfo, 28 | ann_file='annotations/trainval.json', 29 | data_prefix=dict(img='images/'))) 30 | 31 | val_dataloader = dict( 32 | dataset=dict( 33 | metainfo=metainfo, 34 | data_root=data_root, 35 | ann_file='annotations/test.json', 36 | data_prefix=dict(img='images/'))) 37 | 38 | test_dataloader = val_dataloader 39 | 40 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 41 | test_evaluator = val_evaluator 42 | 43 | _base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu 44 | _base_.custom_hooks[1].switch_epoch = max_epochs - num_last_epochs 45 | 46 | default_hooks = dict( 47 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 48 | # The warmup_mim_iter parameter is critical. 49 | # The default value is 1000 which is not suitable for cat datasets. 
50 | param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), 51 | logger=dict(type='LoggerHook', interval=5)) 52 | train_cfg = dict( 53 | max_epochs=max_epochs, 54 | val_interval=10, 55 | dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) 56 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | # ======================= Frequently modified parameters ===================== 4 | # -----train val related----- 5 | # Base learning rate for optim_wrapper 6 | max_epochs = 300 # Maximum training epochs 7 | num_last_epochs = 15 # Last epoch number to switch training pipeline 8 | 9 | # ============================== Unmodified in most cases =================== 10 | default_hooks = dict( 11 | param_scheduler=dict( 12 | type='YOLOv5ParamSchedulerHook', 13 | scheduler_type='cosine', 14 | lr_factor=0.01, 15 | max_epochs=max_epochs)) 16 | 17 | custom_hooks = [ 18 | dict( 19 | type='EMAHook', 20 | ema_type='ExpMomentumEMA', 21 | momentum=0.0001, 22 | update_buffers=True, 23 | strict_load=False, 24 | priority=49), 25 | dict( 26 | type='mmdet.PipelineSwitchHook', 27 | switch_epoch=max_epochs - num_last_epochs, 28 | switch_pipeline=_base_.train_pipeline_stage2) 29 | ] 30 | 31 | train_cfg = dict( 32 | max_epochs=max_epochs, 33 | dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) 34 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.375 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 | bbox_head=dict( 15 | type='YOLOv6Head', 16 | head_module=dict(widen_factor=widen_factor), 17 | loss_bbox=dict(iou_mode='siou'))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.375 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 | bbox_head=dict( 15 | 
type='YOLOv6Head', 16 | head_module=dict(widen_factor=widen_factor), 17 | loss_bbox=dict(iou_mode='siou'))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 1 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 1 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict( 13 | deepen_factor=deepen_factor, 14 | widen_factor=widen_factor, 15 | hidden_ratio=1. / 2, 16 | block_cfg=dict( 17 | type='ConvWrapper', 18 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), 19 | act_cfg=dict(type='SiLU', inplace=True)), 20 | neck=dict( 21 | deepen_factor=deepen_factor, 22 | widen_factor=widen_factor, 23 | hidden_ratio=1. / 2, 24 | block_cfg=dict( 25 | type='ConvWrapper', 26 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), 27 | block_act_cfg=dict(type='SiLU', inplace=True)), 28 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 29 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.6 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.75 9 | 10 | # -----train val related----- 11 | affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict( 16 | type='YOLOv6CSPBep', 17 | deepen_factor=deepen_factor, 18 | widen_factor=widen_factor, 19 | hidden_ratio=2. / 3, 20 | block_cfg=dict(type='RepVGGBlock'), 21 | act_cfg=dict(type='ReLU', inplace=True)), 22 | neck=dict( 23 | type='YOLOv6CSPRepBiPAFPN', 24 | deepen_factor=deepen_factor, 25 | widen_factor=widen_factor, 26 | block_cfg=dict(type='RepVGGBlock'), 27 | hidden_ratio=2. 
/ 3, 28 | block_act_cfg=dict(type='ReLU', inplace=True)), 29 | bbox_head=dict( 30 | type='YOLOv6Head', 31 | head_module=dict(reg_max=16, widen_factor=widen_factor))) 32 | 33 | mosaic_affine_pipeline = [ 34 | dict( 35 | type='Mosaic', 36 | img_scale=_base_.img_scale, 37 | pad_val=114.0, 38 | pre_transform=_base_.pre_transform), 39 | dict( 40 | type='YOLOv5RandomAffine', 41 | max_rotate_degree=0.0, 42 | max_shear_degree=0.0, 43 | scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), 44 | # img_scale is (width, height) 45 | border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 46 | border_val=(114, 114, 114)) 47 | ] 48 | 49 | train_pipeline = [ 50 | *_base_.pre_transform, *mosaic_affine_pipeline, 51 | dict( 52 | type='YOLOv5MixUp', 53 | prob=0.1, 54 | pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), 55 | dict(type='YOLOv5HSVRandomAug'), 56 | dict(type='mmdet.RandomFlip', prob=0.5), 57 | dict( 58 | type='mmdet.PackDetInputs', 59 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', 60 | 'flip_direction')) 61 | ] 62 | 63 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 64 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.25 9 | 10 | # -----train val related----- 11 | lr_factor = 0.02 # Learning rate scaling factor 12 | 13 | # ============================== Unmodified in most cases =================== 14 | model = dict( 15 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 16 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 17 | bbox_head=dict( 18 | head_module=dict(widen_factor=widen_factor), 19 | loss_bbox=dict(iou_mode='siou'))) 20 | 21 | default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' 2 | 3 | # ======================= Possible modified parameters ======================= 4 | # -----model related----- 5 | # The scaling factor that controls the depth of the network structure 6 | deepen_factor = 0.33 7 | # The scaling factor that controls the width of the network structure 8 | widen_factor = 0.375 9 | 10 | # ============================== Unmodified in most cases =================== 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 | bbox_head=dict( 15 | type='YOLOv6Head', 16 | head_module=dict(widen_factor=widen_factor), 17 | loss_bbox=dict(iou_mode='siou'))) 18 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py: -------------------------------------------------------------------------------- 1 | 
_base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | model = dict( 4 | backbone=dict(arch='D'), 5 | neck=dict( 6 | use_maxpool_in_downsample=True, 7 | use_in_channels_in_downsample=True, 8 | block_cfg=dict( 9 | type='ELANBlock', 10 | middle_ratio=0.4, 11 | block_ratio=0.2, 12 | num_blocks=6, 13 | num_convs_in_block=1), 14 | in_channels=[384, 768, 1152, 1536], 15 | out_channels=[192, 384, 576, 768]), 16 | bbox_head=dict( 17 | head_module=dict( 18 | in_channels=[192, 384, 576, 768], 19 | main_out_channels=[384, 768, 1152, 1536], 20 | aux_out_channels=[384, 768, 1152, 1536], 21 | ))) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | model = dict( 4 | backbone=dict(arch='E'), 5 | neck=dict( 6 | use_maxpool_in_downsample=True, 7 | use_in_channels_in_downsample=True, 8 | block_cfg=dict( 9 | type='ELANBlock', 10 | middle_ratio=0.4, 11 | block_ratio=0.2, 12 | num_blocks=6, 13 | num_convs_in_block=1), 14 | in_channels=[320, 640, 960, 1280], 15 | out_channels=[160, 320, 480, 640]), 16 | bbox_head=dict( 17 | head_module=dict( 18 | in_channels=[160, 320, 480, 640], 19 | main_out_channels=[320, 640, 960, 1280]))) 20 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | model = dict( 4 | backbone=dict(arch='E2E'), 5 | neck=dict( 6 | use_maxpool_in_downsample=True, 7 | use_in_channels_in_downsample=True, 8 | block_cfg=dict( 9 | type='EELANBlock', 10 | num_elan_block=2, 11 | middle_ratio=0.4, 12 | block_ratio=0.2, 13 | num_blocks=6, 14 | num_convs_in_block=1), 15 | in_channels=[320, 640, 960, 1280], 16 | out_channels=[160, 320, 480, 640]), 17 | bbox_head=dict( 18 | head_module=dict( 19 | in_channels=[160, 320, 480, 640], 20 | main_out_channels=[320, 640, 960, 1280]))) 21 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_tiny_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov7_tiny_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | anchors = [ 9 | [(68, 69), (154, 91), (143, 162)], # P3/8 10 | [(242, 160), (189, 287), (391, 207)], # P4/16 11 | [(353, 337), (539, 341), (443, 432)] # P5/32 12 | ] 13 | 14 | max_epochs = 40 15 | train_batch_size_per_gpu = 12 16 | train_num_workers = 4 17 | 18 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa 19 | 20 | model = dict( 21 | backbone=dict(frozen_stages=4), 22 | bbox_head=dict( 23 | head_module=dict(num_classes=num_classes), 24 | prior_generator=dict(base_sizes=anchors))) 25 | 26 | train_dataloader = dict( 27 | batch_size=train_batch_size_per_gpu, 28 | num_workers=train_num_workers, 29 | dataset=dict( 30 | data_root=data_root, 31 | metainfo=metainfo, 32 | ann_file='annotations/trainval.json', 33 | 
data_prefix=dict(img='images/'))) 34 | 35 | val_dataloader = dict( 36 | dataset=dict( 37 | metainfo=metainfo, 38 | data_root=data_root, 39 | ann_file='annotations/test.json', 40 | data_prefix=dict(img='images/'))) 41 | 42 | test_dataloader = val_dataloader 43 | 44 | _base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu 45 | 46 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 47 | test_evaluator = val_evaluator 48 | 49 | default_hooks = dict( 50 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 51 | # The warmup_mim_iter parameter is critical. 52 | # The default value is 1000 which is not suitable for cat datasets. 53 | param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), 54 | logger=dict(type='LoggerHook', interval=5)) 55 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 56 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov7_l_syncbn_fast_8x16b-300e_coco.py' 2 | 3 | model = dict( 4 | backbone=dict(arch='X'), 5 | neck=dict( 6 | in_channels=[640, 1280, 1280], 7 | out_channels=[160, 320, 640], 8 | block_cfg=dict( 9 | type='ELANBlock', 10 | middle_ratio=0.4, 11 | block_ratio=0.4, 12 | num_blocks=3, 13 | num_convs_in_block=2), 14 | use_repconv_outs=False), 15 | bbox_head=dict(head_module=dict(in_channels=[320, 640, 1280]))) 16 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | # This config uses bbox refining and `YOLOv5CopyPaste`.
4 | # Refining bbox means refining bbox by mask while loading annotations and 5 | # transforming after `YOLOv5RandomAffine` 6 | 7 | # ========================modified parameters====================== 8 | deepen_factor = 1.00 9 | widen_factor = 1.00 10 | last_stage_out_channels = 512 11 | 12 | mixup_prob = 0.15 13 | copypaste_prob = 0.3 14 | 15 | # =======================Unmodified in most cases================== 16 | img_scale = _base_.img_scale 17 | pre_transform = _base_.pre_transform 18 | last_transform = _base_.last_transform 19 | affine_scale = _base_.affine_scale 20 | 21 | model = dict( 22 | backbone=dict( 23 | last_stage_out_channels=last_stage_out_channels, 24 | deepen_factor=deepen_factor, 25 | widen_factor=widen_factor), 26 | neck=dict( 27 | deepen_factor=deepen_factor, 28 | widen_factor=widen_factor, 29 | in_channels=[256, 512, last_stage_out_channels], 30 | out_channels=[256, 512, last_stage_out_channels]), 31 | bbox_head=dict( 32 | head_module=dict( 33 | widen_factor=widen_factor, 34 | in_channels=[256, 512, last_stage_out_channels]))) 35 | 36 | mosaic_affine_transform = [ 37 | dict( 38 | type='Mosaic', 39 | img_scale=img_scale, 40 | pad_val=114.0, 41 | pre_transform=pre_transform), 42 | dict(type='YOLOv5CopyPaste', prob=copypaste_prob), 43 | dict( 44 | type='YOLOv5RandomAffine', 45 | max_rotate_degree=0.0, 46 | max_shear_degree=0.0, 47 | max_aspect_ratio=100., 48 | scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), 49 | # img_scale is (width, height) 50 | border=(-img_scale[0] // 2, -img_scale[1] // 2), 51 | border_val=(114, 114, 114), 52 | min_area_ratio=_base_.min_area_ratio, 53 | use_mask_refine=_base_.use_mask2refine) 54 | ] 55 | 56 | train_pipeline = [ 57 | *pre_transform, *mosaic_affine_transform, 58 | dict( 59 | type='YOLOv5MixUp', 60 | prob=mixup_prob, 61 | pre_transform=[*pre_transform, *mosaic_affine_transform]), 62 | *last_transform 63 | ] 64 | 65 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 66 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_m_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.00 5 | widen_factor = 1.00 6 | last_stage_out_channels = 512 7 | 8 | mixup_prob = 0.15 9 | 10 | # =======================Unmodified in most cases================== 11 | pre_transform = _base_.pre_transform 12 | mosaic_affine_transform = _base_.mosaic_affine_transform 13 | last_transform = _base_.last_transform 14 | 15 | model = dict( 16 | backbone=dict( 17 | last_stage_out_channels=last_stage_out_channels, 18 | deepen_factor=deepen_factor, 19 | widen_factor=widen_factor), 20 | neck=dict( 21 | deepen_factor=deepen_factor, 22 | widen_factor=widen_factor, 23 | in_channels=[256, 512, last_stage_out_channels], 24 | out_channels=[256, 512, last_stage_out_channels]), 25 | bbox_head=dict( 26 | head_module=dict( 27 | widen_factor=widen_factor, 28 | in_channels=[256, 512, last_stage_out_channels]))) 29 | 30 | train_pipeline = [ 31 | *pre_transform, *mosaic_affine_transform, 32 | dict( 33 | type='YOLOv5MixUp', 34 | prob=mixup_prob, 35 | pre_transform=[*pre_transform, *mosaic_affine_transform]), 36 | *last_transform 37 | ] 38 | 39 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 40 | 
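The two YOLOv8-L configs above follow the scaling pattern used across these mmyolo configs: a child file inherits `_base_` and overrides only `deepen_factor`, `widen_factor` and, for the L/X sizes, `last_stage_out_channels`, then re-declares `train_pipeline` so the new `mixup_prob` takes effect. A minimal sketch for checking that the overrides actually propagate through the inheritance chain, assuming mmengine is installed and the command is run from the repository root:

# Print the effective scaling factors of the merged config
# (expected output for the L config: 1.0 1.0).
python3 -c "
from mmengine.config import Config
cfg = Config.fromfile(
    'third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py')
print(cfg.model.backbone.deepen_factor, cfg.model.backbone.widen_factor)
"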
-------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | # This config will refine bbox by mask while loading annotations and 4 | # transforming after `YOLOv5RandomAffine` 5 | 6 | deepen_factor = 0.33 7 | widen_factor = 0.25 8 | 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | deepen_factor = 0.33 4 | widen_factor = 0.25 5 | 6 | model = dict( 7 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 8 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 9 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_s_fast_1xb12-40e_cat.py: -------------------------------------------------------------------------------- 1 | _base_ = 'yolov8_s_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | data_root = './data/cat/' 4 | class_name = ('cat', ) 5 | num_classes = len(class_name) 6 | metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) 7 | 8 | close_mosaic_epochs = 5 9 | 10 | max_epochs = 40 11 | train_batch_size_per_gpu = 12 12 | train_num_workers = 4 13 | 14 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth' # noqa 15 | 16 | model = dict( 17 | backbone=dict(frozen_stages=4), 18 | bbox_head=dict(head_module=dict(num_classes=num_classes)), 19 | train_cfg=dict(assigner=dict(num_classes=num_classes))) 20 | 21 | train_dataloader = dict( 22 | batch_size=train_batch_size_per_gpu, 23 | num_workers=train_num_workers, 24 | dataset=dict( 25 | data_root=data_root, 26 | metainfo=metainfo, 27 | ann_file='annotations/trainval.json', 28 | data_prefix=dict(img='images/'))) 29 | 30 | val_dataloader = dict( 31 | dataset=dict( 32 | metainfo=metainfo, 33 | data_root=data_root, 34 | ann_file='annotations/test.json', 35 | data_prefix=dict(img='images/'))) 36 | 37 | test_dataloader = val_dataloader 38 | 39 | _base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu 40 | _base_.custom_hooks[1].switch_epoch = max_epochs - close_mosaic_epochs 41 | 42 | val_evaluator = dict(ann_file=data_root + 'annotations/test.json') 43 | test_evaluator = val_evaluator 44 | 45 | default_hooks = dict( 46 | checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), 47 | # The warmup_mim_iter parameter is critical. 48 | # The default value is 1000 which is not suitable for cat datasets. 
49 | param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), 50 | logger=dict(type='LoggerHook', interval=5)) 51 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 52 | # visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa 53 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | # This config uses bbox refining and `YOLOv5CopyPaste`. 4 | # Refining bbox means refining bbox by mask while loading annotations and 5 | # transforming after `YOLOv5RandomAffine` 6 | 7 | deepen_factor = 1.00 8 | widen_factor = 1.25 9 | 10 | model = dict( 11 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 14 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolov8_l_syncbn_fast_8xb16-500e_coco.py' 2 | 3 | deepen_factor = 1.00 4 | widen_factor = 1.25 5 | 6 | model = dict( 7 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 8 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 9 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 10 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py'] 2 | 3 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 1.0 7 | widen_factor = 1.0 8 | 9 | # =======================Unmodified in most cases================== 10 | # model settings 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py'] 2 | 3 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth' # noqa 4 | 5 | # ========================modified parameters====================== 6 | deepen_factor = 0.67 7 | widen_factor = 0.75 8 | 9 | # =======================Unmodified in most cases================== 10 | # model settings 11 | model = dict( 12 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 13 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 14 |
bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 15 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py' 2 | 3 | load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth' # noqa 4 | 5 | deepen_factor = 0.33 6 | widen_factor = 0.375 7 | scaling_ratio_range = (0.75, 1.0) 8 | 9 | # model settings 10 | model = dict( 11 | data_preprocessor=dict(batch_augments=[ 12 | dict( 13 | type='YOLOXBatchSyncRandomResize', 14 | random_size_range=(320, 640), 15 | size_divisor=32, 16 | interval=1) 17 | ]), 18 | backbone=dict( 19 | deepen_factor=deepen_factor, 20 | widen_factor=widen_factor, 21 | ), 22 | neck=dict( 23 | deepen_factor=deepen_factor, 24 | widen_factor=widen_factor, 25 | ), 26 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 27 | 28 | # data settings 29 | img_scale = _base_.img_scale 30 | pre_transform = _base_.pre_transform 31 | 32 | train_pipeline_stage1 = [ 33 | *pre_transform, 34 | dict( 35 | type='Mosaic', 36 | img_scale=img_scale, 37 | pad_val=114.0, 38 | pre_transform=pre_transform), 39 | dict( 40 | type='RandomAffine', 41 | scaling_ratio_range=scaling_ratio_range, 42 | border=(-img_scale[0] // 2, -img_scale[1] // 2)), 43 | dict(type='mmdet.YOLOXHSVRandomAug'), 44 | dict(type='RandomFlip', prob=0.5), 45 | dict( 46 | type='FilterAnnotations', 47 | by_keypoints=True, 48 | min_gt_bbox_wh=(1, 1), 49 | keep_empty=False), 50 | dict( 51 | type='PackDetInputs', 52 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) 53 | ] 54 | 55 | test_pipeline = [ 56 | *pre_transform, 57 | dict(type='Resize', scale=(416, 416), keep_ratio=True), 58 | dict( 59 | type='mmdet.Pad', 60 | pad_to_square=True, 61 | pad_val=dict(img=(114.0, 114.0, 114.0))), 62 | dict( 63 | type='PackDetInputs', 64 | meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', 65 | 'scale_factor', 'flip_indices')) 66 | ] 67 | 68 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline_stage1)) 69 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 70 | test_dataloader = val_dataloader 71 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_l_fast_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_s_fast_8xb8-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.0 5 | widen_factor = 1.0 6 | 7 | # =======================Unmodified in most cases================== 8 | # model settings 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.67 5 | 
widen_factor = 0.75 6 | 7 | # =======================Unmodified in most cases================== 8 | # model settings 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_m_fast_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_s_fast_8xb8-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.67 5 | widen_factor = 0.75 6 | 7 | # =======================Unmodified in most cases================== 8 | # model settings 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.33 5 | widen_factor = 0.25 6 | use_depthwise = True 7 | 8 | # =======================Unmodified in most cases================== 9 | # model settings 10 | model = dict( 11 | backbone=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | use_depthwise=use_depthwise), 15 | neck=dict( 16 | deepen_factor=deepen_factor, 17 | widen_factor=widen_factor, 18 | use_depthwise=use_depthwise), 19 | bbox_head=dict( 20 | head_module=dict( 21 | widen_factor=widen_factor, use_depthwise=use_depthwise))) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_tiny_fast_8xb8-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 0.33 5 | widen_factor = 0.25 6 | use_depthwise = True 7 | 8 | # =======================Unmodified in most cases================== 9 | # model settings 10 | model = dict( 11 | backbone=dict( 12 | deepen_factor=deepen_factor, 13 | widen_factor=widen_factor, 14 | use_depthwise=use_depthwise), 15 | neck=dict( 16 | deepen_factor=deepen_factor, 17 | widen_factor=widen_factor, 18 | use_depthwise=use_depthwise), 19 | bbox_head=dict( 20 | head_module=dict( 21 | widen_factor=widen_factor, use_depthwise=use_depthwise))) 22 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_p5_tta.py: -------------------------------------------------------------------------------- 1 | # TODO: Need to solve the problem of multiple backend_args parameters 2 | # _backend_args = dict( 3 | # backend='petrel', 4 | # path_mapping=dict({ 5 | # './data/': 's3://openmmlab/datasets/detection/', 6 | # 'data/': 's3://openmmlab/datasets/detection/' 7 | # })) 8 | 9 | _backend_args = None 10 | 11 | tta_model = dict( 12 | type='mmdet.DetTTAModel', 13 | tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), 
max_per_img=300)) 14 | 15 | img_scales = [(640, 640), (320, 320), (960, 960)] 16 | 17 | # LoadImageFromFile 18 | # / | \ 19 | # Resize Resize Resize # noqa 20 | # / \ / \ / \ 21 | # RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip # noqa 22 | # | | | | | | 23 | # LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn 24 | # | | | | | | 25 | # PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa 26 | 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=_backend_args), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='mmdet.Resize', scale=s, keep_ratio=True) 34 | for s in img_scales 35 | ], 36 | [ 37 | # ``RandomFlip`` must be placed before ``Pad``, otherwise 38 | # bounding box coordinates after flipping cannot be 39 | # recovered correctly. 40 | dict(type='mmdet.RandomFlip', prob=1.), 41 | dict(type='mmdet.RandomFlip', prob=0.) 42 | ], 43 | [ 44 | dict( 45 | type='mmdet.Pad', 46 | pad_to_square=True, 47 | pad_val=dict(img=(114.0, 114.0, 114.0))), 48 | ], 49 | [ 50 | dict( 51 | type='mmdet.PackDetInputs', 52 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 53 | 'scale_factor', 'flip', 'flip_direction')) 54 | ] 55 | ]) 56 | ] 57 | -------------------------------------------------------------------------------- /third_party/mmyolo/configs/yolox/yolox_x_fast_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = './yolox_s_fast_8xb8-300e_coco.py' 2 | 3 | # ========================modified parameters====================== 4 | deepen_factor = 1.33 5 | widen_factor = 1.25 6 | 7 | # =======================Unmodified in most cases================== 8 | # model settings 9 | model = dict( 10 | backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 11 | neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), 12 | bbox_head=dict(head_module=dict(widen_factor=widen_factor))) 13 | -------------------------------------------------------------------------------- /tools/count_num_parameters.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | 5 | def parse_args(): 6 | 7 | parser = argparse.ArgumentParser("Compute the number of parameters of a model") 8 | parser.add_argument('checkpoint', type=str, help='model checkpoint path') 9 | 10 | args = parser.parse_args() 11 | return args 12 | 13 | 14 | if __name__ == '__main__': 15 | args = parse_args() 16 | 17 | # load checkpoint 18 | model = torch.load(args.checkpoint, map_location='cpu') 19 | state_dict = model['state_dict'] 20 | num_parameters = 0 21 | 22 | for k, v in state_dict.items(): 23 | num_parameters += v.numel() 24 | 25 | print(f'num_parameters: {num_parameters} | {num_parameters / 1e6:.2f} M') -------------------------------------------------------------------------------- /tools/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29588} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python3 -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 |
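dist_test.sh wraps `torch.distributed.launch` around `tools/test.py`: the three positional arguments are the config, the checkpoint and the GPU count, everything after them is forwarded to `test.py` through `${@:4}`, and `NNODES`, `NODE_RANK`, `PORT` and `MASTER_ADDR` can be overridden via environment variables. A hedged usage sketch with placeholder paths — substitute a real config and checkpoint; `--work-dir` is the usual mm-style `test.py` flag and is assumed to exist here:

# Single-node evaluation on 8 GPUs; overriding PORT avoids rendezvous
# clashes with other jobs on the same machine.
PORT=29601 bash tools/dist_test.sh \
    path/to/config.py path/to/checkpoint.pth 8 \
    --work-dir work_dirs/eval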
-------------------------------------------------------------------------------- /tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${MASTER_PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python3 -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /tools/evaluate_latency.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-861/developer-guide/index.html#trtexec 3 | 4 | trtexec --onnx=path_to_onnx_file \ 5 | --fp16 \ 6 | --iterations=2000 \ 7 | --verbose \ 8 | --device=0 -------------------------------------------------------------------------------- /tools/generate_image_prompts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tqdm 3 | import argparse 4 | import os.path as osp 5 | import numpy as np 6 | from PIL import Image 7 | from transformers import (AutoTokenizer, AutoProcessor, 8 | CLIPVisionModelWithProjection, 9 | CLIPTextModelWithProjection) 10 | 11 | if __name__ == "__main__": 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | '--model', 16 | type=str, 17 | default='../pretrained_models/open-ai-clip-vit-base-patch32') 18 | parser.add_argument('--image-dir', type=str, default='data/samples.txt') 19 | parser.add_argument('--out-dir', type=str, default='') 20 | parser.add_argument('--out-file', type=str) 21 | 22 | args = parser.parse_args() 23 | 24 | tokenizer = AutoTokenizer.from_pretrained(args.model) 25 | vision_model = CLIPVisionModelWithProjection.from_pretrained(args.model) 26 | text_model = CLIPTextModelWithProjection.from_pretrained(args.model) 27 | processor = AutoProcessor.from_pretrained(args.model) 28 | 29 | # padding prompts 30 | device = 'cuda:0' 31 | text_model.to(device) 32 | texts = tokenizer(text=[' '], return_tensors='pt', padding=True) 33 | texts = texts.to(device) 34 | text_outputs = text_model(**texts) 35 | txt_feats = text_outputs.text_embeds 36 | txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True) 37 | txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1]).cpu().data.numpy() 38 | 39 | images = os.listdir(args.image_dir) 40 | category_embeds = [] 41 | 42 | def _forward_vision_model(image_name): 43 | image_path = osp.join(args.image_dir, image_name) 44 | # category = image_name.split('-')[1] 45 | image = Image.open(image_path).convert("RGB") 46 | inputs = processor(images=image, return_tensors="pt", padding=True) 47 | image_outputs = vision_model(**inputs) 48 | img_feats = image_outputs.image_embeds 49 | # img_feats 50 | img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True) 51 | img_feats = img_feats.reshape( 52 | -1, img_feats.shape[-1])[0].cpu().data.numpy() 53 | category_embeds.append(img_feats) 54 | 55 | for image_ in tqdm.tqdm(images): 56 | _forward_vision_model(image_) 57 | category_embeds.append(txt_feats) 58 | category_embeds = np.stack(category_embeds) 59 | np.save(osp.join(args.out_dir, args.out_file), 
category_embeds) 60 | -------------------------------------------------------------------------------- /tools/generate_text_prompts_dosod.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | import numpy as np 5 | import torch 6 | from mmdet.apis import init_detector 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('config', type=str, help='Path to config file') 11 | parser.add_argument('checkpoint', type=str, help='Path to checkpoint file') 12 | parser.add_argument('--text', 13 | type=str, 14 | default='data/texts/coco_class_texts.json', 15 | help='Path to text file') 16 | parser.add_argument('--out-dir', type=str, help='The dir to save text embeddings npy') 17 | parser.add_argument('--device', 18 | default='cuda:0', 19 | help='Device used for run') 20 | 21 | args = parser.parse_args() 22 | 23 | device = args.device 24 | 25 | with open(args.text) as f: 26 | data = json.load(f) 27 | texts = [x[0] for x in data] 28 | 29 | # generate text embeddings 30 | print('init model......') 31 | model = init_detector(args.config, args.checkpoint, device=device) 32 | model.eval() 33 | 34 | print('start to generate text embeddings......') 35 | with torch.no_grad(): 36 | text_embeddings = model.backbone_text([texts], enable_assertion=False) 37 | text_embeddings = model.bbox_head.head_module.forward_text(text_embeddings) 38 | text_embeddings = text_embeddings.reshape(-1, text_embeddings.shape[-1]) 39 | 40 | print('start to save text embeddings......') 41 | os.makedirs(args.out_dir, exist_ok=True) 42 | text_embeddings = text_embeddings.cpu().data.numpy() 43 | np.save(os.path.join(args.out_dir, 44 | os.path.splitext(os.path.basename(args.text))[0] + '_' + os.path.splitext(os.path.basename(args.checkpoint))[0]) + ".npy", 45 | text_embeddings) 46 | -------------------------------------------------------------------------------- /tools/generate_text_prompts_yoloworld.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import numpy as np 4 | from transformers import (AutoTokenizer, CLIPTextModelWithProjection) 5 | 6 | 7 | if __name__ == "__main__": 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | '--model', 12 | type=str, 13 | default='/horizon-bucket/AIoT-data-bucket/yonghao01.he/pretrain_models/clip-vit-base-patch32') 14 | parser.add_argument('--text', 15 | type=str, 16 | default='/home/users/yonghao01.he/projects/YOLO-World-Workspace/yolo-world-reparameterize-show/open_word.json') 17 | parser.add_argument('--out', type=str, default='/home/users/yonghao01.he/projects/YOLO-World-Workspace/yolo-world-reparameterize-show/open_word.npy') 18 | 19 | args = parser.parse_args() 20 | 21 | tokenizer = AutoTokenizer.from_pretrained(args.model) 22 | model = CLIPTextModelWithProjection.from_pretrained(args.model) 23 | 24 | with open(args.text) as f: 25 | data = json.load(f) 26 | texts = [x[0] for x in data] 27 | device = 'cuda:0' 28 | model.to(device) 29 | texts = tokenizer(text=texts, return_tensors='pt', padding=True) 30 | texts = texts.to(device) 31 | text_outputs = model(**texts) 32 | txt_feats = text_outputs.text_embeds 33 | txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True) 34 | txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1]) 35 | 36 | np.save(args.out, txt_feats.cpu().data.numpy()) 37 |
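This script encodes an offline vocabulary with the plain CLIP text tower: every `[name]` entry of the input JSON (the format written by `tools/generate_vocabulary_json.py` below) is tokenized, embedded and L2-normalized, and the stacked embeddings are saved as an N x D `.npy` matrix. The companion `tools/generate_text_prompts_dosod.py` above does the same through a trained DOSOD checkpoint, so the model's own text adapter is applied; either matrix is the kind of input `tools/reparameterize_dosod.py` consumes via `--text-embed`. A hedged usage sketch with illustrative paths, since the defaults above point at the author's private storage:

# Encode the COCO class texts with a locally downloaded CLIP checkpoint;
# --model may also be a Hugging Face hub id when network access is available.
python3 tools/generate_text_prompts_yoloworld.py \
    --model pretrained_models/clip-vit-base-patch32 \
    --text data/texts/coco_class_texts.json \
    --out coco_class_texts.npy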
-------------------------------------------------------------------------------- /tools/generate_vocabulary_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser(description="Convert Vocabulary Text to JSON File") 6 | parser.add_argument("--text", type=str, default="person,bicycle,car,motorcycle,airplane,bus,train,truck,boat,traffic light,fire hydrant,stop sign,parking meter,bench,bird,cat,dog,horse,sheep,cow,elephant,bear,zebra,giraffe,backpack,umbrella,handbag,tie,suitcase,frisbee,skis,snowboard,sports ball,kite,baseball bat,baseball glove,skateboard,surfboard,tennis racket,bottle,wine glass,cup,fork,knife,spoon,bowl,banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,cake,chair,couch,potted plant,bed,dining table,toilet,tv,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,sink,refrigerator,book,clock,vase,scissors,teddy bear,hair drier,toothbrush", help='Texts') 7 | parser.add_argument("--output", type=str, default='offline_vocabulary.json', help='Output path') 8 | 9 | args = parser.parse_args() 10 | 11 | # Specify the output JSON file name 12 | text = args.text 13 | output_file = args.output 14 | 15 | # Split the text by commas and strip extra whitespace 16 | items = [item.strip() for item in text.split(",")] 17 | 18 | # Wrap each item in its own single-element list 19 | nested_items = [[item] for item in items] 20 | 21 | print("len items:", len(nested_items), nested_items) 22 | 23 | # Save the nested list as a JSON file 24 | with open(output_file, "w", encoding="utf-8") as file: 25 | # indent=4 26 | json.dump(nested_items, file, ensure_ascii=False) 27 | 28 | print(f"Finished. Saved vocabulary file: {output_file}") 29 | 30 | -------------------------------------------------------------------------------- /tools/reparameterize_dosod.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def parse_args(): 9 | 10 | parser = argparse.ArgumentParser("Reparameterize DOSOD") 11 | parser.add_argument('--model', help='model checkpoints to reparameterize') 12 | parser.add_argument('--out-dir', help='output checkpoints') 13 | parser.add_argument( 14 | '--text-embed', 15 | help='text embeddings to be reparameterized') 16 | 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def convert_head(scale, bias, text_embed): 22 | N, D = text_embed.shape 23 | weight = (text_embed * scale.exp()).view(N, D, 1, 1) 24 | bias = torch.ones(N) * bias 25 | return weight, bias 26 | 27 | 28 | def reparameterize_head(state_dict, embeds): 29 | 30 | cls_layers = [ 31 | 'bbox_head.head_module.cls_contrasts.0', 32 | 'bbox_head.head_module.cls_contrasts.1', 33 | 'bbox_head.head_module.cls_contrasts.2' 34 | ] 35 | 36 | for i in range(3): 37 | scale = state_dict[cls_layers[i] + '.logit_scale'] 38 | bias = state_dict[cls_layers[i] + '.bias'] 39 | weight, bias = convert_head(scale, bias, embeds) 40 | state_dict[cls_layers[i] + '.conv.weight'] = weight 41 | state_dict[cls_layers[i] + '.conv.bias'] = bias 42 | del state_dict[cls_layers[i] + '.bias'] 43 | del state_dict[cls_layers[i] + '.logit_scale'] 44 | return state_dict 45 | 46 | 47 | def main(): 48 | 49 | args = parse_args() 50 | 51 | # load checkpoint 52 | model = torch.load(args.model, map_location='cpu') 53 | state_dict = model['state_dict'] 54 | 55 | # load embeddings 56 | embeddings = torch.from_numpy(np.load(args.text_embed)) 57 | 58 | # remove text encoder and text adaptor 59
| keys = list(state_dict.keys()) 60 | keys = [x for x in keys if "backbone_text" not in x and 'text_mlp' not in x] 61 | 62 | state_dict_wo_text = {x: state_dict[x] for x in keys} 63 | print("removing text encoder") 64 | 65 | state_dict_wo_text = reparameterize_head(state_dict_wo_text, embeddings) 66 | print("reparameterizing head") 67 | 68 | model['state_dict'] = state_dict_wo_text 69 | 70 | model_name = os.path.basename(args.model) 71 | model_name = model_name.replace('.pth', f'_rep.pth') 72 | torch.save(model, os.path.join(args.out_dir, model_name)) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /yolo_world/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tencent Inc. All rights reserved. 2 | import importlib.metadata as importlib_metadata 3 | 4 | try: 5 | __version__ = importlib_metadata.version(__package__ or __name__) 6 | except importlib_metadata.PackageNotFoundError: 7 | __version__ = '0.0.0' 8 | 9 | 10 | from .models import * # noqa 11 | from .datasets import * # noqa 12 | from .engine import * # noqa 13 | -------------------------------------------------------------------------------- /yolo_world/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tencent Inc. All rights reserved. 2 | from .mm_dataset import ( 3 | MultiModalDataset, MultiModalMixedDataset) 4 | from .yolov5_obj365v1 import YOLOv5Objects365V1Dataset 5 | from .yolov5_obj365v2 import YOLOv5Objects365V2Dataset 6 | from .yolov5_mixed_grounding import YOLOv5MixedGroundingDataset 7 | from .utils import yolow_collate 8 | from .transformers import * # NOQA 9 | from .yolov5_v3det import YOLOv5V3DetDataset 10 | from .yolov5_lvis import YOLOv5LVISV1Dataset 11 | from .yolov5_cc3m_grounding import YOLOv5GeneralGroundingDataset 12 | 13 | __all__ = [ 14 | 'MultiModalDataset', 'YOLOv5Objects365V1Dataset', 15 | 'YOLOv5Objects365V2Dataset', 'YOLOv5MixedGroundingDataset', 16 | 'YOLOv5V3DetDataset', 'yolow_collate', 17 | 'YOLOv5LVISV1Dataset', 'MultiModalMixedDataset', 18 | 'YOLOv5GeneralGroundingDataset' 19 | ] 20 | -------------------------------------------------------------------------------- /yolo_world/datasets/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tencent Inc. All rights reserved. 2 | from .mm_transforms import RandomLoadText, LoadText 3 | from .mm_mix_img_transforms import ( 4 | MultiModalMosaic, MultiModalMosaic9, YOLOv5MultiModalMixUp, 5 | YOLOXMultiModalMixUp) 6 | 7 | __all__ = ['RandomLoadText', 'LoadText', 'MultiModalMosaic', 8 | 'MultiModalMosaic9', 'YOLOv5MultiModalMixUp', 9 | 'YOLOXMultiModalMixUp'] 10 | -------------------------------------------------------------------------------- /yolo_world/datasets/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Sequence 3 | 4 | import torch 5 | from mmengine.dataset import COLLATE_FUNCTIONS 6 | 7 | 8 | @COLLATE_FUNCTIONS.register_module() 9 | def yolow_collate(data_batch: Sequence, 10 | use_ms_training: bool = False) -> dict: 11 | """Rewrite collate_fn to get faster training speed. 12 | 13 | Args: 14 | data_batch (Sequence): Batch of data. 15 | use_ms_training (bool): Whether to use multi-scale training. 
--------------------------------------------------------------------------------
/yolo_world/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
import importlib.metadata as importlib_metadata

try:
    __version__ = importlib_metadata.version(__package__ or __name__)
except importlib_metadata.PackageNotFoundError:
    __version__ = '0.0.0'


from .models import *  # noqa
from .datasets import *  # noqa
from .engine import *  # noqa
--------------------------------------------------------------------------------
/yolo_world/datasets/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .mm_dataset import (
    MultiModalDataset, MultiModalMixedDataset)
from .yolov5_obj365v1 import YOLOv5Objects365V1Dataset
from .yolov5_obj365v2 import YOLOv5Objects365V2Dataset
from .yolov5_mixed_grounding import YOLOv5MixedGroundingDataset
from .utils import yolow_collate
from .transformers import *  # NOQA
from .yolov5_v3det import YOLOv5V3DetDataset
from .yolov5_lvis import YOLOv5LVISV1Dataset
from .yolov5_cc3m_grounding import YOLOv5GeneralGroundingDataset

__all__ = [
    'MultiModalDataset', 'YOLOv5Objects365V1Dataset',
    'YOLOv5Objects365V2Dataset', 'YOLOv5MixedGroundingDataset',
    'YOLOv5V3DetDataset', 'yolow_collate',
    'YOLOv5LVISV1Dataset', 'MultiModalMixedDataset',
    'YOLOv5GeneralGroundingDataset'
]
--------------------------------------------------------------------------------
/yolo_world/datasets/transformers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .mm_transforms import RandomLoadText, LoadText
from .mm_mix_img_transforms import (
    MultiModalMosaic, MultiModalMosaic9, YOLOv5MultiModalMixUp,
    YOLOXMultiModalMixUp)

__all__ = ['RandomLoadText', 'LoadText', 'MultiModalMosaic',
           'MultiModalMosaic9', 'YOLOv5MultiModalMixUp',
           'YOLOXMultiModalMixUp']
--------------------------------------------------------------------------------
/yolo_world/datasets/utils.py:
--------------------------------------------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence

import torch
from mmengine.dataset import COLLATE_FUNCTIONS


@COLLATE_FUNCTIONS.register_module()
def yolow_collate(data_batch: Sequence,
                  use_ms_training: bool = False) -> dict:
    """Rewrite collate_fn to get faster training speed.

    Args:
        data_batch (Sequence): Batch of data.
        use_ms_training (bool): Whether to use multi-scale training.
    """
    batch_imgs = []
    batch_bboxes_labels = []
    batch_masks = []
    for i in range(len(data_batch)):
        datasamples = data_batch[i]['data_samples']
        inputs = data_batch[i]['inputs']
        batch_imgs.append(inputs)

        gt_bboxes = datasamples.gt_instances.bboxes.tensor
        gt_labels = datasamples.gt_instances.labels
        if 'masks' in datasamples.gt_instances:
            masks = datasamples.gt_instances.masks.to(
                dtype=torch.bool, device=gt_bboxes.device)
            batch_masks.append(masks)
        batch_idx = gt_labels.new_full((len(gt_labels), 1), i)
        bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes),
                                  dim=1)
        batch_bboxes_labels.append(bboxes_labels)

    collated_results = {
        'data_samples': {
            'bboxes_labels': torch.cat(batch_bboxes_labels, 0)
        }
    }
    if len(batch_masks) > 0:
        collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0)

    if use_ms_training:
        collated_results['inputs'] = batch_imgs
    else:
        collated_results['inputs'] = torch.stack(batch_imgs, 0)

    if hasattr(data_batch[0]['data_samples'], 'texts'):
        batch_texts = [meta['data_samples'].texts for meta in data_batch]
        collated_results['data_samples']['texts'] = batch_texts

    if hasattr(data_batch[0]['data_samples'], 'is_detection'):
        # detection flag
        batch_detection = [meta['data_samples'].is_detection
                           for meta in data_batch]
        collated_results['data_samples']['is_detection'] = torch.tensor(
            batch_detection)

    return collated_results
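For orientation, a minimal sketch (plain tensors, no mmengine data structures, made-up values) of the bboxes_labels layout built above: each row is [batch_idx, class_label, x1, y1, x2, y2], so boxes from the whole batch live in one flat tensor.

    import torch

    # two boxes from sample i=1 of a batch
    i = 1
    gt_bboxes = torch.tensor([[10., 20., 50., 80.],
                              [30., 40., 60., 90.]])
    gt_labels = torch.tensor([3, 7])

    batch_idx = gt_labels.new_full((len(gt_labels), 1), i)
    bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes), dim=1)
    print(bboxes_labels)
    # tensor([[ 1.,  3., 10., 20., 50., 80.],
    #         [ 1.,  7., 30., 40., 60., 90.]])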
--------------------------------------------------------------------------------
/yolo_world/datasets/yolov5_lvis.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from mmdet.datasets import LVISV1Dataset

from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS


@DATASETS.register_module()
class YOLOv5LVISV1Dataset(BatchShapePolicyDataset, LVISV1Dataset):
    """Dataset for YOLOv5 LVIS Dataset.

    We only add `BatchShapePolicy` function compared with LVISV1Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
--------------------------------------------------------------------------------
/yolo_world/datasets/yolov5_obj365v1.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from mmdet.datasets import Objects365V1Dataset

from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS


@DATASETS.register_module()
class YOLOv5Objects365V1Dataset(BatchShapePolicyDataset, Objects365V1Dataset):
    """Dataset for YOLOv5 Objects365 V1 Dataset.

    We only add `BatchShapePolicy` function compared with Objects365V1Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
--------------------------------------------------------------------------------
/yolo_world/datasets/yolov5_obj365v2.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from mmdet.datasets import Objects365V2Dataset

from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS


@DATASETS.register_module()
class YOLOv5Objects365V2Dataset(BatchShapePolicyDataset, Objects365V2Dataset):
    """Dataset for YOLOv5 Objects365 V2 Dataset.

    We only add `BatchShapePolicy` function compared with Objects365V2Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
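A minimal sketch (hypothetical config values; the annotation path mirrors data/coco/lvis/ from this repo's layout) of how these registered dataset classes are built from a config dict through the mmyolo registry:

    from mmyolo.registry import DATASETS

    dataset = DATASETS.build(dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        pipeline=[],     # real configs attach the full training pipeline here
        lazy_init=True,  # skip annotation loading for this sketch
    ))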
--------------------------------------------------------------------------------
/yolo_world/engine/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .optimizers import *  # noqa
--------------------------------------------------------------------------------
/yolo_world/engine/optimizers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .yolow_v5_optim_constructor import YOLOWv5OptimizerConstructor

__all__ = ['YOLOWv5OptimizerConstructor']
--------------------------------------------------------------------------------
/yolo_world/models/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .backbones import *  # noqa
from .layers import *  # noqa
from .detectors import *  # noqa
from .losses import *  # noqa
from .data_preprocessors import *  # noqa
from .dense_heads import *  # noqa
from .necks import *  # noqa
from .assigner import *  # noqa
--------------------------------------------------------------------------------
/yolo_world/models/assigner/__init__.py:
--------------------------------------------------------------------------------
from .task_aligned_assigner import YOLOWorldSegAssigner

__all__ = ['YOLOWorldSegAssigner']
--------------------------------------------------------------------------------
/yolo_world/models/backbones/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
# YOLO Multi-Modal Backbone (Vision Language)
# Vision: YOLOv8 CSPDarknet
# Language: CLIP Text Encoder (12-layer transformer)
from .mm_backbone import (
    MultiModalYOLOBackbone,
    HuggingVisionBackbone,
    HuggingCLIPLanguageBackbone,
    PseudoLanguageBackbone)

__all__ = [
    'MultiModalYOLOBackbone',
    'HuggingVisionBackbone',
    'HuggingCLIPLanguageBackbone',
    'PseudoLanguageBackbone'
]
--------------------------------------------------------------------------------
/yolo_world/models/data_preprocessors/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .data_preprocessor import YOLOWDetDataPreprocessor

__all__ = ['YOLOWDetDataPreprocessor']
--------------------------------------------------------------------------------
/yolo_world/models/dense_heads/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .yolo_world_head import (YOLOWorldHead, YOLOWorldHeadModule,
                              RepYOLOWorldHeadModule, RepYOLOWorldHeadModuleV1)
from .yolo_world_seg_head import YOLOWorldSegHead, YOLOWorldSegHeadModule
from .dosod_head import (DOSODYOLOv8Head,
                         DOSODYOLOv8dHeadModule,
                         DOSODContrastiveHead,
                         RepDOSODYOLOv8Head,
                         RepDOSODYOLOv8dHeadModuleDRobotics,
                         RepDOSODYOLOv8dHeadModule,
                         RepDOSODContrastiveHead)

__all__ = [
    'YOLOWorldHead', 'YOLOWorldHeadModule', 'YOLOWorldSegHead', 'RepYOLOWorldHeadModuleV1',
    'YOLOWorldSegHeadModule', 'RepYOLOWorldHeadModule',
    'DOSODYOLOv8Head', 'DOSODYOLOv8dHeadModule', 'DOSODContrastiveHead',
    'RepDOSODYOLOv8dHeadModuleDRobotics', 'RepDOSODYOLOv8Head', 'RepDOSODYOLOv8dHeadModule', 'RepDOSODContrastiveHead',
]
--------------------------------------------------------------------------------
/yolo_world/models/detectors/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .yolo_world import YOLOWorldDetector, SimpleYOLOWorldDetector
from .dosod import DOSODDetector, RepDOSODDetector

__all__ = ['YOLOWorldDetector', 'SimpleYOLOWorldDetector',
           'DOSODDetector', 'RepDOSODDetector']
--------------------------------------------------------------------------------
/yolo_world/models/layers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
# Basic brick modules for PAFPN based on CSPLayers

from .yolo_bricks import (
    CSPLayerWithTwoConv,
    MaxSigmoidAttnBlock,
    MaxSigmoidCSPLayerWithTwoConv,
    ImagePoolingAttentionModule,
    RepConvMaxSigmoidCSPLayerWithTwoConv,
    RepMaxSigmoidCSPLayerWithTwoConv
)

__all__ = ['CSPLayerWithTwoConv',
           'MaxSigmoidAttnBlock',
           'MaxSigmoidCSPLayerWithTwoConv',
           'RepConvMaxSigmoidCSPLayerWithTwoConv',
           'RepMaxSigmoidCSPLayerWithTwoConv',
           'ImagePoolingAttentionModule']
--------------------------------------------------------------------------------
/yolo_world/models/losses/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .dynamic_loss import CoVMSELoss

__all__ = ['CoVMSELoss']
--------------------------------------------------------------------------------
/yolo_world/models/losses/dynamic_loss.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor
from mmdet.models.losses.mse_loss import mse_loss
from mmyolo.registry import MODELS


@MODELS.register_module()
class CoVMSELoss(nn.Module):

    def __init__(self,
                 dim: int = 0,
                 reduction: str = 'mean',
                 loss_weight: float = 1.0,
                 eps: float = 1e-6) -> None:
        super().__init__()
        self.dim = dim
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.eps = eps

    def forward(self,
                pred: Tensor,
                weight: Optional[Tensor] = None,
                avg_factor: Optional[int] = None,
                reduction_override: Optional[str] = None) -> Tensor:
        """Forward function of loss."""
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        cov = pred.std(self.dim) / pred.mean(self.dim).clamp(min=self.eps)
        target = torch.zeros_like(cov)
        loss = self.loss_weight * mse_loss(
            cov, target, weight, reduction=reduction, avg_factor=avg_factor)
        return loss
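A quick numeric sketch of what CoVMSELoss penalizes: the coefficient of variation (std / mean) of pred along dim, regressed toward zero with MSE (defaults: dim=0, reduction='mean', loss_weight=1.0).

    import torch

    pred = torch.tensor([[1.0, 2.0],
                         [3.0, 4.0]])
    cov = pred.std(0) / pred.mean(0).clamp(min=1e-6)  # dim=0, as in the default
    # std -> [1.4142, 1.4142]; mean -> [2.0, 3.0]; cov -> [0.7071, 0.4714]
    loss = (cov ** 2).mean()  # == mse_loss(cov, zeros) ~= 0.3611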
--------------------------------------------------------------------------------
/yolo_world/models/necks/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from .yolo_world_pafpn import YOLOWorldPAFPN, YOLOWorldDualPAFPN

__all__ = ['YOLOWorldPAFPN', 'YOLOWorldDualPAFPN']
--------------------------------------------------------------------------------
/yolo_world/version.py:
--------------------------------------------------------------------------------
# Copyright (c) Tencent Inc. All rights reserved.
from yolo_world import __version__

def __version_info() -> tuple:
    """Parse a version string into a tuple.

    Returns:
        tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
            (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
    """
    version_info = []
    for x in __version__.split('.'):
        if x.isdigit():
            version_info.append(int(x))
        elif x.find('rc') != -1:
            patch_version = x.split('rc')
            version_info.append(int(patch_version[0]))
            version_info.append(f'rc{patch_version[1]}')
    return tuple(version_info)


version_info = __version_info()

__all__ = ['__version__', 'version_info']
--------------------------------------------------------------------------------
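A minimal usage sketch of the resulting tuple (the actual value depends on the installed package version):

    from yolo_world.version import version_info

    # e.g. (1, 3, 0) for "1.3.0", or (2, 0, 0, 'rc1') for "2.0.0rc1";
    # tuples compare lexicographically, which is handy for feature gates:
    if version_info >= (1, 3, 0):
        pass  # rely on behavior introduced in 1.3.0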