├── .gitattributes
├── README.assets
├── image-20230819130751450.png
├── image-20230819134931428.png
├── MOT20.gif
└── dancetrack.gif
├── yolox
├── __init__.py
├── core
│   └── __init__.py
├── layers
│   ├── __init__.py
│   ├── csrc
│   │   ├── vision.cpp
│   │   └── cocoeval
│   │   │   └── cocoeval.h
│   └── fast_coco_eval_api.py
├── exp
│   ├── __init__.py
│   ├── build.py
│   └── base_exp.py
├── evaluators
│   └── __init__.py
├── data
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── datasets_wrapper.py
│   │   └── mot.py
│   ├── __init__.py
│   ├── data_prefetcher.py
│   └── samplers.py
├── models
│   ├── __init__.py
│   ├── yolox.py
│   ├── yolo_fpn.py
│   ├── losses.py
│   └── yolo_pafpn.py
├── utils
│   ├── __init__.py
│   ├── checkpoint.py
│   ├── setup_env.py
│   ├── ema.py
│   ├── logger.py
│   ├── demo_utils.py
│   ├── allreduce_norm.py
│   ├── metric.py
│   ├── model_utils.py
│   ├── cluster_nms.py
│   ├── visualize.py
│   └── box_ops.py
├── tracking_utils
│   ├── timer.py
│   ├── io.py
│   └── evaluation.py
└── tracker
│   └── basetrack.py
├── requirements.txt
├── exps
├── default
│   ├── yolox_l.py
│   ├── yolox_m.py
│   ├── yolox_s.py
│   ├── yolox_x.py
│   ├── yolox_tiny.py
│   ├── nano.py
│   └── yolov3.py
└── example
│   └── mot
│   ├── yolox_x_diffusion_det_mot17.py
│   ├── yolox_x_diffusion_det_mot20.py
│   ├── yolox_x_diffusion_det_dancetrack.py
│   ├── yolox_x_diffusion_det_mot17_ablation.py
│   ├── yolox_x_diffusion_track_mot17.py
│   ├── yolox_x_diffusion_track_mot17_baseline.py
│   ├── yolox_x_diffusion_track_mot20.py
│   ├── yolox_x_diffusion_track_dancetrack.py
│   ├── yolox_x_diffusion_track_mot20_baseline.py
│   ├── yolox_x_diffusion_track_dancetrack_baseline.py
│   └── yolox_x_diffusion_track_mot17_ablation.py
├── tools
├── convert_video.py
├── mix_data_bdd100k.py
├── convert_crowdhuman_to_coco.py
├── convert_ethz_to_coco.py
├── convert_cityperson_to_coco.py
├── mix_data_test_mot20.py
├── mix_data_ablation.py
├── mota.py
├── train.py
├── mix_data_test_mot17.py
└── convert_bdd100k_to_coco.py
├── setup.py
├── .gitignore
└── diffusion
└── models
└── diffusionnet.py

/.gitattributes:
--------------------------------------------------------------------------------
 1 | README.assets/MOT20.gif filter=lfs diff=lfs merge=lfs -text
 2 | README.assets/dancetrack.gif filter=lfs diff=lfs merge=lfs -text
--------------------------------------------------------------------------------
/README.assets/image-20230819130751450.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainBowLuoCS/DiffusionTrack/HEAD/README.assets/image-20230819130751450.png
--------------------------------------------------------------------------------
/README.assets/image-20230819134931428.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainBowLuoCS/DiffusionTrack/HEAD/README.assets/image-20230819134931428.png
--------------------------------------------------------------------------------
/README.assets/MOT20.gif:
--------------------------------------------------------------------------------
 1 | version https://git-lfs.github.com/spec/v1
 2 | oid sha256:d7944be05fb6e8f06b5b5c5f348febf24a738d9dc5267824dc49dff5cc56b101
 3 | size 83313257
 4 | 
--------------------------------------------------------------------------------
/README.assets/dancetrack.gif:
--------------------------------------------------------------------------------
 1 | version https://git-lfs.github.com/spec/v1
 2 | oid sha256:94c7e7accd4ff802dd8635834085b00ec9e9597c95c75580c0eacde22816ce17
 3 | size 76207196
 4 | 
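Note: the two GIFs above (README.assets/MOT20.gif and README.assets/dancetrack.gif) are stored as Git LFS pointer files, which is why only a short version/oid/size stanza appears here; the .gitattributes rules shown earlier route both GIFs through LFS, so a plain clone needs a git lfs pull before the animations are actually present. The snippet below is an illustrative sketch (not part of the repository) for reading the metadata out of such a pointer file; the path argument is just an example.

def read_lfs_pointer(path):
    # Each non-empty line of an LFS pointer file is "<key> <value>".
    fields = {}
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

# read_lfs_pointer("README.assets/MOT20.gif")
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:d7944be0...', 'size': '83313257'}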
-------------------------------------------------------------------------------- /yolox/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | from .utils import configure_module 5 | 6 | configure_module() 7 | 8 | __version__ = "0.1.0" 9 | -------------------------------------------------------------------------------- /yolox/core/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | from .launch import launch 6 | from .trainer import Trainer 7 | -------------------------------------------------------------------------------- /yolox/layers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | from .fast_coco_eval_api import COCOeval_opt 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | torch>=1.7 3 | opencv_python 4 | loguru 5 | scikit-image 6 | tqdm 7 | torchvision>=0.10.0 8 | Pillow 9 | thop 10 | ninja 11 | tabulate 12 | tensorboard 13 | lap 14 | motmetrics 15 | filterpy 16 | h5py 17 | -------------------------------------------------------------------------------- /yolox/exp/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | from .base_exp import BaseExp 6 | from .build import get_exp 7 | from .yolox_base import Exp 8 | -------------------------------------------------------------------------------- /yolox/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | from .coco_evaluator import COCOEvaluator 6 | from .diffusion_mot_evaluator import DiffusionMOTEvaluator 7 | from .diffusion_mot_evaluator_kl import DiffusionMOTEvaluatorKL 8 | -------------------------------------------------------------------------------- /yolox/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | from .datasets_wrapper import ConcatDataset, Dataset, MixConcatDataset 6 | from .mosaicdetection import MosaicDetection,DiffusionMosaicDetection 7 | from .mot import MOTDataset 8 | -------------------------------------------------------------------------------- /yolox/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
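# [Editor's note, not part of the original file] This package simply re-exports the
# detector building blocks imported below (CSPDarknet/Darknet backbones, YOLOFPN and
# YOLOPAFPN necks, YOLOXHead, IOUloss and the top-level YOLOX module). As an illustrative
# sketch based only on the constructors visible elsewhere in this listing, a YOLOX-X
# sized detector could be assembled like this (num_classes=1 for a single person class
# is an assumption for the MOT setups):
#     from yolox.models import YOLOPAFPN, YOLOXHead, YOLOX
#     backbone = YOLOPAFPN(1.33, 1.25, in_channels=[256, 512, 1024])
#     head = YOLOXHead(1, 1.25, in_channels=[256, 512, 1024])
#     model = YOLOX(backbone, head)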
4 | 5 | from .darknet import CSPDarknet, Darknet 6 | from .losses import IOUloss 7 | from .yolo_fpn import YOLOFPN 8 | from .yolo_head import YOLOXHead 9 | from .yolo_pafpn import YOLOPAFPN 10 | from .yolox import YOLOX 11 | -------------------------------------------------------------------------------- /yolox/data/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | from .data_augment import TrainTransform, ValTransform,DiffusionValTransform,DiffusionTrainTransform 6 | from .data_prefetcher import DataPrefetcher 7 | from .dataloading import DataLoader, get_yolox_datadir 8 | from .datasets import * 9 | from .samplers import InfiniteSampler, YoloBatchSampler 10 | -------------------------------------------------------------------------------- /exps/default/yolox_l.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 1.0 14 | self.width = 1.0 15 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 16 | -------------------------------------------------------------------------------- /exps/default/yolox_m.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 0.67 14 | self.width = 0.75 15 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 16 | -------------------------------------------------------------------------------- /exps/default/yolox_s.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 0.33 14 | self.width = 0.50 15 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 16 | -------------------------------------------------------------------------------- /exps/default/yolox_x.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 1.33 14 | self.width = 1.25 15 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 16 | -------------------------------------------------------------------------------- /yolox/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
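# [Editor's note, not part of the original file] This module aggregates the utility
# helpers via the wildcard imports below (distributed/NCCL setup, checkpointing, EMA,
# LR scheduling, logging, metrics, visualization), so callers can write
# from yolox.utils import load_ckpt, save_checkpoint, setup_logger. A hedged example
# using the helpers defined in yolox/utils/checkpoint.py (the file name and the "model"
# key are assumptions, not fixed by this listing):
#     ckpt = torch.load("yolox_x_ckpt.pth.tar", map_location="cpu")
#     model = load_ckpt(model, ckpt["model"])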
4 | 5 | from .allreduce_norm import * 6 | from .boxes import * 7 | from .checkpoint import load_ckpt, save_checkpoint 8 | from .demo_utils import * 9 | from .dist import * 10 | from .ema import ModelEMA 11 | from .logger import setup_logger 12 | from .lr_scheduler import LRScheduler 13 | from .metric import * 14 | from .model_utils import * 15 | from .setup_env import * 16 | from .visualize import * 17 | -------------------------------------------------------------------------------- /exps/default/yolox_tiny.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 0.33 14 | self.width = 0.375 15 | self.scale = (0.5, 1.5) 16 | self.random_size = (10, 20) 17 | self.test_size = (416, 416) 18 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 19 | self.enable_mixup = False 20 | -------------------------------------------------------------------------------- /yolox/layers/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | #include "cocoeval/cocoeval.h" 2 | 3 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 4 | m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate"); 5 | m.def( 6 | "COCOevalEvaluateImages", 7 | &COCOeval::EvaluateImages, 8 | "COCOeval::EvaluateImages"); 9 | pybind11::class_(m, "InstanceAnnotation") 10 | .def(pybind11::init()); 11 | pybind11::class_(m, "ImageEvaluation") 12 | .def(pybind11::init<>()); 13 | } 14 | -------------------------------------------------------------------------------- /tools/convert_video.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | def convert_video(video_path): 4 | cap = cv2.VideoCapture(video_path) 5 | width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float 6 | height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float 7 | fps = cap.get(cv2.CAP_PROP_FPS) 8 | video_name = video_path.split('/')[-1].split('.')[0] 9 | save_name = video_name + '_converted' 10 | save_path = video_path.replace(video_name, save_name) 11 | vid_writer = cv2.VideoWriter( 12 | save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) 13 | ) 14 | while True: 15 | ret_val, frame = cap.read() 16 | if ret_val: 17 | vid_writer.write(frame) 18 | ch = cv2.waitKey(1) 19 | if ch == 27 or ch == ord("q") or ch == ord("Q"): 20 | break 21 | else: 22 | break 23 | 24 | if __name__ == "__main__": 25 | video_path = 'videos/palace.mp4' 26 | convert_video(video_path) -------------------------------------------------------------------------------- /yolox/tracking_utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class Timer(object): 5 | """A simple timer.""" 6 | def __init__(self): 7 | self.total_time = 0. 8 | self.calls = 0 9 | self.start_time = 0. 10 | self.diff = 0. 11 | self.average_time = 0. 12 | 13 | self.duration = 0. 
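        # [Editor's note, descriptive comment not in the original] duration caches whatever the
        # most recent toc() call returned. Typical usage is:
        #     timer.tic(); ...timed work...; dt = timer.toc()
        # toc(average=True), the default, returns the running mean over all tic/toc pairs,
        # while toc(average=False) returns only the latest interval; clear() resets everything.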
14 | 15 | def tic(self): 16 | # using time.time instead of time.clock because time time.clock 17 | # does not normalize for multithreading 18 | self.start_time = time.time() 19 | 20 | def toc(self, average=True): 21 | self.diff = time.time() - self.start_time 22 | self.total_time += self.diff 23 | self.calls += 1 24 | self.average_time = self.total_time / self.calls 25 | if average: 26 | self.duration = self.average_time 27 | else: 28 | self.duration = self.diff 29 | return self.duration 30 | 31 | def clear(self): 32 | self.total_time = 0. 33 | self.calls = 0 34 | self.start_time = 0. 35 | self.diff = 0. 36 | self.average_time = 0. 37 | self.duration = 0. -------------------------------------------------------------------------------- /yolox/tracker/basetrack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | 4 | 5 | class TrackState(object): 6 | New = 0 7 | Tracked = 1 8 | Lost = 2 9 | Removed = 3 10 | 11 | 12 | class BaseTrack(object): 13 | _count = 0 14 | 15 | track_id = 0 16 | is_activated = False 17 | state = TrackState.New 18 | 19 | history = OrderedDict() 20 | features = [] 21 | curr_feature = None 22 | score = 0 23 | start_frame = 0 24 | frame_id = 0 25 | time_since_update = 0 26 | 27 | # multi-camera 28 | location = (np.inf, np.inf) 29 | 30 | @property 31 | def end_frame(self): 32 | return self.frame_id 33 | 34 | @staticmethod 35 | def next_id(): 36 | BaseTrack._count += 1 37 | return BaseTrack._count 38 | 39 | def activate(self, *args): 40 | raise NotImplementedError 41 | 42 | def predict(self): 43 | raise NotImplementedError 44 | 45 | def update(self, *args, **kwargs): 46 | raise NotImplementedError 47 | 48 | def mark_lost(self): 49 | self.state = TrackState.Lost 50 | 51 | def mark_removed(self): 52 | self.state = TrackState.Removed 53 | -------------------------------------------------------------------------------- /exps/default/nano.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | import torch.nn as nn 7 | 8 | from yolox.exp import Exp as MyExp 9 | 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.depth = 0.33 15 | self.width = 0.25 16 | self.scale = (0.5, 1.5) 17 | self.random_size = (10, 20) 18 | self.test_size = (416, 416) 19 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 20 | self.enable_mixup = False 21 | 22 | def get_model(self, sublinear=False): 23 | 24 | def init_yolo(M): 25 | for m in M.modules(): 26 | if isinstance(m, nn.BatchNorm2d): 27 | m.eps = 1e-3 28 | m.momentum = 0.03 29 | if "model" not in self.__dict__: 30 | from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead 31 | in_channels = [256, 512, 1024] 32 | # NANO model use depthwise = True, which is main difference. 
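            # [Editor's note, descriptive comment not in the original] depthwise=True builds the
            # conv blocks as depthwise-separable convolutions (a per-channel spatial conv followed
            # by a 1x1 pointwise conv), which sharply reduces parameters and FLOPs; combined with
            # width=0.25 this is what makes the nano variant so small.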
33 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) 34 | head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) 35 | self.model = YOLOX(backbone, head) 36 | 37 | self.model.apply(init_yolo) 38 | self.model.head.initialize_biases(1e-2) 39 | return self.model 40 | -------------------------------------------------------------------------------- /yolox/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | from loguru import logger 5 | 6 | import torch 7 | 8 | import os 9 | import shutil 10 | 11 | 12 | def load_ckpt(model, ckpt): 13 | model_state_dict = model.state_dict() 14 | load_dict = {} 15 | for key_model, v in model_state_dict.items(): 16 | if key_model not in ckpt: 17 | logger.warning( 18 | "{} is not in the ckpt. Please double check and see if this is desired.".format( 19 | key_model 20 | ) 21 | ) 22 | continue 23 | v_ckpt = ckpt[key_model] 24 | if v.shape != v_ckpt.shape: 25 | logger.warning( 26 | "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format( 27 | key_model, v_ckpt.shape, key_model, v.shape 28 | ) 29 | ) 30 | continue 31 | load_dict[key_model] = v_ckpt 32 | 33 | model.load_state_dict(load_dict, strict=False) 34 | return model 35 | 36 | 37 | def save_checkpoint(state, is_best, save_dir, model_name=""): 38 | if not os.path.exists(save_dir): 39 | os.makedirs(save_dir) 40 | filename = os.path.join(save_dir, model_name + "_ckpt.pth.tar") 41 | torch.save(state, filename) 42 | if is_best: 43 | best_filename = os.path.join(save_dir, "best_ckpt.pth.tar") 44 | shutil.copyfile(filename, best_filename) 45 | -------------------------------------------------------------------------------- /yolox/models/yolox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch.nn as nn 6 | 7 | from .yolo_head import YOLOXHead 8 | from .yolo_pafpn import YOLOPAFPN 9 | 10 | class YOLOX(nn.Module): 11 | """ 12 | YOLOX model module. The module list is defined by create_yolov3_modules function. 13 | The network returns loss values from three YOLO layers during training 14 | and detection results during test. 15 | """ 16 | 17 | def __init__(self, backbone=None, head=None): 18 | super().__init__() 19 | if backbone is None: 20 | backbone = YOLOPAFPN() 21 | if head is None: 22 | head = YOLOXHead(80) 23 | 24 | self.backbone = backbone 25 | self.head = head 26 | 27 | def forward(self, x, targets=None): 28 | # fpn output content features of [dark3, dark4, dark5] 29 | fpn_outs = self.backbone(x) 30 | 31 | if self.training: 32 | assert targets is not None 33 | loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head( 34 | fpn_outs, targets, x 35 | ) 36 | outputs = { 37 | "total_loss": loss, 38 | "iou_loss": iou_loss, 39 | "l1_loss": l1_loss, 40 | "conf_loss": conf_loss, 41 | "cls_loss": cls_loss, 42 | "num_fg": num_fg, 43 | } 44 | else: 45 | outputs = self.head(fpn_outs) 46 | 47 | return outputs 48 | -------------------------------------------------------------------------------- /yolox/exp/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. 
All rights reserved. 4 | 5 | import importlib 6 | import os 7 | import sys 8 | 9 | 10 | def get_exp_by_file(exp_file): 11 | try: 12 | sys.path.append(os.path.dirname(exp_file)) 13 | current_exp = importlib.import_module(os.path.basename(exp_file).split(".")[0]) 14 | exp = current_exp.Exp() 15 | except Exception: 16 | raise ImportError("{} doesn't contains class named 'Exp'".format(exp_file)) 17 | return exp 18 | 19 | 20 | def get_exp_by_name(exp_name): 21 | import yolox 22 | 23 | yolox_path = os.path.dirname(os.path.dirname(yolox.__file__)) 24 | filedict = { 25 | "yolox-s": "yolox_s.py", 26 | "yolox-m": "yolox_m.py", 27 | "yolox-l": "yolox_l.py", 28 | "yolox-x": "yolox_x.py", 29 | "yolox-tiny": "yolox_tiny.py", 30 | "yolox-nano": "nano.py", 31 | "yolov3": "yolov3.py", 32 | } 33 | filename = filedict[exp_name] 34 | exp_path = os.path.join(yolox_path, "exps", "default", filename) 35 | return get_exp_by_file(exp_path) 36 | 37 | 38 | def get_exp(exp_file, exp_name): 39 | """ 40 | get Exp object by file or name. If exp_file and exp_name 41 | are both provided, get Exp by exp_file. 42 | 43 | Args: 44 | exp_file (str): file path of experiment. 45 | exp_name (str): name of experiment. "yolo-s", 46 | """ 47 | assert ( 48 | exp_file is not None or exp_name is not None 49 | ), "plz provide exp file or exp name." 50 | if exp_file is not None: 51 | return get_exp_by_file(exp_file) 52 | else: 53 | return get_exp_by_name(exp_name) 54 | -------------------------------------------------------------------------------- /yolox/utils/setup_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import cv2 6 | 7 | import os 8 | import subprocess 9 | 10 | __all__ = ["configure_nccl", "configure_module"] 11 | 12 | 13 | def configure_nccl(): 14 | """Configure multi-machine environment variables of NCCL.""" 15 | os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" 16 | os.environ["NCCL_IB_HCA"] = subprocess.getoutput( 17 | "pushd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; " 18 | "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null " 19 | "| grep v >/dev/null && echo $i ; done; popd > /dev/null" 20 | ) 21 | os.environ["NCCL_IB_GID_INDEX"] = "3" 22 | os.environ["NCCL_IB_TC"] = "106" 23 | 24 | 25 | def configure_module(ulimit_value=8192): 26 | """ 27 | Configure pytorch module environment. setting of ulimit and cv2 will be set. 28 | 29 | Args: 30 | ulimit_value(int): default open file number on linux. Default value: 8192. 31 | """ 32 | # system setting 33 | try: 34 | import resource 35 | 36 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 37 | resource.setrlimit(resource.RLIMIT_NOFILE, (ulimit_value, rlimit[1])) 38 | except Exception: 39 | # Exception might be raised in Windows OS or rlimit reaches max limit number. 40 | # However, set rlimit value might not be necessary. 41 | pass 42 | 43 | # cv2 44 | # multiprocess might be harmful on performance of torch dataloader 45 | os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled" 46 | try: 47 | cv2.setNumThreads(0) 48 | cv2.ocl.setUseOpenCL(False) 49 | except Exception: 50 | # cv2 version mismatch might rasie exceptions. 51 | pass 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Megvii, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import re 5 | import setuptools 6 | import glob 7 | from os import path 8 | import torch 9 | from torch.utils.cpp_extension import CppExtension 10 | 11 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 12 | assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3" 13 | 14 | 15 | def get_extensions(): 16 | this_dir = path.dirname(path.abspath(__file__)) 17 | extensions_dir = path.join(this_dir, "yolox", "layers", "csrc") 18 | 19 | main_source = path.join(extensions_dir, "vision.cpp") 20 | sources = glob.glob(path.join(extensions_dir, "**", "*.cpp")) 21 | 22 | sources = [main_source] + sources 23 | extension = CppExtension 24 | 25 | extra_compile_args = {"cxx": ["-O3"]} 26 | define_macros = [] 27 | 28 | include_dirs = [extensions_dir] 29 | 30 | ext_modules = [ 31 | extension( 32 | "yolox._C", 33 | sources, 34 | include_dirs=include_dirs, 35 | define_macros=define_macros, 36 | extra_compile_args=extra_compile_args, 37 | ) 38 | ] 39 | 40 | return ext_modules 41 | 42 | 43 | with open("yolox/__init__.py", "r") as f: 44 | version = re.search( 45 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 46 | f.read(), re.MULTILINE 47 | ).group(1) 48 | 49 | 50 | # with open("README.md", "r") as f: 51 | # long_description = f.read() 52 | 53 | long_description="sss" 54 | setuptools.setup( 55 | name="yolox", 56 | version=version, 57 | author="basedet team", 58 | python_requires=">=3.6", 59 | long_description=long_description, 60 | ext_modules=get_extensions(), 61 | classifiers=["Programming Language :: Python :: 3", "Operating System :: OS Independent"], 62 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 63 | packages=setuptools.find_namespace_packages(), 64 | ) 65 | -------------------------------------------------------------------------------- /tools/mix_data_bdd100k.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | 5 | """ 6 | cd datasets 7 | mkdir -p mix_det/annotations 8 | cp mot/annotations/val_half.json mix_det/annotations/val_half.json 9 | cp mot/annotations/test.json mix_det/annotations/test.json 10 | cd mix_det 11 | ln -s ../mot/train mot_train 12 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 13 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 14 | ln -s ../Cityscapes cp_train 15 | ln -s ../ETHZ ethz_train 16 | cd .. 
17 | """ 18 | 19 | bdd100ktrain_json = json.load(open('datasets/bdd100k/annotations/mix_train_val.json','r')) 20 | # need_index=np.random.choice(range(len(bdd100ktrain_json['images'])),len(bdd100ktrain_json['images'])//3,replace=False) 21 | # need_img_ids={} 22 | img_list = list() 23 | for img in bdd100ktrain_json['images']: 24 | img['is_video']=1 25 | img_list.append(img) 26 | # need_img_ids[bdd100ktrain_json['images'][img_idx]['id']]=1 27 | 28 | ann_list = list() 29 | for ann in bdd100ktrain_json['annotations']: 30 | # if ann['image_id'] in need_img_ids: 31 | ann_list.append(ann) 32 | 33 | video_list = bdd100ktrain_json['videos'] 34 | category_list = bdd100ktrain_json['categories'] 35 | 36 | 37 | print('bdd100ktrain') 38 | 39 | max_img = len(img_list) 40 | max_ann = len(ann_list) 41 | max_video = len(video_list) 42 | 43 | bdd100kval_json = json.load(open('datasets/bdd100k/annotations/val.json','r')) 44 | for img in bdd100kval_json['images']: 45 | img['prev_image_id'] = img['prev_image_id'] + max_img 46 | img['next_image_id'] = img['next_image_id'] + max_img 47 | img['id'] = img['id'] + max_img 48 | img['video_id']+= max_video 49 | img['is_video']=1 50 | img_list.append(img) 51 | 52 | for ann in bdd100kval_json['annotations']: 53 | ann['id'] = ann['id'] + max_ann 54 | ann['image_id'] = ann['image_id'] + max_img 55 | ann_list.append(ann) 56 | 57 | for vid in bdd100kval_json['videos']: 58 | vid['id']+=max_video 59 | video_list.append(vid) 60 | 61 | print('bdd100ktest') 62 | 63 | mix_json = dict() 64 | mix_json['images'] = img_list 65 | mix_json['annotations'] = ann_list 66 | mix_json['videos'] = video_list 67 | mix_json['categories'] = category_list 68 | json.dump(mix_json, open('datasets/bdd100k/annotations/mix_train_val.json','w')) 69 | -------------------------------------------------------------------------------- /yolox/exp/base_exp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
4 | 5 | import torch 6 | from torch.nn import Module 7 | 8 | from yolox.utils import LRScheduler 9 | 10 | import ast 11 | import pprint 12 | from abc import ABCMeta, abstractmethod 13 | from tabulate import tabulate 14 | from typing import Dict 15 | 16 | 17 | class BaseExp(metaclass=ABCMeta): 18 | """Basic class for any experiment.""" 19 | 20 | def __init__(self): 21 | self.seed = None 22 | self.output_dir = "./DiffusionTrack_outputs" 23 | self.print_interval = 100 24 | self.eval_interval = 10 25 | 26 | @abstractmethod 27 | def get_model(self) -> Module: 28 | pass 29 | 30 | @abstractmethod 31 | def get_data_loader( 32 | self, batch_size: int, is_distributed: bool 33 | ) -> Dict[str, torch.utils.data.DataLoader]: 34 | pass 35 | 36 | @abstractmethod 37 | def get_optimizer(self, batch_size: int) -> torch.optim.Optimizer: 38 | pass 39 | 40 | @abstractmethod 41 | def get_lr_scheduler( 42 | self, lr: float, iters_per_epoch: int, **kwargs 43 | ) -> LRScheduler: 44 | pass 45 | 46 | @abstractmethod 47 | def get_evaluator(self): 48 | pass 49 | 50 | @abstractmethod 51 | def eval(self, model, evaluator, weights): 52 | pass 53 | 54 | def __repr__(self): 55 | table_header = ["keys", "values"] 56 | exp_table = [ 57 | (str(k), pprint.pformat(v)) 58 | for k, v in vars(self).items() 59 | if not k.startswith("_") 60 | ] 61 | return tabulate(exp_table, headers=table_header, tablefmt="fancy_grid") 62 | 63 | def merge(self, cfg_list): 64 | assert len(cfg_list) % 2 == 0 65 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 66 | # only update value with same key 67 | if hasattr(self, k): 68 | src_value = getattr(self, k) 69 | src_type = type(src_value) 70 | if src_value is not None and src_type != type(v): 71 | try: 72 | v = src_type(v) 73 | except Exception: 74 | v = ast.literal_eval(v) 75 | setattr(self, k, v) 76 | -------------------------------------------------------------------------------- /tools/convert_crowdhuman_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = 'datasets/crowdhuman/' 7 | OUT_PATH = DATA_PATH + 'annotations/' 8 | SPLITS = ['val', 'train'] 9 | DEBUG = False 10 | 11 | def load_func(fpath): 12 | print('fpath', fpath) 13 | assert os.path.exists(fpath) 14 | with open(fpath,'r') as fid: 15 | lines = fid.readlines() 16 | records =[json.loads(line.strip('\n')) for line in lines] 17 | return records 18 | 19 | if __name__ == '__main__': 20 | if not os.path.exists(OUT_PATH): 21 | os.mkdir(OUT_PATH) 22 | for split in SPLITS: 23 | data_path = DATA_PATH + split 24 | out_path = OUT_PATH + '{}.json'.format(split) 25 | out = {'images': [], 'annotations': [], 'categories': [{'id': 1, 'name': 'person'}]} 26 | ann_path = DATA_PATH + 'annotation_{}.odgt'.format(split) 27 | anns_data = load_func(ann_path) 28 | image_cnt = 0 29 | ann_cnt = 0 30 | video_cnt = 0 31 | for ann_data in anns_data: 32 | image_cnt += 1 33 | file_path = DATA_PATH + 'CrowdHuman_{}/Images/'.format(split) + '{}.jpg'.format(ann_data['ID']) 34 | im = Image.open(file_path) 35 | image_info = {'file_name': '{}.jpg'.format(ann_data['ID']), 36 | 'id': image_cnt, 37 | 'height': im.size[1], 38 | 'width': im.size[0]} 39 | out['images'].append(image_info) 40 | if split != 'test': 41 | anns = ann_data['gtboxes'] 42 | for i in range(len(anns)): 43 | ann_cnt += 1 44 | fbox = anns[i]['fbox'] 45 | ann = {'id': ann_cnt, 46 | 'category_id': 1, 47 | 'image_id': image_cnt, 48 | 'track_id': -1, 49 | 'bbox_vis': 
anns[i]['vbox'], 50 | 'bbox': fbox, 51 | 'area': fbox[2] * fbox[3], 52 | 'iscrowd': 1 if 'extra' in anns[i] and \ 53 | 'ignore' in anns[i]['extra'] and \ 54 | anns[i]['extra']['ignore'] == 1 else 0} 55 | out['annotations'].append(ann) 56 | print('loaded {} for {} images and {} samples'.format(split, len(out['images']), len(out['annotations']))) 57 | json.dump(out, open(out_path, 'w')) -------------------------------------------------------------------------------- /tools/convert_ethz_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = 'datasets/ETHZ/' 7 | DATA_FILE_PATH = 'datasets/data_path/eth.train' 8 | OUT_PATH = DATA_PATH + 'annotations/' 9 | 10 | def load_paths(data_path): 11 | with open(data_path, 'r') as file: 12 | img_files = file.readlines() 13 | img_files = [x.replace('\n', '') for x in img_files] 14 | img_files = list(filter(lambda x: len(x) > 0, img_files)) 15 | label_files = [x.replace('images', 'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') for x in img_files] 16 | return img_files, label_files 17 | 18 | if __name__ == '__main__': 19 | if not os.path.exists(OUT_PATH): 20 | os.mkdir(OUT_PATH) 21 | 22 | out_path = OUT_PATH + 'train.json' 23 | out = {'images': [], 'annotations': [], 'categories': [{'id': 1, 'name': 'person'}]} 24 | img_paths, label_paths = load_paths(DATA_FILE_PATH) 25 | image_cnt = 0 26 | ann_cnt = 0 27 | video_cnt = 0 28 | for img_path, label_path in zip(img_paths, label_paths): 29 | image_cnt += 1 30 | im = Image.open(os.path.join("datasets", img_path)) 31 | image_info = {'file_name': img_path, 32 | 'id': image_cnt, 33 | 'height': im.size[1], 34 | 'width': im.size[0]} 35 | out['images'].append(image_info) 36 | # Load labels 37 | if os.path.isfile(os.path.join("datasets", label_path)): 38 | labels0 = np.loadtxt(os.path.join("datasets", label_path), dtype=np.float32).reshape(-1, 6) 39 | # Normalized xywh to pixel xyxy format 40 | labels = labels0.copy() 41 | labels[:, 2] = image_info['width'] * (labels0[:, 2] - labels0[:, 4] / 2) 42 | labels[:, 3] = image_info['height'] * (labels0[:, 3] - labels0[:, 5] / 2) 43 | labels[:, 4] = image_info['width'] * labels0[:, 4] 44 | labels[:, 5] = image_info['height'] * labels0[:, 5] 45 | else: 46 | labels = np.array([]) 47 | for i in range(len(labels)): 48 | ann_cnt += 1 49 | fbox = labels[i, 2:6].tolist() 50 | ann = {'id': ann_cnt, 51 | 'category_id': 1, 52 | 'image_id': image_cnt, 53 | 'track_id': -1, 54 | 'bbox': fbox, 55 | 'area': fbox[2] * fbox[3], 56 | 'iscrowd': 0} 57 | out['annotations'].append(ann) 58 | print('loaded train for {} images and {} samples'.format(len(out['images']), len(out['annotations']))) 59 | json.dump(out, open(out_path, 'w')) 60 | -------------------------------------------------------------------------------- /tools/convert_cityperson_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = 'datasets/Cityscapes/' 7 | DATA_FILE_PATH = 'datasets/data_path/citypersons.train' 8 | OUT_PATH = DATA_PATH + 'annotations/' 9 | 10 | def load_paths(data_path): 11 | with open(data_path, 'r') as file: 12 | img_files = file.readlines() 13 | img_files = [x.replace('\n', '') for x in img_files] 14 | img_files = list(filter(lambda x: len(x) > 0, img_files)) 15 | label_files = [x.replace('images', 
'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') for x in img_files] 16 | return img_files, label_files 17 | 18 | if __name__ == '__main__': 19 | if not os.path.exists(OUT_PATH): 20 | os.mkdir(OUT_PATH) 21 | 22 | out_path = OUT_PATH + 'train.json' 23 | out = {'images': [], 'annotations': [], 'categories': [{'id': 1, 'name': 'person'}]} 24 | img_paths, label_paths = load_paths(DATA_FILE_PATH) 25 | image_cnt = 0 26 | ann_cnt = 0 27 | video_cnt = 0 28 | for img_path, label_path in zip(img_paths, label_paths): 29 | image_cnt += 1 30 | im = Image.open(os.path.join("datasets", img_path)) 31 | image_info = {'file_name': img_path, 32 | 'id': image_cnt, 33 | 'height': im.size[1], 34 | 'width': im.size[0]} 35 | out['images'].append(image_info) 36 | # Load labels 37 | if os.path.isfile(os.path.join("datasets", label_path)): 38 | labels0 = np.loadtxt(os.path.join("datasets", label_path), dtype=np.float32).reshape(-1, 6) 39 | # Normalized xywh to pixel xyxy format 40 | labels = labels0.copy() 41 | labels[:, 2] = image_info['width'] * (labels0[:, 2] - labels0[:, 4] / 2) 42 | labels[:, 3] = image_info['height'] * (labels0[:, 3] - labels0[:, 5] / 2) 43 | labels[:, 4] = image_info['width'] * labels0[:, 4] 44 | labels[:, 5] = image_info['height'] * labels0[:, 5] 45 | else: 46 | labels = np.array([]) 47 | for i in range(len(labels)): 48 | ann_cnt += 1 49 | fbox = labels[i, 2:6].tolist() 50 | ann = {'id': ann_cnt, 51 | 'category_id': 1, 52 | 'image_id': image_cnt, 53 | 'track_id': -1, 54 | 'bbox': fbox, 55 | 'area': fbox[2] * fbox[3], 56 | 'iscrowd': 0} 57 | out['annotations'].append(ann) 58 | print('loaded train for {} images and {} samples'.format(len(out['images']), len(out['annotations']))) 59 | json.dump(out, open(out_path, 'w')) 60 | -------------------------------------------------------------------------------- /tools/mix_data_test_mot20.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | """ 6 | cd datasets 7 | mkdir -p mix_mot20_ch/annotations 8 | cp MOT20/annotations/val_half.json mix_mot20_ch/annotations/val_half.json 9 | cp MOT20/annotations/test.json mix_mot20_ch/annotations/test.json 10 | cd mix_mot20_ch 11 | ln -s ../MOT20/train mot20_train 12 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 13 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 14 | cd .. 
15 | """ 16 | 17 | mot_json = json.load(open('datasets/MOT20/annotations/train.json','r')) 18 | 19 | img_list = list() 20 | for img in mot_json['images']: 21 | img['file_name'] = 'mot20_train/' + img['file_name'] 22 | img_list.append(img) 23 | 24 | ann_list = list() 25 | for ann in mot_json['annotations']: 26 | ann_list.append(ann) 27 | 28 | video_list = mot_json['videos'] 29 | category_list = mot_json['categories'] 30 | 31 | 32 | max_img = 10000 33 | max_ann = 2000000 34 | max_video = 10 35 | 36 | crowdhuman_json = json.load(open('datasets/crowdhuman/annotations/train.json','r')) 37 | img_id_count = 0 38 | for img in crowdhuman_json['images']: 39 | img_id_count += 1 40 | img['file_name'] = 'crowdhuman_train/Images/' + img['file_name'] 41 | img['frame_id'] = img_id_count 42 | img['prev_image_id'] = img['id'] + max_img 43 | img['next_image_id'] = img['id'] + max_img 44 | img['id'] = img['id'] + max_img 45 | img['video_id'] = max_video 46 | img_list.append(img) 47 | 48 | for ann in crowdhuman_json['annotations']: 49 | ann['id'] = ann['id'] + max_ann 50 | ann['image_id'] = ann['image_id'] + max_img 51 | ann_list.append(ann) 52 | 53 | video_list.append({ 54 | 'id': max_video, 55 | 'file_name': 'crowdhuman_train' 56 | }) 57 | 58 | 59 | max_img = 30000 60 | max_ann = 10000000 61 | 62 | crowdhuman_val_json = json.load(open('datasets/crowdhuman/annotations/val.json','r')) 63 | img_id_count = 0 64 | for img in crowdhuman_val_json['images']: 65 | img_id_count += 1 66 | img['file_name'] = 'crowdhuman_val/Images/' + img['file_name'] 67 | img['frame_id'] = img_id_count 68 | img['prev_image_id'] = img['id'] + max_img 69 | img['next_image_id'] = img['id'] + max_img 70 | img['id'] = img['id'] + max_img 71 | img['video_id'] = max_video 72 | img_list.append(img) 73 | 74 | for ann in crowdhuman_val_json['annotations']: 75 | ann['id'] = ann['id'] + max_ann 76 | ann['image_id'] = ann['image_id'] + max_img 77 | ann_list.append(ann) 78 | 79 | video_list.append({ 80 | 'id': max_video, 81 | 'file_name': 'crowdhuman_val' 82 | }) 83 | 84 | mix_json = dict() 85 | mix_json['images'] = img_list 86 | mix_json['annotations'] = ann_list 87 | mix_json['videos'] = video_list 88 | mix_json['categories'] = category_list 89 | json.dump(mix_json, open('datasets/mix_mot20_ch/annotations/train.json','w')) -------------------------------------------------------------------------------- /yolox/utils/ema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | import torch 5 | import torch.nn as nn 6 | 7 | import math 8 | from copy import deepcopy 9 | 10 | 11 | def is_parallel(model): 12 | """check if model is in parallel mode.""" 13 | 14 | parallel_type = ( 15 | nn.parallel.DataParallel, 16 | nn.parallel.DistributedDataParallel, 17 | ) 18 | return isinstance(model, parallel_type) 19 | 20 | 21 | def copy_attr(a, b, include=(), exclude=()): 22 | # Copy attributes from b to a, options to only include [...] and to exclude [...] 23 | for k, v in b.__dict__.items(): 24 | if (len(include) and k not in include) or k.startswith("_") or k in exclude: 25 | continue 26 | else: 27 | setattr(a, k, v) 28 | 29 | 30 | class ModelEMA: 31 | """ 32 | Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models 33 | Keep a moving average of everything in the model state_dict (parameters and buffers). 
34 | This is intended to allow functionality like 35 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 36 | A smoothed version of the weights is necessary for some training schemes to perform well. 37 | This class is sensitive where it is initialized in the sequence of model init, 38 | GPU assignment and distributed training wrappers. 39 | """ 40 | 41 | def __init__(self, model, decay=0.9999, updates=0): 42 | """ 43 | Args: 44 | model (nn.Module): model to apply EMA. 45 | decay (float): ema decay reate. 46 | updates (int): counter of EMA updates. 47 | """ 48 | # Create EMA(FP32) 49 | self.ema = deepcopy(model.module if is_parallel(model) else model).eval() 50 | self.updates = updates 51 | # decay exponential ramp (to help early epochs) 52 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) 53 | for p in self.ema.parameters(): 54 | p.requires_grad_(False) 55 | 56 | def update(self, model): 57 | # Update EMA parameters 58 | with torch.no_grad(): 59 | self.updates += 1 60 | d = self.decay(self.updates) 61 | 62 | msd = ( 63 | model.module.state_dict() if is_parallel(model) else model.state_dict() 64 | ) # model state_dict 65 | for k, v in self.ema.state_dict().items(): 66 | if v.dtype.is_floating_point: 67 | v *= d 68 | v += (1.0 - d) * msd[k].detach() 69 | 70 | def update_attr(self, model, include=(), exclude=("process_group", "reducer")): 71 | # Update EMA attributes 72 | copy_attr(self.ema, model, include, exclude) 73 | -------------------------------------------------------------------------------- /yolox/models/yolo_fpn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .darknet import Darknet 9 | from .network_blocks import BaseConv 10 | 11 | 12 | class YOLOFPN(nn.Module): 13 | """ 14 | YOLOFPN module. Darknet 53 is the default backbone of this model. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | depth=53, 20 | in_features=["dark3", "dark4", "dark5"], 21 | ): 22 | super().__init__() 23 | 24 | self.backbone = Darknet(depth) 25 | self.in_features = in_features 26 | 27 | # out 1 28 | self.out1_cbl = self._make_cbl(512, 256, 1) 29 | self.out1 = self._make_embedding([256, 512], 512 + 256) 30 | 31 | # out 2 32 | self.out2_cbl = self._make_cbl(256, 128, 1) 33 | self.out2 = self._make_embedding([128, 256], 256 + 128) 34 | 35 | # upsample 36 | self.upsample = nn.Upsample(scale_factor=2, mode="nearest") 37 | 38 | def _make_cbl(self, _in, _out, ks): 39 | return BaseConv(_in, _out, ks, stride=1, act="lrelu") 40 | 41 | def _make_embedding(self, filters_list, in_filters): 42 | m = nn.Sequential( 43 | *[ 44 | self._make_cbl(in_filters, filters_list[0], 1), 45 | self._make_cbl(filters_list[0], filters_list[1], 3), 46 | self._make_cbl(filters_list[1], filters_list[0], 1), 47 | self._make_cbl(filters_list[0], filters_list[1], 3), 48 | self._make_cbl(filters_list[1], filters_list[0], 1), 49 | ] 50 | ) 51 | return m 52 | 53 | def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"): 54 | with open(filename, "rb") as f: 55 | state_dict = torch.load(f, map_location="cpu") 56 | print("loading pretrained weights...") 57 | self.backbone.load_state_dict(state_dict) 58 | 59 | def forward(self, inputs): 60 | """ 61 | Args: 62 | inputs (Tensor): input image. 63 | 64 | Returns: 65 | Tuple[Tensor]: FPN output features.. 
66 | """ 67 | # backbone 68 | out_features = self.backbone(inputs) 69 | x2, x1, x0 = [out_features[f] for f in self.in_features] 70 | 71 | # yolo branch 1 72 | x1_in = self.out1_cbl(x0) 73 | x1_in = self.upsample(x1_in) 74 | x1_in = torch.cat([x1_in, x1], 1) 75 | out_dark4 = self.out1(x1_in) 76 | 77 | # yolo branch 2 78 | x2_in = self.out2_cbl(out_dark4) 79 | x2_in = self.upsample(x2_in) 80 | x2_in = torch.cat([x2_in, x2], 1) 81 | out_dark3 = self.out2(x2_in) 82 | 83 | outputs = (out_dark3, out_dark4, x0) 84 | return outputs 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | datasets/* 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # output 132 | docs/api 133 | .code-workspace.code-workspace 134 | *.pkl 135 | *.npy 136 | *.pth 137 | *.onnx 138 | *.engine 139 | events.out.tfevents* 140 | pretrained 141 | *_outputs/ 142 | DiffusionTrack_*/ 143 | datasets/ 144 | *.pth.tar 145 | *.tar.gz 146 | src/* 147 | test.py 148 | id_rsa_cs 149 | module_test.py 150 | vis_fold -------------------------------------------------------------------------------- /tools/mix_data_ablation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | """ 6 | cd datasets 7 | mkdir -p mix_mot_ch/annotations 8 | cp mot/annotations/val_half.json mix_mot_ch/annotations/val_half.json 9 | cp mot/annotations/test.json mix_mot_ch/annotations/test.json 10 | cd mix_mot_ch 11 | ln -s ../mot/train mot_train 12 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 13 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 14 | cd .. 15 | """ 16 | 17 | mot_json = json.load(open('datasets/mot/annotations/train_half.json','r')) 18 | 19 | img_list = list() 20 | for img in mot_json['images']: 21 | img['file_name'] = 'mot_train/' + img['file_name'] 22 | img_list.append(img) 23 | 24 | ann_list = list() 25 | for ann in mot_json['annotations']: 26 | ann_list.append(ann) 27 | 28 | video_list = mot_json['videos'] 29 | category_list = mot_json['categories'] 30 | 31 | print('mot17') 32 | 33 | max_img = 10000 34 | max_ann = 2000000 35 | max_video = 10 36 | 37 | crowdhuman_json = json.load(open('datasets/crowdhuman/annotations/train.json','r')) 38 | img_id_count = 0 39 | for img in crowdhuman_json['images']: 40 | img_id_count += 1 41 | img['file_name'] = 'crowdhuman_train/Images/' + img['file_name'] 42 | img['frame_id'] = img_id_count 43 | img['prev_image_id'] = img['id'] + max_img 44 | img['next_image_id'] = img['id'] + max_img 45 | img['id'] = img['id'] + max_img 46 | img['video_id'] = max_video 47 | img_list.append(img) 48 | 49 | for ann in crowdhuman_json['annotations']: 50 | ann['id'] = ann['id'] + max_ann 51 | ann['image_id'] = ann['image_id'] + max_img 52 | ann_list.append(ann) 53 | 54 | video_list.append({ 55 | 'id': max_video, 56 | 'file_name': 'crowdhuman_train' 57 | }) 58 | 59 | print('crowdhuman_train') 60 | 61 | max_img = 30000 62 | max_ann = 10000000 63 | 64 | crowdhuman_val_json = json.load(open('datasets/crowdhuman/annotations/val.json','r')) 65 | img_id_count = 0 66 | for img in crowdhuman_val_json['images']: 67 | img_id_count += 1 68 | img['file_name'] = 'crowdhuman_val/Images/' + img['file_name'] 69 | img['frame_id'] = img_id_count 70 | img['prev_image_id'] = img['id'] + max_img 71 | img['next_image_id'] = img['id'] + max_img 72 | img['id'] = img['id'] + max_img 73 | img['video_id'] = max_video 74 | img_list.append(img) 75 | 76 | for ann in crowdhuman_val_json['annotations']: 77 | ann['id'] = ann['id'] + max_ann 78 | ann['image_id'] = ann['image_id'] + max_img 79 | ann_list.append(ann) 80 | 81 | 
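# [Editor's note, descriptive comment not in the original] The fixed offsets used above
# (max_img, max_ann, max_video) are added to every CrowdHuman image/annotation id so that
# the ids stay disjoint from the MOT half-train ids when the COCO-style dicts are simply
# concatenated; each CrowdHuman split is also registered below as one pseudo "video" entry,
# with sequential frame_id values, so loaders that expect video_id/frame_id keep working.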
video_list.append({ 82 | 'id': max_video, 83 | 'file_name': 'crowdhuman_val' 84 | }) 85 | 86 | print('crowdhuman_val') 87 | 88 | mix_json = dict() 89 | mix_json['images'] = img_list 90 | mix_json['annotations'] = ann_list 91 | mix_json['videos'] = video_list 92 | mix_json['categories'] = category_list 93 | json.dump(mix_json, open('datasets/mix_mot_ch/annotations/train.json','w')) -------------------------------------------------------------------------------- /yolox/utils/logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | from loguru import logger 6 | 7 | import inspect 8 | import os 9 | import sys 10 | 11 | 12 | def get_caller_name(depth=0): 13 | """ 14 | Args: 15 | depth (int): Depth of caller conext, use 0 for caller depth. Default value: 0. 16 | 17 | Returns: 18 | str: module name of the caller 19 | """ 20 | # the following logic is a little bit faster than inspect.stack() logic 21 | frame = inspect.currentframe().f_back 22 | for _ in range(depth): 23 | frame = frame.f_back 24 | 25 | return frame.f_globals["__name__"] 26 | 27 | 28 | class StreamToLoguru: 29 | """ 30 | stream object that redirects writes to a logger instance. 31 | """ 32 | 33 | def __init__(self, level="INFO", caller_names=("apex", "pycocotools")): 34 | """ 35 | Args: 36 | level(str): log level string of loguru. Default value: "INFO". 37 | caller_names(tuple): caller names of redirected module. 38 | Default value: (apex, pycocotools). 39 | """ 40 | self.level = level 41 | self.linebuf = "" 42 | self.caller_names = caller_names 43 | 44 | def write(self, buf): 45 | full_name = get_caller_name(depth=1) 46 | module_name = full_name.rsplit(".", maxsplit=-1)[0] 47 | if module_name in self.caller_names: 48 | for line in buf.rstrip().splitlines(): 49 | # use caller level log 50 | logger.opt(depth=2).log(self.level, line.rstrip()) 51 | else: 52 | sys.__stdout__.write(buf) 53 | 54 | def flush(self): 55 | pass 56 | 57 | 58 | def redirect_sys_output(log_level="INFO"): 59 | redirect_logger = StreamToLoguru(log_level) 60 | sys.stderr = redirect_logger 61 | sys.stdout = redirect_logger 62 | 63 | 64 | def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"): 65 | """setup logger for training and testing. 66 | Args: 67 | save_dir(str): location to save log file 68 | distributed_rank(int): device rank when multi-gpu environment 69 | filename (string): log save name. 70 | mode(str): log file write mode, `append` or `override`. default is `a`. 71 | 72 | Return: 73 | logger instance. 74 | """ 75 | loguru_format = ( 76 | "{time:YYYY-MM-DD HH:mm:ss} | " 77 | "{level: <8} | " 78 | "{name}:{line} - {message}" 79 | ) 80 | 81 | logger.remove() 82 | save_file = os.path.join(save_dir, filename) 83 | if mode == "o" and os.path.exists(save_file): 84 | os.remove(save_file) 85 | # only keep logger in rank0 process 86 | if distributed_rank == 0: 87 | logger.add( 88 | sys.stderr, 89 | format=loguru_format, 90 | level="INFO", 91 | enqueue=True, 92 | ) 93 | logger.add(save_file) 94 | 95 | # redirect stdout/stderr to loguru 96 | redirect_sys_output("INFO") 97 | -------------------------------------------------------------------------------- /yolox/utils/demo_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
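# [Editor's note, not part of the original file] These are pure-NumPy demo helpers:
# nms()/multiclass_nms() run hard non-maximum suppression on (x1, y1, x2, y2) boxes, and
# demo_postprocess() decodes raw YOLOX head outputs by adding the per-cell grid offsets and
# multiplying by the stride (8/16/32, plus 64 when p6=True), with exp() applied to the
# width/height channels. A tiny illustrative NMS call:
#     boxes = np.array([[0, 0, 10, 10], [1, 1, 11, 11]], dtype=np.float32)
#     scores = np.array([0.9, 0.8], dtype=np.float32)
#     nms(boxes, scores, nms_thr=0.5)   # -> [0]; the second box overlaps at ~0.70 IoU and is dropped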
4 | 5 | import numpy as np 6 | 7 | import os 8 | 9 | __all__ = ["mkdir", "nms", "multiclass_nms", "demo_postprocess"] 10 | 11 | 12 | def mkdir(path): 13 | if not os.path.exists(path): 14 | os.makedirs(path) 15 | 16 | 17 | def nms(boxes, scores, nms_thr): 18 | """Single class NMS implemented in Numpy.""" 19 | x1 = boxes[:, 0] 20 | y1 = boxes[:, 1] 21 | x2 = boxes[:, 2] 22 | y2 = boxes[:, 3] 23 | 24 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | order = scores.argsort()[::-1] 26 | 27 | keep = [] 28 | while order.size > 0: 29 | i = order[0] 30 | keep.append(i) 31 | xx1 = np.maximum(x1[i], x1[order[1:]]) 32 | yy1 = np.maximum(y1[i], y1[order[1:]]) 33 | xx2 = np.minimum(x2[i], x2[order[1:]]) 34 | yy2 = np.minimum(y2[i], y2[order[1:]]) 35 | 36 | w = np.maximum(0.0, xx2 - xx1 + 1) 37 | h = np.maximum(0.0, yy2 - yy1 + 1) 38 | inter = w * h 39 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 40 | 41 | inds = np.where(ovr <= nms_thr)[0] 42 | order = order[inds + 1] 43 | 44 | return keep 45 | 46 | 47 | def multiclass_nms(boxes, scores, nms_thr, score_thr): 48 | """Multiclass NMS implemented in Numpy""" 49 | final_dets = [] 50 | num_classes = scores.shape[1] 51 | for cls_ind in range(num_classes): 52 | cls_scores = scores[:, cls_ind] 53 | valid_score_mask = cls_scores > score_thr 54 | if valid_score_mask.sum() == 0: 55 | continue 56 | else: 57 | valid_scores = cls_scores[valid_score_mask] 58 | valid_boxes = boxes[valid_score_mask] 59 | keep = nms(valid_boxes, valid_scores, nms_thr) 60 | if len(keep) > 0: 61 | cls_inds = np.ones((len(keep), 1)) * cls_ind 62 | dets = np.concatenate( 63 | [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 64 | ) 65 | final_dets.append(dets) 66 | if len(final_dets) == 0: 67 | return None 68 | return np.concatenate(final_dets, 0) 69 | 70 | 71 | def demo_postprocess(outputs, img_size, p6=False): 72 | 73 | grids = [] 74 | expanded_strides = [] 75 | 76 | if not p6: 77 | strides = [8, 16, 32] 78 | else: 79 | strides = [8, 16, 32, 64] 80 | 81 | hsizes = [img_size[0] // stride for stride in strides] 82 | wsizes = [img_size[1] // stride for stride in strides] 83 | 84 | for hsize, wsize, stride in zip(hsizes, wsizes, strides): 85 | xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) 86 | grid = np.stack((xv, yv), 2).reshape(1, -1, 2) 87 | grids.append(grid) 88 | shape = grid.shape[:2] 89 | expanded_strides.append(np.full((*shape, 1), stride)) 90 | 91 | grids = np.concatenate(grids, 1) 92 | expanded_strides = np.concatenate(expanded_strides, 1) 93 | outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides 94 | outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides 95 | 96 | return outputs 97 | -------------------------------------------------------------------------------- /yolox/models/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
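# [Editor's note, not part of the original file] IOUloss below expects boxes in
# (cx, cy, w, h) form; loss_type="iou" returns 1 - IoU**2, while loss_type="giou" subtracts
# an enclosing-box penalty, giou = IoU - (area_c - area_i) / area_c (note this listing uses
# the intersection area in that penalty, where the original GIoU formulation uses the union).
# sigmoid_focal_loss() is the RetinaNet focal loss,
#     FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t),
# summed over all elements and divided by num_boxes.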
4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class IOUloss(nn.Module): 11 | def __init__(self, reduction="none", loss_type="iou"): 12 | super(IOUloss, self).__init__() 13 | self.reduction = reduction 14 | self.loss_type = loss_type 15 | 16 | def forward(self, pred, target): 17 | assert pred.shape[0] == target.shape[0] 18 | 19 | pred = pred.view(-1, 4) 20 | target = target.view(-1, 4) 21 | tl = torch.max( 22 | (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) 23 | ) 24 | br = torch.min( 25 | (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) 26 | ) 27 | 28 | area_p = torch.prod(pred[:, 2:], 1) 29 | area_g = torch.prod(target[:, 2:], 1) 30 | 31 | en = (tl < br).type(tl.type()).prod(dim=1) 32 | area_i = torch.prod(br - tl, 1) * en 33 | iou = (area_i) / (area_p + area_g - area_i + 1e-16) 34 | 35 | if self.loss_type == "iou": 36 | loss = 1 - iou ** 2 37 | elif self.loss_type == "giou": 38 | c_tl = torch.min( 39 | (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) 40 | ) 41 | c_br = torch.max( 42 | (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) 43 | ) 44 | area_c = torch.prod(c_br - c_tl, 1) 45 | giou = iou - (area_c - area_i) / area_c.clamp(1e-16) 46 | loss = 1 - giou.clamp(min=-1.0, max=1.0) 47 | 48 | if self.reduction == "mean": 49 | loss = loss.mean() 50 | elif self.reduction == "sum": 51 | loss = loss.sum() 52 | 53 | return loss 54 | 55 | 56 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 57 | """ 58 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 59 | Args: 60 | inputs: A float tensor of arbitrary shape. 61 | The predictions for each example. 62 | targets: A float tensor with the same shape as inputs. Stores the binary 63 | classification label for each element in inputs 64 | (0 for the negative class and 1 for the positive class). 65 | alpha: (optional) Weighting factor in range (0,1) to balance 66 | positive vs negative examples. Default = -1 (no weighting). 67 | gamma: Exponent of the modulating factor (1 - p_t) to 68 | balance easy vs hard examples. 69 | Returns: 70 | Loss tensor 71 | """ 72 | prob = inputs.sigmoid() 73 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 74 | p_t = prob * targets + (1 - prob) * (1 - targets) 75 | loss = ce_loss * ((1 - p_t) ** gamma) 76 | 77 | if alpha >= 0: 78 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 79 | loss = alpha_t * loss 80 | #return loss.mean(0).sum() / num_boxes 81 | return loss.sum() / num_boxes -------------------------------------------------------------------------------- /exps/default/yolov3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 
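# Illustrative, commented-out sketch of how an experiment file like this one
# is usually consumed via `get_exp` (as done in tools/train.py); the path and
# batch size are placeholders.
#
# from yolox.exp import get_exp
#
# exp = get_exp("exps/default/yolov3.py", None)
# model = exp.get_model()                          # builds the YOLOX model described below
# # exp.get_data_loader(batch_size=8, is_distributed=False) then builds the loader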
4 | 5 | import os 6 | import torch 7 | import torch.nn as nn 8 | 9 | from yolox.exp import Exp as MyExp 10 | 11 | 12 | class Exp(MyExp): 13 | def __init__(self): 14 | super(Exp, self).__init__() 15 | self.depth = 1.0 16 | self.width = 1.0 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | 19 | def get_model(self, sublinear=False): 20 | def init_yolo(M): 21 | for m in M.modules(): 22 | if isinstance(m, nn.BatchNorm2d): 23 | m.eps = 1e-3 24 | m.momentum = 0.03 25 | if "model" not in self.__dict__: 26 | from yolox.models import YOLOX, YOLOFPN, YOLOXHead 27 | backbone = YOLOFPN() 28 | head = YOLOXHead(self.num_classes, self.width, in_channels=[128, 256, 512], act="lrelu") 29 | self.model = YOLOX(backbone, head) 30 | self.model.apply(init_yolo) 31 | self.model.head.initialize_biases(1e-2) 32 | 33 | return self.model 34 | 35 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 36 | from data.datasets.cocodataset import COCODataset 37 | from data.datasets.mosaicdetection import MosaicDetection 38 | from data.datasets.data_augment import TrainTransform 39 | from data.datasets.dataloading import YoloBatchSampler, DataLoader, InfiniteSampler 40 | import torch.distributed as dist 41 | 42 | dataset = COCODataset( 43 | data_dir='data/COCO/', 44 | json_file=self.train_ann, 45 | img_size=self.input_size, 46 | preproc=TrainTransform( 47 | rgb_means=(0.485, 0.456, 0.406), 48 | std=(0.229, 0.224, 0.225), 49 | max_labels=50 50 | ), 51 | ) 52 | 53 | dataset = MosaicDetection( 54 | dataset, 55 | mosaic=not no_aug, 56 | img_size=self.input_size, 57 | preproc=TrainTransform( 58 | rgb_means=(0.485, 0.456, 0.406), 59 | std=(0.229, 0.224, 0.225), 60 | max_labels=120 61 | ), 62 | degrees=self.degrees, 63 | translate=self.translate, 64 | scale=self.scale, 65 | shear=self.shear, 66 | perspective=self.perspective, 67 | ) 68 | 69 | self.dataset = dataset 70 | 71 | if is_distributed: 72 | batch_size = batch_size // dist.get_world_size() 73 | sampler = InfiniteSampler(len(self.dataset), seed=self.seed if self.seed else 0) 74 | else: 75 | sampler = torch.utils.data.RandomSampler(self.dataset) 76 | 77 | batch_sampler = YoloBatchSampler( 78 | sampler=sampler, 79 | batch_size=batch_size, 80 | drop_last=False, 81 | input_dimension=self.input_size, 82 | mosaic=not no_aug 83 | ) 84 | 85 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 86 | dataloader_kwargs["batch_sampler"] = batch_sampler 87 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 88 | 89 | return train_loader 90 | -------------------------------------------------------------------------------- /diffusion/models/diffusionnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from typing import List 4 | from collections import namedtuple 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import nn 9 | from yolox.models.yolo_pafpn import YOLOPAFPN 10 | from .diffusion_head import DiffusionHead 11 | from yolox.models.network_blocks import BaseConv 12 | 13 | class DiffusionNet(nn.Module): 14 | """ 15 | Implement DiffusionNet 16 | """ 17 | 18 | def __init__(self, backbone=None, head=None, act="silu"): 19 | super().__init__() 20 | self.backbone=backbone 21 | self.head=head 22 | self.projs=nn.ModuleList() 23 | in_channels=backbone.in_channels 24 | for i in range(len(in_channels)): 25 | self.projs.append( 26 | BaseConv( 27 | in_channels=int(in_channels[i] * head.width), 28 | 
out_channels=int(head.hidden_dim), 29 | ksize=1, 30 | stride=1, 31 | act=act, 32 | )) 33 | 34 | def forward(self, x, targets=(None,None),random_flip=False,input_size=None): 35 | # fpn output content features of [dark3, dark4, dark5] 36 | # x format (pre_imgs,cur_imgs) (B,C,H,W) 37 | # targets format (pre_targets,cur_targets) (B,N,5) class cx cy w h 38 | pre_imgs,cur_imgs=x 39 | pre_targets,cur_targets=targets 40 | mate_info=(pre_imgs.shape,pre_imgs.device,pre_imgs.dtype) 41 | bs,_,_,_=mate_info[0] 42 | if cur_imgs is None: 43 | x_input=pre_imgs 44 | else: 45 | x_input=torch.cat([pre_imgs,cur_imgs],dim=0) 46 | 47 | fpn_outs = self.backbone(x_input) 48 | flip_mode=False 49 | if random_flip and torch.randn((1,1))[0]>0.5: 50 | flip_mode=True 51 | pre_features,cur_features=[],[] 52 | 53 | for proj,x_out in zip(self.projs,fpn_outs): 54 | l_feat=proj(x_out) 55 | if cur_imgs is None: 56 | pre_features.append(l_feat) 57 | if flip_mode: 58 | cur_features.append(torch.flip(l_feat,dims=[3])) 59 | else: 60 | cur_features.append(l_feat.clone()) 61 | else: 62 | pre_l_feat,cur_l_feat=l_feat.split(bs,dim=0) 63 | pre_features.append(pre_l_feat) 64 | cur_features.append(cur_l_feat) 65 | 66 | features=(pre_features,cur_features) 67 | 68 | if self.training: 69 | assert pre_targets is not None 70 | if cur_targets is None: 71 | cur_targets=pre_targets.clone() 72 | if flip_mode: 73 | nlabels=(cur_targets.sum(-1)>0).sum(-1) 74 | for idx,nlabel in enumerate(nlabels): 75 | cur_targets[idx,:nlabel,1]=input_size[1]-cur_targets[idx,:nlabel,1] 76 | loss_dict = self.head( 77 | features,mate_info,targets=torch.cat([pre_targets,cur_targets],dim=0)) 78 | if 'total_loss' not in loss_dict: 79 | loss_dict['total_loss']=sum(loss_dict.values()) 80 | outputs=loss_dict 81 | return outputs 82 | else: 83 | outputs = self.head(features,mate_info,targets=pre_targets) 84 | 85 | return outputs 86 | 87 | 88 | -------------------------------------------------------------------------------- /yolox/utils/allreduce_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
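# Illustrative, commented-out sketch: in a multi-GPU run, the normalization
# statistics gathered below are typically averaged across ranks before
# evaluation; `model` and `is_distributed` are placeholders from a training loop.
#
# from yolox.utils import all_reduce_norm
#
# if is_distributed:
#     all_reduce_norm(model)  # average BN/IN running stats over all ranks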
4 | 5 | import torch 6 | from torch import distributed as dist 7 | from torch import nn 8 | 9 | import pickle 10 | from collections import OrderedDict 11 | 12 | from .dist import _get_global_gloo_group, get_world_size 13 | 14 | ASYNC_NORM = ( 15 | nn.BatchNorm1d, 16 | nn.BatchNorm2d, 17 | nn.BatchNorm3d, 18 | nn.InstanceNorm1d, 19 | nn.InstanceNorm2d, 20 | nn.InstanceNorm3d, 21 | ) 22 | 23 | __all__ = [ 24 | "get_async_norm_states", 25 | "pyobj2tensor", 26 | "tensor2pyobj", 27 | "all_reduce", 28 | "all_reduce_norm", 29 | ] 30 | 31 | 32 | def get_async_norm_states(module): 33 | async_norm_states = OrderedDict() 34 | for name, child in module.named_modules(): 35 | if isinstance(child, ASYNC_NORM): 36 | for k, v in child.state_dict().items(): 37 | async_norm_states[".".join([name, k])] = v 38 | return async_norm_states 39 | 40 | 41 | def pyobj2tensor(pyobj, device="cuda"): 42 | """serialize picklable python object to tensor""" 43 | storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj)) 44 | return torch.ByteTensor(storage).to(device=device) 45 | 46 | 47 | def tensor2pyobj(tensor): 48 | """deserialize tensor to picklable python object""" 49 | return pickle.loads(tensor.cpu().numpy().tobytes()) 50 | 51 | 52 | def _get_reduce_op(op_name): 53 | return { 54 | "sum": dist.ReduceOp.SUM, 55 | "mean": dist.ReduceOp.SUM, 56 | }[op_name.lower()] 57 | 58 | 59 | def all_reduce(py_dict, op="sum", group=None): 60 | """ 61 | Apply all reduce function for python dict object. 62 | NOTE: make sure that every py_dict has the same keys and values are in the same shape. 63 | 64 | Args: 65 | py_dict (dict): dict to apply all reduce op. 66 | op (str): operator, could be "sum" or "mean". 67 | """ 68 | world_size = get_world_size() 69 | if world_size == 1: 70 | return py_dict 71 | if group is None: 72 | group = _get_global_gloo_group() 73 | if dist.get_world_size(group) == 1: 74 | return py_dict 75 | 76 | # all reduce logic across different devices. 77 | py_key = list(py_dict.keys()) 78 | py_key_tensor = pyobj2tensor(py_key) 79 | dist.broadcast(py_key_tensor, src=0) 80 | py_key = tensor2pyobj(py_key_tensor) 81 | 82 | tensor_shapes = [py_dict[k].shape for k in py_key] 83 | tensor_numels = [py_dict[k].numel() for k in py_key] 84 | 85 | flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key]) 86 | dist.all_reduce(flatten_tensor, op=_get_reduce_op(op)) 87 | if op == "mean": 88 | flatten_tensor /= world_size 89 | 90 | split_tensors = [ 91 | x.reshape(shape) 92 | for x, shape in zip(torch.split(flatten_tensor, tensor_numels), tensor_shapes) 93 | ] 94 | return OrderedDict({k: v for k, v in zip(py_key, split_tensors)}) 95 | 96 | 97 | def all_reduce_norm(module): 98 | """ 99 | All reduce norm statistics in different devices. 100 | """ 101 | states = get_async_norm_states(module) 102 | states = all_reduce(states, op="mean") 103 | module.load_state_dict(states, strict=False) 104 | -------------------------------------------------------------------------------- /yolox/data/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 
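# Illustrative, commented-out sketch of the intended call pattern;
# `train_loader` and `iters_per_epoch` are placeholders from a training loop.
#
# prefetcher = DataPrefetcher(train_loader, task="tracking")
# for _ in range(iters_per_epoch):
#     inp_pre, tgt_pre, inp_cur, tgt_cur = prefetcher.next()
#     # inp_cur / tgt_cur stay None when task != "tracking"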
4 | 5 | import torch 6 | import torch.distributed as dist 7 | 8 | from yolox.utils import synchronize 9 | 10 | import random 11 | 12 | 13 | class DataPrefetcher: 14 | """ 15 | DataPrefetcher is inspired by code of following file: 16 | https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py 17 | It could speedup your pytorch dataloader. For more information, please check 18 | https://github.com/NVIDIA/apex/issues/304#issuecomment-493562789. 19 | """ 20 | 21 | def __init__(self, loader,task): 22 | self.loader = iter(loader) 23 | self.task=task 24 | self.stream = torch.cuda.Stream() 25 | self.record_stream = DataPrefetcher._record_stream_for_image 26 | self.preload() 27 | 28 | def preload(self): 29 | try: 30 | if self.task=="tracking": 31 | self.next_input_pre, self.next_target_pre,self.next_input_cur, self.next_target_cur,_, _ = next(self.loader) 32 | else: 33 | self.next_input_pre, self.next_target_pre, _, _ = next(self.loader) 34 | except StopIteration: 35 | self.next_input_pre = None 36 | self.next_target_pre = None 37 | if self.task=="tracking": 38 | self.next_input_cur = None 39 | self.next_target_cur = None 40 | return 41 | 42 | with torch.cuda.stream(self.stream): 43 | self.next_input_pre = self.next_input_pre.cuda(non_blocking=True) 44 | self.next_target_pre = self.next_target_pre.cuda(non_blocking=True) 45 | if self.task=="tracking": 46 | self.next_input_cur = self.next_input_cur.cuda(non_blocking=True) 47 | self.next_target_cur = self.next_target_cur.cuda(non_blocking=True) 48 | 49 | 50 | def next(self): 51 | torch.cuda.current_stream().wait_stream(self.stream) 52 | input_pre = self.next_input_pre 53 | target_pre = self.next_target_pre 54 | input_cur = None 55 | target_cur = None 56 | if self.task=="tracking": 57 | input_cur = self.next_input_cur 58 | target_cur = self.next_target_cur 59 | if input_pre is not None: 60 | self.record_stream(input_pre) 61 | if target_pre is not None: 62 | target_pre.record_stream(torch.cuda.current_stream()) 63 | if self.task=="tracking": 64 | if input_cur is not None: 65 | self.record_stream(input_cur) 66 | if target_cur is not None: 67 | target_cur.record_stream(torch.cuda.current_stream()) 68 | self.preload() 69 | return input_pre,target_pre,input_cur,target_cur 70 | 71 | 72 | @staticmethod 73 | def _record_stream_for_image(input): 74 | input.record_stream(torch.cuda.current_stream()) 75 | 76 | 77 | def random_resize(data_loader, exp, epoch, rank, is_distributed): 78 | tensor = torch.LongTensor(1).cuda() 79 | if is_distributed: 80 | synchronize() 81 | 82 | if rank == 0: 83 | if epoch > exp.max_epoch - 10: 84 | size = exp.input_size 85 | else: 86 | size = random.randint(*exp.random_size) 87 | size = int(32 * size) 88 | tensor.fill_(size) 89 | 90 | if is_distributed: 91 | synchronize() 92 | dist.broadcast(tensor, 0) 93 | 94 | input_size = data_loader.change_input_dim(multiple=tensor.item(), random_range=None) 95 | return -------------------------------------------------------------------------------- /yolox/data/samplers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 
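# Illustrative, commented-out sketch showing how the two samplers defined
# below are typically combined; `dataset`, the batch size and the input
# dimension are placeholders.
#
# import torch
#
# sampler = InfiniteSampler(len(dataset), shuffle=True, seed=0)
# batch_sampler = YoloBatchSampler(
#     sampler=sampler,
#     batch_size=8,
#     drop_last=False,
#     input_dimension=(800, 1440),
#     mosaic=True,
# )
# loader = torch.utils.data.DataLoader(
#     dataset, batch_sampler=batch_sampler, num_workers=4, pin_memory=True
# )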
4 | 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import BatchSampler as torchBatchSampler 8 | from torch.utils.data.sampler import Sampler 9 | 10 | import itertools 11 | from typing import Optional 12 | 13 | 14 | class YoloBatchSampler(torchBatchSampler): 15 | """ 16 | This batch sampler will generate mini-batches of (dim, index) tuples from another sampler. 17 | It works just like the :class:`torch.utils.data.sampler.BatchSampler`, 18 | but it will prepend a dimension, whilst ensuring it stays the same across one mini-batch. 19 | """ 20 | 21 | def __init__(self, *args, input_dimension=None, mosaic=True, **kwargs): 22 | super().__init__(*args, **kwargs) 23 | self.input_dim = input_dimension 24 | self.new_input_dim = None 25 | self.mosaic = mosaic 26 | 27 | def __iter__(self): 28 | self.__set_input_dim() 29 | for batch in super().__iter__(): 30 | yield [(self.input_dim, idx, self.mosaic) for idx in batch] 31 | self.__set_input_dim() 32 | 33 | def __set_input_dim(self): 34 | """ This function randomly changes the the input dimension of the dataset. """ 35 | if self.new_input_dim is not None: 36 | self.input_dim = (self.new_input_dim[0], self.new_input_dim[1]) 37 | self.new_input_dim = None 38 | 39 | 40 | class InfiniteSampler(Sampler): 41 | """ 42 | In training, we only care about the "infinite stream" of training data. 43 | So this sampler produces an infinite stream of indices and 44 | all workers cooperate to correctly shuffle the indices and sample different indices. 45 | The samplers in each worker effectively produces `indices[worker_id::num_workers]` 46 | where `indices` is an infinite stream of indices consisting of 47 | `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True) 48 | or `range(size) + range(size) + ...` (if shuffle is False) 49 | """ 50 | 51 | def __init__( 52 | self, 53 | size: int, 54 | shuffle: bool = True, 55 | seed: Optional[int] = 0, 56 | rank=0, 57 | world_size=1, 58 | ): 59 | """ 60 | Args: 61 | size (int): the total number of data of the underlying dataset to sample from 62 | shuffle (bool): whether to shuffle the indices or not 63 | seed (int): the initial seed of the shuffle. Must be the same 64 | across all workers. If None, will use a random seed shared 65 | among workers (require synchronization among all workers). 66 | """ 67 | self._size = size 68 | assert size > 0 69 | self._shuffle = shuffle 70 | self._seed = int(seed) 71 | 72 | if dist.is_available() and dist.is_initialized(): 73 | self._rank = dist.get_rank() 74 | self._world_size = dist.get_world_size() 75 | else: 76 | self._rank = rank 77 | self._world_size = world_size 78 | 79 | def __iter__(self): 80 | start = self._rank 81 | yield from itertools.islice( 82 | self._infinite_indices(), start, None, self._world_size 83 | ) 84 | 85 | def _infinite_indices(self): 86 | g = torch.Generator() 87 | g.manual_seed(self._seed) 88 | while True: 89 | if self._shuffle: 90 | yield from torch.randperm(self._size, generator=g) 91 | else: 92 | yield from torch.arange(self._size) 93 | 94 | def __len__(self): 95 | return self._size // self._world_size 96 | -------------------------------------------------------------------------------- /yolox/utils/metric.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
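# Illustrative, commented-out sketch of the meter utilities defined below;
# the numbers are made up.
#
# meters = MeterBuffer(window_size=20)
# meters.update(iter_time=0.21, total_loss=3.4)
# meters.update(iter_time=0.19, total_loss=3.1)
# avg_loss = meters["total_loss"].avg          # windowed average
# avg_iter = meters["iter_time"].global_avg    # average over all updates
# time_meters = meters.get_filtered_meter("time")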
4 | import numpy as np 5 | 6 | import torch 7 | 8 | import functools 9 | import os 10 | import time 11 | from collections import defaultdict, deque 12 | 13 | __all__ = [ 14 | "AverageMeter", 15 | "MeterBuffer", 16 | "get_total_and_free_memory_in_Mb", 17 | "occupy_mem", 18 | "gpu_mem_usage", 19 | ] 20 | 21 | 22 | def get_total_and_free_memory_in_Mb(cuda_device): 23 | devices_info_str = os.popen( 24 | "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader" 25 | ) 26 | devices_info = devices_info_str.read().strip().split("\n") 27 | total, used = devices_info[int(cuda_device)].split(",") 28 | return int(total), int(used) 29 | 30 | 31 | def occupy_mem(cuda_device, mem_ratio=0.95): 32 | """ 33 | pre-allocate gpu memory for training to avoid memory Fragmentation. 34 | """ 35 | total, used = get_total_and_free_memory_in_Mb(cuda_device) 36 | max_mem = int(total * mem_ratio) 37 | block_mem = max_mem - used 38 | x = torch.cuda.FloatTensor(256, 1024, block_mem) 39 | del x 40 | time.sleep(5) 41 | 42 | 43 | def gpu_mem_usage(): 44 | """ 45 | Compute the GPU memory usage for the current device (MB). 46 | """ 47 | mem_usage_bytes = torch.cuda.max_memory_allocated() 48 | return mem_usage_bytes / (1024 * 1024) 49 | 50 | 51 | class AverageMeter: 52 | """Track a series of values and provide access to smoothed values over a 53 | window or the global series average. 54 | """ 55 | 56 | def __init__(self, window_size=50): 57 | self._deque = deque(maxlen=window_size) 58 | self._total = 0.0 59 | self._count = 0 60 | 61 | def update(self, value): 62 | self._deque.append(value) 63 | self._count += 1 64 | self._total += value 65 | 66 | @property 67 | def median(self): 68 | d = np.array(list(self._deque)) 69 | return np.median(d) 70 | 71 | @property 72 | def avg(self): 73 | # if deque is empty, nan will be returned. 74 | d = np.array(list(self._deque)) 75 | return d.mean() 76 | 77 | @property 78 | def global_avg(self): 79 | return self._total / max(self._count, 1e-5) 80 | 81 | @property 82 | def latest(self): 83 | return self._deque[-1] if len(self._deque) > 0 else None 84 | 85 | @property 86 | def total(self): 87 | return self._total 88 | 89 | def reset(self): 90 | self._deque.clear() 91 | self._total = 0.0 92 | self._count = 0 93 | 94 | def clear(self): 95 | self._deque.clear() 96 | 97 | 98 | class MeterBuffer(defaultdict): 99 | """Computes and stores the average and current value""" 100 | 101 | def __init__(self, window_size=20): 102 | factory = functools.partial(AverageMeter, window_size=window_size) 103 | super().__init__(factory) 104 | 105 | def reset(self): 106 | for v in self.values(): 107 | v.reset() 108 | 109 | def get_filtered_meter(self, filter_key="time"): 110 | return {k: v for k, v in self.items() if filter_key in k} 111 | 112 | def update(self, values=None, **kwargs): 113 | if values is None: 114 | values = {} 115 | values.update(kwargs) 116 | for k, v in values.items(): 117 | if isinstance(v, torch.Tensor): 118 | v = v.detach() 119 | self[k].update(v) 120 | 121 | def clear_meters(self): 122 | for v in self.values(): 123 | v.clear() 124 | -------------------------------------------------------------------------------- /yolox/layers/csrc/cocoeval/cocoeval.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved
2 | #pragma once
3 |
4 | #include <pybind11/numpy.h>
5 | #include <pybind11/pybind11.h>
6 | #include <pybind11/stl.h>
7 | #include <pybind11/stl_bind.h>
8 | #include <vector>
9 |
10 | namespace py = pybind11;
11 |
12 | namespace COCOeval {
13 |
14 | // Annotation data for a single object instance in an image
15 | struct InstanceAnnotation {
16 |   InstanceAnnotation(
17 |       uint64_t id,
18 |       double score,
19 |       double area,
20 |       bool is_crowd,
21 |       bool ignore)
22 |       : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
23 |   uint64_t id;
24 |   double score = 0.;
25 |   double area = 0.;
26 |   bool is_crowd = false;
27 |   bool ignore = false;
28 | };
29 |
30 | // Stores intermediate results for evaluating detection results for a single
31 | // image that has D detected instances and G ground truth instances. This stores
32 | // matches between detected and ground truth instances
33 | struct ImageEvaluation {
34 |   // For each of the D detected instances, the id of the matched ground truth
35 |   // instance, or 0 if unmatched
36 |   std::vector<uint64_t> detection_matches;
37 |
38 |   // The detection score of each of the D detected instances
39 |   std::vector<double> detection_scores;
40 |
41 |   // Marks whether or not each of G instances was ignored from evaluation (e.g.,
42 |   // because it's outside area_range)
43 |   std::vector<bool> ground_truth_ignores;
44 |
45 |   // Marks whether or not each of D instances was ignored from evaluation (e.g.,
46 |   // because it's outside aRng)
47 |   std::vector<bool> detection_ignores;
48 | };
49 |
50 | template <class T>
51 | using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
52 |
53 | // C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each
54 | // combination of image, category, area range settings, and IOU thresholds to
55 | // evaluate, it matches detected instances to ground truth instances and stores
56 | // the results into a vector of ImageEvaluation results, which will be
57 | // interpreted by the COCOeval::Accumulate() function to produce precision-recall
58 | // curves. The parameters of nested vectors have the following semantics:
59 | //   image_category_ious[i][c][d][g] is the intersection over union of the d'th
60 | //     detected instance and g'th ground truth instance of
61 | //     category category_ids[c] in image image_ids[i]
62 | //   image_category_ground_truth_instances[i][c] is a vector of ground truth
63 | //     instances in image image_ids[i] of category category_ids[c]
64 | //   image_category_detection_instances[i][c] is a vector of detected
65 | //     instances in image image_ids[i] of category category_ids[c]
66 | std::vector<ImageEvaluation> EvaluateImages(
67 |     const std::vector<std::array<double, 2>>& area_ranges, // vector of 2-tuples
68 |     int max_detections,
69 |     const std::vector<double>& iou_thresholds,
70 |     const ImageCategoryInstances<std::vector<double>>& image_category_ious,
71 |     const ImageCategoryInstances<InstanceAnnotation>&
72 |         image_category_ground_truth_instances,
73 |     const ImageCategoryInstances<InstanceAnnotation>&
74 |         image_category_detection_instances);
75 |
76 | // C++ implementation of COCOeval.accumulate(), which generates precision
77 | // recall curves for each set of category, IOU threshold, detection area range,
78 | // and max number of detections parameters.
It is assumed that the parameter
79 | // evaluations is the return value of the function COCOeval::EvaluateImages(),
80 | // which was called with the same parameter settings params
81 | py::dict Accumulate(
82 |     const py::object& params,
83 |     const std::vector<ImageEvaluation>& evaluations);
84 |
85 | } // namespace COCOeval
86 |
--------------------------------------------------------------------------------
/yolox/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
4 |
5 | import torch
6 | import torch.nn as nn
7 | from thop import profile
8 |
9 | from copy import deepcopy
10 |
11 | __all__ = [
12 |     "fuse_conv_and_bn",
13 |     "fuse_model",
14 |     "get_model_info",
15 |     "replace_module",
16 | ]
17 |
18 |
19 | def get_model_info(model, tsize):
20 |
21 |     stride = 64
22 |     img = torch.zeros((2, 3, stride, stride), device=next(model.parameters()).device)
23 |     flops, params = profile(deepcopy(model), inputs=(img.split(1,dim=0),), verbose=False)
24 |     params /= 1e6
25 |     flops /= 1e9
26 |     flops *= tsize[0] * tsize[1] / stride / stride * 2  # Gflops
27 |     info = "Params: {:.2f}M, Gflops: {:.2f}".format(params, flops)
28 |     return info
29 |
30 |
31 | def fuse_conv_and_bn(conv, bn):
32 |     # Fuse convolution and batchnorm layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/
33 |     fusedconv = (
34 |         nn.Conv2d(
35 |             conv.in_channels,
36 |             conv.out_channels,
37 |             kernel_size=conv.kernel_size,
38 |             stride=conv.stride,
39 |             padding=conv.padding,
40 |             groups=conv.groups,
41 |             bias=True,
42 |         )
43 |         .requires_grad_(False)
44 |         .to(conv.weight.device)
45 |     )
46 |
47 |     # prepare filters
48 |     w_conv = conv.weight.clone().view(conv.out_channels, -1)
49 |     w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
50 |     fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))
51 |
52 |     # prepare spatial bias
53 |     b_conv = (
54 |         torch.zeros(conv.weight.size(0), device=conv.weight.device)
55 |         if conv.bias is None
56 |         else conv.bias
57 |     )
58 |     b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(
59 |         torch.sqrt(bn.running_var + bn.eps)
60 |     )
61 |     fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
62 |
63 |     return fusedconv
64 |
65 |
66 | def fuse_model(model):
67 |     from yolox.models.network_blocks import BaseConv
68 |
69 |     for m in model.modules():
70 |         if type(m) is BaseConv and hasattr(m, "bn"):
71 |             m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
72 |             delattr(m, "bn")  # remove batchnorm
73 |             m.forward = m.fuseforward  # update forward
74 |     return model
75 |
76 |
77 | def replace_module(module, replaced_module_type, new_module_type, replace_func=None):
78 |     """
79 |     Replace given type in module to a new type. Mostly used in deploy.
80 |
81 |     Args:
82 |         module (nn.Module): model to apply replace operation.
83 |         replaced_module_type (Type): module type to be replaced.
84 |         new_module_type (Type): module type used as the replacement.
85 |         replace_func (function): python function to describe replace logic. Default value: None.
86 |
87 |     Returns:
88 |         model (nn.Module): module with the replacement already applied.
89 | """ 90 | 91 | def default_replace_func(replaced_module_type, new_module_type): 92 | return new_module_type() 93 | 94 | if replace_func is None: 95 | replace_func = default_replace_func 96 | 97 | model = module 98 | if isinstance(module, replaced_module_type): 99 | model = replace_func(replaced_module_type, new_module_type) 100 | else: # recurrsively replace 101 | for name, child in module.named_children(): 102 | new_child = replace_module(child, replaced_module_type, new_module_type) 103 | if new_child is not child: # child is already replaced 104 | model.add_module(name, new_child) 105 | 106 | return model 107 | -------------------------------------------------------------------------------- /tools/mota.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | import numpy as np 3 | np.float = float 4 | np.int = int 5 | np.object = object 6 | np.bool = bool 7 | import torch 8 | import torch.backends.cudnn as cudnn 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | import sys 11 | import os 12 | 13 | prj_path = os.path.join(os.path.dirname(__file__), '..') 14 | if prj_path not in sys.path: 15 | sys.path.append(prj_path) 16 | 17 | from yolox.core import launch 18 | from yolox.exp import get_exp 19 | from yolox.utils import configure_nccl, fuse_model, get_local_rank, get_model_info, setup_logger 20 | 21 | import argparse 22 | import os 23 | import random 24 | import warnings 25 | import glob 26 | import motmetrics as mm 27 | from collections import OrderedDict 28 | from pathlib import Path 29 | 30 | 31 | def compare_dataframes(gts, ts): 32 | accs = [] 33 | names = [] 34 | for k, tsacc in ts.items(): 35 | if k in gts: 36 | logger.info('Comparing {}...'.format(k)) 37 | accs.append(mm.utils.compare_to_groundtruth(gts[k], tsacc, 'iou', distth=0.5)) 38 | names.append(k) 39 | else: 40 | logger.warning('No ground truth for {}, skipping.'.format(k)) 41 | 42 | return accs, names 43 | 44 | 45 | # evaluate MOTA 46 | 47 | results_folder = 'DiffusionTrack_outputs/yolox_x_diffusion_track_mot17_ablation/track_results_mot17_ablation_1_500' 48 | mm.lap.default_solver = 'lap' 49 | 50 | gt_type = '_val_half' 51 | #gt_type = '' 52 | print('gt_type', gt_type) 53 | gtfiles = glob.glob( 54 | os.path.join('datasets/mot/train', '*/gt/gt{}.txt'.format(gt_type))) 55 | print('gt_files', gtfiles) 56 | tsfiles = [f for f in glob.glob(os.path.join(results_folder, '*.txt')) if not os.path.basename(f).startswith('eval')] 57 | 58 | logger.info('Found {} groundtruths and {} test files.'.format(len(gtfiles), len(tsfiles))) 59 | logger.info('Available LAP solvers {}'.format(mm.lap.available_solvers)) 60 | logger.info('Default LAP solver \'{}\''.format(mm.lap.default_solver)) 61 | logger.info('Loading files.') 62 | 63 | gt = OrderedDict([(Path(f).parts[-3], mm.io.loadtxt(f, fmt='mot15-2D', min_confidence=1)) for f in gtfiles]) 64 | ts = OrderedDict([(os.path.splitext(Path(f).parts[-1])[0], mm.io.loadtxt(f, fmt='mot15-2D', min_confidence=-1.0)) for f in tsfiles]) 65 | 66 | mh = mm.metrics.create() 67 | accs, names = compare_dataframes(gt, ts) 68 | 69 | logger.info('Running metrics') 70 | metrics = ['recall', 'precision', 'num_unique_objects', 'mostly_tracked', 71 | 'partially_tracked', 'mostly_lost', 'num_false_positives', 'num_misses', 72 | 'num_switches', 'num_fragmentations', 'mota', 'motp', 'num_objects'] 73 | summary = mh.compute_many(accs, names=names, metrics=metrics, generate_overall=True) 74 | # summary = mh.compute_many(accs, 
names=names, metrics=mm.metrics.motchallenge_metrics, generate_overall=True) 75 | # print(mm.io.render_summary( 76 | # summary, formatters=mh.formatters, 77 | # namemap=mm.io.motchallenge_metric_names)) 78 | div_dict = { 79 | 'num_objects': ['num_false_positives', 'num_misses', 'num_switches', 'num_fragmentations'], 80 | 'num_unique_objects': ['mostly_tracked', 'partially_tracked', 'mostly_lost']} 81 | for divisor in div_dict: 82 | for divided in div_dict[divisor]: 83 | summary[divided] = (summary[divided] / summary[divisor]) 84 | fmt = mh.formatters 85 | change_fmt_list = ['num_false_positives', 'num_misses', 'num_switches', 'num_fragmentations', 'mostly_tracked', 86 | 'partially_tracked', 'mostly_lost'] 87 | for k in change_fmt_list: 88 | fmt[k] = fmt['mota'] 89 | print(mm.io.render_summary(summary, formatters=fmt, namemap=mm.io.motchallenge_metric_names)) 90 | 91 | metrics = mm.metrics.motchallenge_metrics + ['num_objects'] 92 | summary = mh.compute_many(accs, names=names, metrics=metrics, generate_overall=True) 93 | print(mm.io.render_summary(summary, formatters=mh.formatters, namemap=mm.io.motchallenge_metric_names)) 94 | logger.info('Completed') 95 | -------------------------------------------------------------------------------- /yolox/models/yolo_pafpn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .darknet import CSPDarknet 9 | from .network_blocks import BaseConv, CSPLayer, DWConv 10 | 11 | 12 | class YOLOPAFPN(nn.Module): 13 | """ 14 | YOLOv3 model. Darknet 53 is the default backbone of this model. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | depth=1.0, 20 | width=1.0, 21 | in_features=("dark3", "dark4", "dark5"), 22 | in_channels=[256, 512, 1024], 23 | depthwise=False, 24 | act="silu", 25 | ): 26 | super().__init__() 27 | self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) 28 | self.in_features = in_features 29 | self.in_channels = in_channels 30 | Conv = DWConv if depthwise else BaseConv 31 | 32 | self.upsample = nn.Upsample(scale_factor=2, mode="nearest") 33 | self.lateral_conv0 = BaseConv( 34 | int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act 35 | ) 36 | self.C3_p4 = CSPLayer( 37 | int(2 * in_channels[1] * width), 38 | int(in_channels[1] * width), 39 | round(3 * depth), 40 | False, 41 | depthwise=depthwise, 42 | act=act, 43 | ) # cat 44 | 45 | self.reduce_conv1 = BaseConv( 46 | int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act 47 | ) 48 | self.C3_p3 = CSPLayer( 49 | int(2 * in_channels[0] * width), 50 | int(in_channels[0] * width), 51 | round(3 * depth), 52 | False, 53 | depthwise=depthwise, 54 | act=act, 55 | ) 56 | 57 | # bottom-up conv 58 | self.bu_conv2 = Conv( 59 | int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act 60 | ) 61 | self.C3_n3 = CSPLayer( 62 | int(2 * in_channels[0] * width), 63 | int(in_channels[1] * width), 64 | round(3 * depth), 65 | False, 66 | depthwise=depthwise, 67 | act=act, 68 | ) 69 | 70 | # bottom-up conv 71 | self.bu_conv1 = Conv( 72 | int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act 73 | ) 74 | self.C3_n4 = CSPLayer( 75 | int(2 * in_channels[1] * width), 76 | int(in_channels[2] * width), 77 | round(3 * depth), 78 | False, 79 | depthwise=depthwise, 80 | act=act, 81 | ) 82 | 83 | 84 | def forward(self, input): 85 | """ 86 | 
Args: 87 | inputs: input images. 88 | 89 | Returns: 90 | Tuple[Tensor]: FPN feature. 91 | """ 92 | 93 | # backbone 94 | out_features = self.backbone(input) 95 | features = [out_features[f] for f in self.in_features] 96 | [x2, x1, x0] = features 97 | 98 | fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 99 | f_out0 = self.upsample(fpn_out0) # 512/16 100 | f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 101 | f_out0 = self.C3_p4(f_out0) # 1024->512/16 102 | 103 | fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 104 | f_out1 = self.upsample(fpn_out1) # 256/8 105 | f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 106 | pan_out2 = self.C3_p3(f_out1) # 512->256/8 107 | 108 | p_out1 = self.bu_conv2(pan_out2) # 256->256/16 109 | p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 110 | pan_out1 = self.C3_n3(p_out1) # 512->512/16 111 | 112 | p_out0 = self.bu_conv1(pan_out1) # 512->512/32 113 | p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 114 | pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 115 | 116 | outputs = (pan_out2, pan_out1, pan_out0) 117 | return outputs 118 | -------------------------------------------------------------------------------- /yolox/tracking_utils/io.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | import numpy as np 4 | 5 | 6 | def write_results(filename, results_dict: Dict, data_type: str): 7 | if not filename: 8 | return 9 | path = os.path.dirname(filename) 10 | if not os.path.exists(path): 11 | os.makedirs(path) 12 | 13 | if data_type in ('mot', 'mcmot', 'lab'): 14 | save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' 15 | elif data_type == 'kitti': 16 | save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' 17 | else: 18 | raise ValueError(data_type) 19 | 20 | with open(filename, 'w') as f: 21 | for frame_id, frame_data in results_dict.items(): 22 | if data_type == 'kitti': 23 | frame_id -= 1 24 | for tlwh, track_id in frame_data: 25 | if track_id < 0: 26 | continue 27 | x1, y1, w, h = tlwh 28 | x2, y2 = x1 + w, y1 + h 29 | line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) 30 | f.write(line) 31 | 32 | 33 | def read_results(filename, data_type: str, is_gt=False, is_ignore=False): 34 | if data_type in ('mot', 'lab'): 35 | read_fun = read_mot_results 36 | else: 37 | raise ValueError('Unknown data type: {}'.format(data_type)) 38 | 39 | return read_fun(filename, is_gt, is_ignore) 40 | 41 | 42 | """ 43 | labels={'ped', ... % 1 44 | 'person_on_vhcl', ... % 2 45 | 'car', ... % 3 46 | 'bicycle', ... % 4 47 | 'mbike', ... % 5 48 | 'non_mot_vhcl', ... % 6 49 | 'static_person', ... % 7 50 | 'distractor', ... % 8 51 | 'occluder', ... % 9 52 | 'occluder_on_grnd', ... %10 53 | 'occluder_full', ... % 11 54 | 'reflection', ... % 12 55 | 'crowd' ... 
% 13 56 | }; 57 | """ 58 | 59 | 60 | def read_mot_results(filename, is_gt, is_ignore): 61 | valid_labels = {1} 62 | ignore_labels = {2, 7, 8, 12} 63 | results_dict = dict() 64 | if os.path.isfile(filename): 65 | with open(filename, 'r') as f: 66 | for line in f.readlines(): 67 | linelist = line.split(',') 68 | if len(linelist) < 7: 69 | continue 70 | fid = int(linelist[0]) 71 | if fid < 1: 72 | continue 73 | results_dict.setdefault(fid, list()) 74 | 75 | box_size = float(linelist[4]) * float(linelist[5]) 76 | 77 | if is_gt: 78 | if 'MOT16-' in filename or 'MOT17-' in filename: 79 | label = int(float(linelist[7])) 80 | mark = int(float(linelist[6])) 81 | if mark == 0 or label not in valid_labels: 82 | continue 83 | score = 1 84 | elif is_ignore: 85 | if 'MOT16-' in filename or 'MOT17-' in filename: 86 | label = int(float(linelist[7])) 87 | vis_ratio = float(linelist[8]) 88 | if label not in ignore_labels and vis_ratio >= 0: 89 | continue 90 | else: 91 | continue 92 | score = 1 93 | else: 94 | score = float(linelist[6]) 95 | 96 | #if box_size > 7000: 97 | #if box_size <= 7000 or box_size >= 15000: 98 | #if box_size < 15000: 99 | #continue 100 | 101 | tlwh = tuple(map(float, linelist[2:6])) 102 | target_id = int(linelist[1]) 103 | 104 | results_dict[fid].append((tlwh, target_id, score)) 105 | 106 | return results_dict 107 | 108 | 109 | def unzip_objs(objs): 110 | if len(objs) > 0: 111 | tlwhs, ids, scores = zip(*objs) 112 | else: 113 | tlwhs, ids, scores = [], [], [] 114 | tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) 115 | 116 | return tlwhs, ids, scores -------------------------------------------------------------------------------- /tools/train.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | import numpy as np 3 | np.float = float 4 | np.int = int 5 | np.object = object 6 | np.bool = bool 7 | import torch 8 | import torch.backends.cudnn as cudnn 9 | import os 10 | # os.environ["CUDA_VISIBLE_DEVICES"]="2,3,4,5,6,7" 11 | import sys 12 | prj_path = os.path.join(os.path.dirname(__file__), '..') 13 | 14 | if prj_path not in sys.path: 15 | sys.path.append(prj_path) 16 | from yolox.core import Trainer, launch 17 | from yolox.exp import get_exp 18 | 19 | import argparse 20 | import random 21 | import warnings 22 | 23 | 24 | def make_parser(): 25 | parser = argparse.ArgumentParser("YOLOX train parser") 26 | parser.add_argument("-expn", "--experiment-name", type=str, default=None) 27 | parser.add_argument("-n", "--name", type=str, default=None, help="model name") 28 | 29 | # distributed 30 | parser.add_argument( 31 | "--dist-backend", default="nccl", type=str, help="distributed backend" 32 | ) 33 | parser.add_argument( 34 | "--dist-url", 35 | default=None, 36 | type=str, 37 | help="url used to set up distributed training", 38 | ) 39 | parser.add_argument("-b", "--batch-size", type=int, default=2*8, help="batch size") 40 | parser.add_argument( 41 | "-d", "--devices", default=8, type=int, help="device for training" 42 | ) 43 | parser.add_argument( 44 | "--local_rank", default=0, type=int, help="local rank for dist training" 45 | ) 46 | parser.add_argument( 47 | "-f", 48 | "--exp_file", 49 | default="exps/example/mot/yolox_x_diffusion_track_dancetrack_baseline.py", 50 | type=str, 51 | help="plz input your expriment description file", 52 | ) 53 | parser.add_argument( 54 | "--resume", default=False, action="store_true", help="resume training" 55 | ) 56 | parser.add_argument("-c", "--ckpt", 
default="diffusion_dancetrack_det.pth.tar", type=str, help="checkpoint file") 57 | parser.add_argument( 58 | "-e", 59 | "--start_epoch", 60 | default=None, 61 | type=int, 62 | help="resume training start epoch", 63 | ) 64 | parser.add_argument( 65 | "--num_machines", default=1, type=int, help="num of node for training" 66 | ) 67 | parser.add_argument( 68 | "--machine_rank", default=0, type=int, help="node rank for multi-node training" 69 | ) 70 | parser.add_argument( 71 | "--fp16", 72 | dest="fp16", 73 | default=False, 74 | action="store_true", 75 | help="Adopting mix precision training.", 76 | ) 77 | parser.add_argument( 78 | "-o", 79 | "--occupy", 80 | dest="occupy", 81 | default=False, 82 | action="store_true", 83 | help="occupy GPU memory first for training.", 84 | ) 85 | parser.add_argument( 86 | "opts", 87 | help="Modify config options using the command-line", 88 | default=None, 89 | nargs=argparse.REMAINDER, 90 | ) 91 | return parser 92 | 93 | 94 | @logger.catch 95 | def main(exp, args): 96 | if exp.seed is not None: 97 | random.seed(exp.seed) 98 | torch.manual_seed(exp.seed) 99 | cudnn.deterministic = True 100 | warnings.warn( 101 | "You have chosen to seed training. This will turn on the CUDNN deterministic setting, " 102 | "which can slow down your training considerably! You may see unexpected behavior " 103 | "when restarting from checkpoints." 104 | ) 105 | 106 | # set environment variables for distributed training 107 | cudnn.benchmark = True 108 | 109 | trainer = Trainer(exp, args) 110 | trainer.train() 111 | 112 | 113 | if __name__ == "__main__": 114 | args = make_parser().parse_args() 115 | # args.exp_file=f 116 | # args.ckpt=c 117 | exp = get_exp(args.exp_file, args.name) 118 | exp.merge(args.opts) 119 | 120 | if not args.experiment_name: 121 | args.experiment_name = exp.exp_name 122 | 123 | num_gpu = torch.cuda.device_count() if args.devices is None else args.devices 124 | assert num_gpu <= torch.cuda.device_count() 125 | 126 | launch( 127 | main, 128 | num_gpu, 129 | args.num_machines, 130 | args.machine_rank, 131 | backend=args.dist_backend, 132 | dist_url=args.dist_url, 133 | args=(exp, args), 134 | ) 135 | -------------------------------------------------------------------------------- /yolox/tracking_utils/evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import copy 4 | import motmetrics as mm 5 | mm.lap.default_solver = 'lap' 6 | 7 | from yolox.tracking_utils.io import read_results, unzip_objs 8 | 9 | 10 | class Evaluator(object): 11 | 12 | def __init__(self, data_root, seq_name, data_type): 13 | self.data_root = data_root 14 | self.seq_name = seq_name 15 | self.data_type = data_type 16 | 17 | self.load_annotations() 18 | self.reset_accumulator() 19 | 20 | def load_annotations(self): 21 | assert self.data_type == 'mot' 22 | 23 | gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') 24 | self.gt_frame_dict = read_results(gt_filename, self.data_type, is_gt=True) 25 | self.gt_ignore_frame_dict = read_results(gt_filename, self.data_type, is_ignore=True) 26 | 27 | def reset_accumulator(self): 28 | self.acc = mm.MOTAccumulator(auto_id=True) 29 | 30 | def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): 31 | # results 32 | trk_tlwhs = np.copy(trk_tlwhs) 33 | trk_ids = np.copy(trk_ids) 34 | 35 | # gts 36 | gt_objs = self.gt_frame_dict.get(frame_id, []) 37 | gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] 38 | 39 | # ignore boxes 40 | ignore_objs = 
self.gt_ignore_frame_dict.get(frame_id, []) 41 | ignore_tlwhs = unzip_objs(ignore_objs)[0] 42 | 43 | # remove ignored results 44 | keep = np.ones(len(trk_tlwhs), dtype=bool) 45 | iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5) 46 | if len(iou_distance) > 0: 47 | match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) 48 | match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) 49 | match_ious = iou_distance[match_is, match_js] 50 | 51 | match_js = np.asarray(match_js, dtype=int) 52 | match_js = match_js[np.logical_not(np.isnan(match_ious))] 53 | keep[match_js] = False 54 | trk_tlwhs = trk_tlwhs[keep] 55 | trk_ids = trk_ids[keep] 56 | #match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) 57 | #match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) 58 | #match_ious = iou_distance[match_is, match_js] 59 | 60 | #match_js = np.asarray(match_js, dtype=int) 61 | #match_js = match_js[np.logical_not(np.isnan(match_ious))] 62 | #keep[match_js] = False 63 | #trk_tlwhs = trk_tlwhs[keep] 64 | #trk_ids = trk_ids[keep] 65 | 66 | # get distance matrix 67 | iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) 68 | 69 | # acc 70 | self.acc.update(gt_ids, trk_ids, iou_distance) 71 | 72 | if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): 73 | events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics 74 | else: 75 | events = None 76 | return events 77 | 78 | def eval_file(self, filename): 79 | self.reset_accumulator() 80 | 81 | result_frame_dict = read_results(filename, self.data_type, is_gt=False) 82 | #frames = sorted(list(set(self.gt_frame_dict.keys()) | set(result_frame_dict.keys()))) 83 | frames = sorted(list(set(result_frame_dict.keys()))) 84 | for frame_id in frames: 85 | trk_objs = result_frame_dict.get(frame_id, []) 86 | trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] 87 | self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) 88 | 89 | return self.acc 90 | 91 | @staticmethod 92 | def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): 93 | names = copy.deepcopy(names) 94 | if metrics is None: 95 | metrics = mm.metrics.motchallenge_metrics 96 | metrics = copy.deepcopy(metrics) 97 | 98 | mh = mm.metrics.create() 99 | summary = mh.compute_many( 100 | accs, 101 | metrics=metrics, 102 | names=names, 103 | generate_overall=True 104 | ) 105 | 106 | return summary 107 | 108 | @staticmethod 109 | def save_summary(summary, filename): 110 | import pandas as pd 111 | writer = pd.ExcelWriter(filename) 112 | summary.to_excel(writer) 113 | writer.save() -------------------------------------------------------------------------------- /tools/mix_data_test_mot17.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | """ 6 | cd datasets 7 | mkdir -p mix_det/annotations 8 | cp mot/annotations/val_half.json mix_det/annotations/val_half.json 9 | cp mot/annotations/test.json mix_det/annotations/test.json 10 | cd mix_det 11 | ln -s ../mot/train mot_train 12 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 13 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 14 | ln -s ../Cityscapes cp_train 15 | ln -s ../ETHZ ethz_train 16 | cd .. 
17 | """ 18 | 19 | mot_json = json.load(open('datasets/mot/annotations/train.json','r')) 20 | 21 | img_list = list() 22 | for img in mot_json['images']: 23 | img['file_name'] = 'mot_train/' + img['file_name'] 24 | img_list.append(img) 25 | 26 | ann_list = list() 27 | for ann in mot_json['annotations']: 28 | ann_list.append(ann) 29 | 30 | video_list = mot_json['videos'] 31 | category_list = mot_json['categories'] 32 | 33 | 34 | print('mot17') 35 | 36 | max_img = 10000 37 | max_ann = 2000000 38 | max_video = 10 39 | 40 | crowdhuman_json = json.load(open('datasets/crowdhuman/annotations/train.json','r')) 41 | img_id_count = 0 42 | for img in crowdhuman_json['images']: 43 | img_id_count += 1 44 | img['file_name'] = 'crowdhuman_train/Images/' + img['file_name'] 45 | img['frame_id'] = img_id_count 46 | img['prev_image_id'] = img['id'] + max_img 47 | img['next_image_id'] = img['id'] + max_img 48 | img['id'] = img['id'] + max_img 49 | img['video_id'] = max_video 50 | img_list.append(img) 51 | 52 | for ann in crowdhuman_json['annotations']: 53 | ann['id'] = ann['id'] + max_ann 54 | ann['image_id'] = ann['image_id'] + max_img 55 | ann_list.append(ann) 56 | 57 | print('crowdhuman_train') 58 | 59 | video_list.append({ 60 | 'id': max_video, 61 | 'file_name': 'crowdhuman_train' 62 | }) 63 | 64 | 65 | max_img = 30000 66 | max_ann = 10000000 67 | 68 | crowdhuman_val_json = json.load(open('datasets/crowdhuman/annotations/val.json','r')) 69 | img_id_count = 0 70 | for img in crowdhuman_val_json['images']: 71 | img_id_count += 1 72 | img['file_name'] = 'crowdhuman_val/Images/' + img['file_name'] 73 | img['frame_id'] = img_id_count 74 | img['prev_image_id'] = img['id'] + max_img 75 | img['next_image_id'] = img['id'] + max_img 76 | img['id'] = img['id'] + max_img 77 | img['video_id'] = max_video 78 | img_list.append(img) 79 | 80 | for ann in crowdhuman_val_json['annotations']: 81 | ann['id'] = ann['id'] + max_ann 82 | ann['image_id'] = ann['image_id'] + max_img 83 | ann_list.append(ann) 84 | 85 | print('crowdhuman_val') 86 | 87 | video_list.append({ 88 | 'id': max_video, 89 | 'file_name': 'crowdhuman_val' 90 | }) 91 | 92 | max_img = 40000 93 | max_ann = 20000000 94 | 95 | ethz_json = json.load(open('datasets/ETHZ/annotations/train.json','r')) 96 | img_id_count = 0 97 | for img in ethz_json['images']: 98 | img_id_count += 1 99 | img['file_name'] = 'ethz_train/' + img['file_name'][5:] 100 | img['frame_id'] = img_id_count 101 | img['prev_image_id'] = img['id'] + max_img 102 | img['next_image_id'] = img['id'] + max_img 103 | img['id'] = img['id'] + max_img 104 | img['video_id'] = max_video 105 | img_list.append(img) 106 | 107 | for ann in ethz_json['annotations']: 108 | ann['id'] = ann['id'] + max_ann 109 | ann['image_id'] = ann['image_id'] + max_img 110 | ann_list.append(ann) 111 | 112 | print('ETHZ') 113 | 114 | video_list.append({ 115 | 'id': max_video, 116 | 'file_name': 'ethz' 117 | }) 118 | 119 | max_img = 50000 120 | max_ann = 25000000 121 | 122 | cp_json = json.load(open('datasets/Cityscapes/annotations/train.json','r')) 123 | img_id_count = 0 124 | for img in cp_json['images']: 125 | img_id_count += 1 126 | img['file_name'] = 'cp_train/' + img['file_name'][11:] 127 | img['frame_id'] = img_id_count 128 | img['prev_image_id'] = img['id'] + max_img 129 | img['next_image_id'] = img['id'] + max_img 130 | img['id'] = img['id'] + max_img 131 | img['video_id'] = max_video 132 | img_list.append(img) 133 | 134 | for ann in cp_json['annotations']: 135 | ann['id'] = ann['id'] + max_ann 136 | ann['image_id'] = 
ann['image_id'] + max_img 137 | ann_list.append(ann) 138 | 139 | print('Cityscapes') 140 | 141 | video_list.append({ 142 | 'id': max_video, 143 | 'file_name': 'cityperson' 144 | }) 145 | 146 | mix_json = dict() 147 | mix_json['images'] = img_list 148 | mix_json['annotations'] = ann_list 149 | mix_json['videos'] = video_list 150 | mix_json['categories'] = category_list 151 | json.dump(mix_json, open('datasets/mix_det/annotations/train.json','w')) 152 | -------------------------------------------------------------------------------- /yolox/data/datasets/datasets_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | from torch.utils.data.dataset import ConcatDataset as torchConcatDataset 6 | from torch.utils.data.dataset import Dataset as torchDataset 7 | 8 | import bisect 9 | from functools import wraps 10 | 11 | 12 | class ConcatDataset(torchConcatDataset): 13 | def __init__(self, datasets): 14 | super(ConcatDataset, self).__init__(datasets) 15 | if hasattr(self.datasets[0], "input_dim"): 16 | self._input_dim = self.datasets[0].input_dim 17 | self.input_dim = self.datasets[0].input_dim 18 | 19 | def pull_item(self, idx): 20 | if idx < 0: 21 | if -idx > len(self): 22 | raise ValueError( 23 | "absolute value of index should not exceed dataset length" 24 | ) 25 | idx = len(self) + idx 26 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 27 | if dataset_idx == 0: 28 | sample_idx = idx 29 | else: 30 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 31 | return self.datasets[dataset_idx].pull_item(sample_idx) 32 | 33 | 34 | class MixConcatDataset(torchConcatDataset): 35 | def __init__(self, datasets): 36 | super(MixConcatDataset, self).__init__(datasets) 37 | if hasattr(self.datasets[0], "input_dim"): 38 | self._input_dim = self.datasets[0].input_dim 39 | self.input_dim = self.datasets[0].input_dim 40 | 41 | def __getitem__(self, index): 42 | 43 | if not isinstance(index, int): 44 | idx = index[1] 45 | if idx < 0: 46 | if -idx > len(self): 47 | raise ValueError( 48 | "absolute value of index should not exceed dataset length" 49 | ) 50 | idx = len(self) + idx 51 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 52 | if dataset_idx == 0: 53 | sample_idx = idx 54 | else: 55 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 56 | if not isinstance(index, int): 57 | index = (index[0], sample_idx, index[2]) 58 | 59 | return self.datasets[dataset_idx][index] 60 | 61 | 62 | class Dataset(torchDataset): 63 | """ This class is a subclass of the base :class:`torch.utils.data.Dataset`, 64 | that enables on the fly resizing of the ``input_dim``. 65 | 66 | Args: 67 | input_dimension (tuple): (width,height) tuple with default dimensions of the network 68 | """ 69 | 70 | def __init__(self, input_dimension, mosaic=True): 71 | super().__init__() 72 | self.__input_dim = input_dimension[:2] 73 | self.enable_mosaic = mosaic 74 | 75 | @property 76 | def input_dim(self): 77 | """ 78 | Dimension that can be used by transforms to set the correct image size, etc. 79 | This allows transforms to have a single source of truth 80 | for the input dimension of the network. 
81 | 82 | Return: 83 | list: Tuple containing the current width,height 84 | """ 85 | if hasattr(self, "_input_dim"): 86 | return self._input_dim 87 | return self.__input_dim 88 | 89 | @staticmethod 90 | def resize_getitem(getitem_fn): 91 | """ 92 | Decorator method that needs to be used around the ``__getitem__`` method. |br| 93 | This decorator enables the on the fly resizing of 94 | the ``input_dim`` with our :class:`~lightnet.data.DataLoader` class. 95 | 96 | Example: 97 | >>> class CustomSet(ln.data.Dataset): 98 | ... def __len__(self): 99 | ... return 10 100 | ... @ln.data.Dataset.resize_getitem 101 | ... def __getitem__(self, index): 102 | ... # Should return (image, anno) but here we return input_dim 103 | ... return self.input_dim 104 | >>> data = CustomSet((200,200)) 105 | >>> data[0] 106 | (200, 200) 107 | >>> data[(480,320), 0] 108 | (480, 320) 109 | """ 110 | 111 | @wraps(getitem_fn) 112 | def wrapper(self, index): 113 | if not isinstance(index, int): 114 | has_dim = True 115 | self._input_dim = index[0] 116 | self.enable_mosaic = index[2] 117 | index = index[1] 118 | else: 119 | has_dim = False 120 | 121 | ret_val = getitem_fn(self, index) 122 | 123 | if has_dim: 124 | del self._input_dim 125 | 126 | return ret_val 127 | 128 | return wrapper 129 | -------------------------------------------------------------------------------- /yolox/utils/cluster_nms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | @torch.jit.script 4 | def intersect(box_a, box_b): 5 | """ We resize both tensors to [A,B,2] without new malloc: 6 | [A,2] -> [A,1,2] -> [A,B,2] 7 | [B,2] -> [1,B,2] -> [A,B,2] 8 | Then we compute the area of intersect between box_a and box_b. 9 | Args: 10 | box_a: (tensor) bounding boxes, Shape: [n,A,4]. 11 | box_b: (tensor) bounding boxes, Shape: [n,B,4]. 12 | Return: 13 | (tensor) intersection area, Shape: [n,A,B]. 14 | """ 15 | n = box_a.size(0) 16 | A = box_a.size(1) 17 | B = box_b.size(1) 18 | max_xy = torch.min(box_a[:, :, 2:].unsqueeze(2).expand(n, A, B, 2), 19 | box_b[:, :, 2:].unsqueeze(1).expand(n, A, B, 2)) 20 | min_xy = torch.max(box_a[:, :, :2].unsqueeze(2).expand(n, A, B, 2), 21 | box_b[:, :, :2].unsqueeze(1).expand(n, A, B, 2)) 22 | return torch.clamp(max_xy - min_xy, min=0).prod(3) # inter 23 | 24 | @torch.jit.script 25 | def garea(box_a, box_b): 26 | """ We resize both tensors to [A,B,2] without new malloc: 27 | [A,2] -> [A,1,2] -> [A,B,2] 28 | [B,2] -> [1,B,2] -> [A,B,2] 29 | Then we compute the area of intersect between box_a and box_b. 30 | Args: 31 | box_a: (tensor) bounding boxes, Shape: [n,A,4]. 32 | box_b: (tensor) bounding boxes, Shape: [n,B,4]. 33 | Return: 34 | (tensor) intersection area, Shape: [n,A,B]. 35 | """ 36 | n = box_a.size(0) 37 | A = box_a.size(1) 38 | B = box_b.size(1) 39 | max_xy = torch.max(box_a[:, :, 2:].unsqueeze(2).expand(n, A, B, 2), 40 | box_b[:, :, 2:].unsqueeze(1).expand(n, A, B, 2)) 41 | min_xy = torch.min(box_a[:, :, :2].unsqueeze(2).expand(n, A, B, 2), 42 | box_b[:, :, :2].unsqueeze(1).expand(n, A, B, 2)) 43 | return torch.clamp(max_xy - min_xy, min=0).prod(3) # inter 44 | 45 | @torch.jit.script 46 | def get_box_area(box): 47 | return (box[:, :, 2]-box[:, :, 0]) *(box[:, :, 3]-box[:, :, 1]) 48 | 49 | def giou_3d(box_a,box_b,box_c,box_d): 50 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 51 | is simply the intersection over union of two boxes. Here we operate on 52 | ground truth boxes and default boxes. 
two pairings are summed before forming the IoU term and the GIoU penalty. 53 | E.g.: 54 | GIoU = (I_ab + I_cd) / (U_ab + U_cd) - (C_ab + C_cd - U_ab - U_cd) / (C_ab + C_cd) 55 | Args: 56 | box_a, box_b: (tensor) boxes in the first frame, Shape: [A,4] / [B,4] or [n,A,4] / [n,B,4] 57 | box_c, box_d: (tensor) boxes in the second frame, with the same shapes as box_a / box_b 58 | Return: 59 | paired generalized IoU: (tensor) Shape: [A,B] or [n,A,B] 60 | """ 61 | use_batch = True 62 | if box_a.dim() == 2: 63 | use_batch = False 64 | box_a = box_a[None, ...] 65 | box_b = box_b[None, ...] 66 | box_c = box_c[None, ...] 67 | box_d = box_d[None, ...] 68 | 69 | interab = intersect(box_a,box_b) 70 | intercd = intersect(box_c,box_d) 71 | 72 | area_ab= garea(box_a,box_b) 73 | area_cd=garea(box_c,box_d) 74 | 75 | area_a = get_box_area(box_a).unsqueeze(2).expand_as(interab) # [A,B] 76 | area_b = get_box_area(box_b).unsqueeze(1).expand_as(interab) # [A,B] 77 | area_c = get_box_area(box_c).unsqueeze(2).expand_as(intercd) # [A,B] 78 | area_d = get_box_area(box_d).unsqueeze(1).expand_as(intercd) # [A,B] 79 | unionab = area_a + area_b - interab 80 | unioncd = area_c+area_d-intercd 81 | 82 | uiouabcd = (interab+intercd) / (unionab+unioncd) 83 | out=uiouabcd-(area_ab+area_cd-unionab-unioncd)/(area_ab+area_cd) 84 | return out if use_batch else out.squeeze(0) 85 | 86 | def cluster_nms(boxes_a,boxes_c,scores,iou_threshold:float=0.5, top_k:int=500): 87 | # Collapse all the classes into 1 88 | _, idx = scores.sort(0, descending=True) 89 | idx = idx[:top_k] 90 | boxes_a = boxes_a[idx] 91 | boxes_b = boxes_a 92 | boxes_c = boxes_c[idx] 93 | boxes_d = boxes_c 94 | iou = giou_3d(boxes_a,boxes_b,boxes_c,boxes_d).triu_(diagonal=1) 95 | B = iou 96 | for i in range(200): 97 | A=B 98 | maxA,_=torch.max(A, dim=0) 99 | E = (maxA<=iou_threshold).float().unsqueeze(1).expand_as(A) 100 | B=iou.mul(E) 101 | if A.equal(B): 102 | break 103 | idx_out = idx[maxA <= iou_threshold] 104 | return idx_out 105 | 106 | 107 | 108 | # ## test 109 | 110 | # boxes_a=[[100,100,200,200], 111 | # [110,110,210,210], 112 | # [50,50,150,150], 113 | # [100,100,200,200], 114 | # [90,90,190,190],] 115 | 116 | # boxes_c=[[100,100,200,200], 117 | # [110,110,210,210], 118 | # [150,150,250,250], 119 | # [0,0,100,100], 120 | # [10,10,110,110],] 121 | 122 | # scores=[0.91,0.9,0.95,0.9,0.8] 123 | 124 | # boxes_a=torch.tensor(boxes_a,dtype=torch.float) 125 | # boxes_c=torch.tensor(boxes_c,dtype=torch.float) 126 | # scores=torch.tensor(scores,dtype=torch.float) 127 | 128 | 129 | # indix=cluster_nms(boxes_a,boxes_c,scores) 130 | # print(indix) 131 | -------------------------------------------------------------------------------- /tools/convert_bdd100k_to_coco.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import json 4 | import tqdm 5 | import numpy as np 6 | 7 | labels_path = 'datasets/bdd100k/labels' 8 | img_path = 'datasets/bdd100k/images' 9 | # mot_labels_path = '/data/yourname/BDD100K-MOT/GT' 10 | 11 | out_path = 'datasets/bdd100k/annotations/' 12 | 13 | split = ['train'] 14 | categories = [ 15 | {"id": 1, "name": "pedestrian"}, 16 | {"id": 2, "name": "rider"}, 17 | {"id": 3, "name": "car"}, 18 | {"id": 4, "name": "truck"}, 19 | {"id": 5, "name": "bus"}, 20 | {"id": 6, "name": "train"}, 21 | {"id": 7, "name": "motorcycle"}, 22 | {"id": 8, "name": "bicycle"}, 23 | # {"id": 9, "name": "traffic light"}, 24 | # {"id": 10, "name": "traffic sign"}, 25 | ] 26 | 27 | # "traffic light":9, "traffic sign":10 28 | cat = {"pedestrian":1, "rider":2, "car":3, "truck":4, "bus":5, "train":6,
"motorcycle":7, "bicycle":8,} 29 | # 1: pedestrian 30 | # 2: rider 31 | # 3: car 32 | # 4: truck 33 | # 5: bus 34 | # 6: train 35 | # 7: motorcycle 36 | # 8: bicycle 37 | # 9: traffic light --- Don't need tracking 38 | # 10: traffic sign --- Don't need tracking 39 | # For MOT and MOTS, only the first 8 classes are used and evaluated 40 | 41 | def read_tid_num_per_video(video_ann_dir): 42 | anns = np.loadtxt(video_ann_dir, dtype=np.float32, delimiter=',') 43 | max_tid = max(anns[:, 1]) 44 | return int(max_tid) 45 | 46 | 47 | for s in split: 48 | img_id = 1; ann_id = 1; video_cnt = 0; 49 | tid_cnt = 0 50 | images = []; annotations=[]; videos = [] 51 | all_video=[d for d in os.listdir(os.path.join(labels_path, s)) if '.json' in d] 52 | need_index=np.random.choice(range(len(all_video)),len(all_video)//3,replace=False) 53 | video_labels_list = [all_video[i] for i in need_index] 54 | 55 | for v_label in tqdm.tqdm(video_labels_list): 56 | video_cnt += 1 57 | video = {'id': video_cnt, 'file_name':v_label[:-5]} 58 | videos.append(video) 59 | 60 | v_lab_path = os.path.join(os.path.join(labels_path, s, v_label)) 61 | with open(v_lab_path, 'r') as f: 62 | annos=json.load(f)# anns per video 63 | num_frames = len(annos)# the number of frames per video 64 | sign_cnt = 0 65 | for ann in annos:# ann --- the annotation info of one frame; frames with empty labels are not skipped here 66 | 67 | img_name = os.path.join(img_path, s, ann['videoName'], ann['name']) 68 | img=cv2.imread(img_name) 69 | h,w,_ = img.shape 70 | 71 | img_info = { 72 | 'file_name':img_name, 73 | 'width':w, 74 | 'height':h, 75 | 'id': img_id, 76 | 'frame_id': ann['frameIndex'] + 1,# order strictly by the frame index labelled in the dataset, which helps to judge the relation between adjacent frames 77 | 'prev_image_id': -1 if ann['frameIndex'] == 0 else img_id - 1, 78 | 'next_image_id': -1 if ann['frameIndex'] == num_frames-1 else img_id + 1, 79 | 'video_id': video_cnt 80 | }# every image's info goes into images; image info of frames with empty annotations is also added here 81 | images.append(img_info) 82 | 83 | for j, lab in enumerate(ann['labels']): 84 | # lab --- the annotation info of one instance; if the frame has empty annotations, ann['labels'] is empty and this loop does not run, otherwise it keeps executing 85 | if lab['category'] in cat:# to avoid classes such as 'other vehicle' 86 | pass 87 | else: 88 | continue 89 | 90 | track_id = lab['id'] 91 | 92 | if sign_cnt == 0 and j==0: 93 | firstid = track_id 94 | sign_cnt = 1 95 | 96 | tid_curr = int(track_id) - int(firstid) + 1 97 | tid_cnt+=1 98 | is_crowd = lab['attributes']['crowd'] 99 | x1, y1, x2, y2=lab['box2d']['x1'], lab['box2d']['y1'], lab['box2d']['x2'], lab['box2d']['y2'] 100 | 101 | annotation = { 102 | 'image_id': img_id, 103 | 'conf': 1, 104 | 'bbox': [x1, y1, x2-x1, y2-y1], 105 | 'category_id': cat[lab['category']], 106 | 'id': ann_id, 107 | 'iscrowd': 1 if is_crowd else 0, 108 | 'track_id': tid_curr + tid_cnt, 109 | 'segmentation': [], 110 | 'area': (x2-x1)*(y2-y1), 111 | 'box_id':int(track_id) 112 | } 113 | annotations.append(annotation) 114 | ann_id += 1 115 | 116 | img_id += 1 117 | 118 | # tid_cnt += read_tid_num_per_video(os.path.join(mot_labels_path, s, v_label[:-5]+'.txt')) 119 | 120 | dataset_dict = {} 121 | dataset_dict["images"] = images 122 | dataset_dict["annotations"] = annotations 123 | dataset_dict["categories"] = categories 124 | dataset_dict["videos"] = videos 125 | 126 | json_str = json.dumps(dataset_dict) 127 | print(f' The number of detection objects is {ann_id - 1}, The number of detection imgs is {img_id -1} .') 128 | with open(out_path+f'{s}.json', 'w') as json_file: 129 | json_file.write(json_str) -------------------------------------------------------------------------------- /yolox/data/datasets/mot.py: 
-------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from pycocotools.coco import COCO 4 | from collections import defaultdict 5 | import os 6 | 7 | from ..dataloading import get_yolox_datadir 8 | from .datasets_wrapper import Dataset 9 | 10 | 11 | class MOTDataset(Dataset): 12 | """ 13 | COCO dataset class. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | data_dir=None, 19 | json_file="train_half.json", 20 | name="train", 21 | img_size=(608, 1088), 22 | preproc=None, 23 | ): 24 | """ 25 | COCO dataset initialization. Annotation data are read into memory by COCO API. 26 | Args: 27 | data_dir (str): dataset root directory 28 | json_file (str): COCO json file name 29 | name (str): COCO data name (e.g. 'train2017' or 'val2017') 30 | img_size (int): target image size after pre-processing 31 | preproc: data augmentation strategy 32 | """ 33 | super().__init__(img_size) 34 | if data_dir is None: 35 | data_dir = os.path.join(get_yolox_datadir(), "mot") 36 | self.data_dir = data_dir 37 | self.json_file = json_file 38 | 39 | self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file)) 40 | self.ids = self.coco.getImgIds() 41 | self.class_ids = sorted(self.coco.getCatIds()) 42 | cats = self.coco.loadCats(self.coco.getCatIds()) 43 | self._classes = tuple([c["name"] for c in cats]) 44 | self.video_info=defaultdict(list) 45 | self.annotations = self._load_coco_annotations() 46 | # "DanceTrack FRCNN" in self.coco.loadImgs(min(v))[0]["file_name"] or "MOT20" in self.coco.loadImgs(min(v))[0]["file_name"] 47 | self.video_info={k:(min(v),max(v),True) for k,v in self.video_info.items()} 48 | self.name = name 49 | self.img_size = img_size 50 | self.preproc = preproc 51 | 52 | def __len__(self): 53 | return len(self.ids) 54 | 55 | def _load_coco_annotations(self): 56 | return [self.load_anno_from_ids(index,_ids) for index,_ids in enumerate(self.ids)] 57 | 58 | def load_anno_from_ids(self,index,id_): 59 | im_ann = self.coco.loadImgs(id_)[0] 60 | width = im_ann["width"] 61 | height = im_ann["height"] 62 | frame_id = im_ann["frame_id"] 63 | video_id = im_ann["video_id"] 64 | self.video_info[video_id].append(index) 65 | anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False) 66 | annotations = self.coco.loadAnns(anno_ids) 67 | objs = [] 68 | for obj in annotations: 69 | x1 = obj["bbox"][0] 70 | y1 = obj["bbox"][1] 71 | x2 = x1 + obj["bbox"][2] 72 | y2 = y1 + obj["bbox"][3] 73 | if obj["area"] > 0 and x2 >= x1 and y2 >= y1: 74 | obj["clean_bbox"] = [x1, y1, x2, y2] 75 | objs.append(obj) 76 | 77 | num_objs = len(objs) 78 | 79 | res = np.zeros((num_objs, 6)) 80 | 81 | for ix, obj in enumerate(objs): 82 | cls = self.class_ids.index(obj["category_id"]) 83 | res[ix, 0:4] = obj["clean_bbox"] 84 | res[ix, 4] = cls 85 | res[ix, 5] = obj["track_id"] 86 | 87 | file_name = im_ann["file_name"] if "file_name" in im_ann else "{:012}".format(id_) + ".jpg" 88 | img_info = (height, width, frame_id, video_id, file_name) 89 | 90 | del im_ann, annotations 91 | 92 | return (res, img_info, file_name) 93 | 94 | def load_anno(self, index): 95 | return self.annotations[index][0] 96 | 97 | def pull_item(self, index): 98 | id_ = self.ids[index] 99 | 100 | res, img_info, file_name = self.annotations[index] 101 | # load image and preprocess 102 | img_file = os.path.join( 103 | self.data_dir, self.name, file_name 104 | ) 105 | # img_file=file_name 106 | img = cv2.imread(img_file) 107 | assert img is not None 108 | 109 | return img, res.copy(), 
img_info, np.array([id_]) 110 | 111 | @Dataset.resize_getitem 112 | def __getitem__(self, index): 113 | """ 114 | One image / label pair for the given index is picked up and pre-processed. 115 | 116 | Args: 117 | index (int): data index 118 | 119 | Returns: 120 | img (numpy.ndarray): pre-processed image 121 | padded_labels (torch.Tensor): pre-processed label data. 122 | The shape is :math:`[max_labels, 5]`. 123 | each label consists of [class, xc, yc, w, h]: 124 | class (float): class index. 125 | xc, yc (float) : center of bbox whose values range from 0 to 1. 126 | w, h (float) : size of bbox whose values range from 0 to 1. 127 | info_img : tuple of h, w, nh, nw, dx, dy. 128 | h, w (int): original shape of the image 129 | nh, nw (int): shape of the resized image without padding 130 | dx, dy (int): pad size 131 | img_id (int): same as the input index. Used for evaluation. 132 | """ 133 | img, target, img_info, img_id = self.pull_item(index) 134 | 135 | if self.preproc is not None: 136 | img, target = self.preproc(img, target, self.input_dim) 137 | return img,target,img_info,img_id 138 | -------------------------------------------------------------------------------- /yolox/utils/visualize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import cv2 6 | import numpy as np 7 | 8 | __all__ = ["vis"] 9 | 10 | 11 | def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): 12 | 13 | for i in range(len(boxes)): 14 | box = boxes[i] 15 | cls_id = int(cls_ids[i]) 16 | score = scores[i] 17 | if score < conf: 18 | continue 19 | x0 = int(box[0]) 20 | y0 = int(box[1]) 21 | x1 = int(box[2]) 22 | y1 = int(box[3]) 23 | 24 | color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() 25 | text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100) 26 | txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) 27 | font = cv2.FONT_HERSHEY_SIMPLEX 28 | 29 | txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] 30 | cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) 31 | 32 | txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() 33 | cv2.rectangle( 34 | img, 35 | (x0, y0 + 1), 36 | (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])), 37 | txt_bk_color, 38 | -1 39 | ) 40 | cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) 41 | 42 | return img 43 | 44 | 45 | def get_color(idx): 46 | idx = idx * 3 47 | color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) 48 | 49 | return color 50 | 51 | 52 | def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2=None): 53 | im = np.ascontiguousarray(np.copy(image)) 54 | im_h, im_w = im.shape[:2] 55 | 56 | top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 57 | 58 | #text_scale = max(1, image.shape[1] / 1600.) 
59 | #text_thickness = 2 60 | #line_thickness = max(1, int(image.shape[1] / 500.)) 61 | text_scale = 2 62 | text_thickness = 2 63 | line_thickness = 3 64 | 65 | radius = max(5, int(im_w/140.)) 66 | cv2.putText(im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), 67 | (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, 2, (0, 0, 255), thickness=2) 68 | 69 | for i, tlwh in enumerate(tlwhs): 70 | x1, y1, w, h = tlwh 71 | intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) 72 | obj_id = int(obj_ids[i]) 73 | id_text = '{}'.format(int(obj_id)) 74 | if ids2 is not None: 75 | id_text = id_text + ', {}'.format(int(ids2[i])) 76 | color = get_color(abs(obj_id)) 77 | cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) 78 | cv2.putText(im, id_text, (intbox[0], intbox[1]), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), 79 | thickness=text_thickness) 80 | return im 81 | 82 | 83 | _COLORS = np.array( 84 | [ 85 | 0.000, 0.447, 0.741, 86 | 0.850, 0.325, 0.098, 87 | 0.929, 0.694, 0.125, 88 | 0.494, 0.184, 0.556, 89 | 0.466, 0.674, 0.188, 90 | 0.301, 0.745, 0.933, 91 | 0.635, 0.078, 0.184, 92 | 0.300, 0.300, 0.300, 93 | 0.600, 0.600, 0.600, 94 | 1.000, 0.000, 0.000, 95 | 1.000, 0.500, 0.000, 96 | 0.749, 0.749, 0.000, 97 | 0.000, 1.000, 0.000, 98 | 0.000, 0.000, 1.000, 99 | 0.667, 0.000, 1.000, 100 | 0.333, 0.333, 0.000, 101 | 0.333, 0.667, 0.000, 102 | 0.333, 1.000, 0.000, 103 | 0.667, 0.333, 0.000, 104 | 0.667, 0.667, 0.000, 105 | 0.667, 1.000, 0.000, 106 | 1.000, 0.333, 0.000, 107 | 1.000, 0.667, 0.000, 108 | 1.000, 1.000, 0.000, 109 | 0.000, 0.333, 0.500, 110 | 0.000, 0.667, 0.500, 111 | 0.000, 1.000, 0.500, 112 | 0.333, 0.000, 0.500, 113 | 0.333, 0.333, 0.500, 114 | 0.333, 0.667, 0.500, 115 | 0.333, 1.000, 0.500, 116 | 0.667, 0.000, 0.500, 117 | 0.667, 0.333, 0.500, 118 | 0.667, 0.667, 0.500, 119 | 0.667, 1.000, 0.500, 120 | 1.000, 0.000, 0.500, 121 | 1.000, 0.333, 0.500, 122 | 1.000, 0.667, 0.500, 123 | 1.000, 1.000, 0.500, 124 | 0.000, 0.333, 1.000, 125 | 0.000, 0.667, 1.000, 126 | 0.000, 1.000, 1.000, 127 | 0.333, 0.000, 1.000, 128 | 0.333, 0.333, 1.000, 129 | 0.333, 0.667, 1.000, 130 | 0.333, 1.000, 1.000, 131 | 0.667, 0.000, 1.000, 132 | 0.667, 0.333, 1.000, 133 | 0.667, 0.667, 1.000, 134 | 0.667, 1.000, 1.000, 135 | 1.000, 0.000, 1.000, 136 | 1.000, 0.333, 1.000, 137 | 1.000, 0.667, 1.000, 138 | 0.333, 0.000, 0.000, 139 | 0.500, 0.000, 0.000, 140 | 0.667, 0.000, 0.000, 141 | 0.833, 0.000, 0.000, 142 | 1.000, 0.000, 0.000, 143 | 0.000, 0.167, 0.000, 144 | 0.000, 0.333, 0.000, 145 | 0.000, 0.500, 0.000, 146 | 0.000, 0.667, 0.000, 147 | 0.000, 0.833, 0.000, 148 | 0.000, 1.000, 0.000, 149 | 0.000, 0.000, 0.167, 150 | 0.000, 0.000, 0.333, 151 | 0.000, 0.000, 0.500, 152 | 0.000, 0.000, 0.667, 153 | 0.000, 0.000, 0.833, 154 | 0.000, 0.000, 1.000, 155 | 0.000, 0.000, 0.000, 156 | 0.143, 0.143, 0.143, 157 | 0.286, 0.286, 0.286, 158 | 0.429, 0.429, 0.429, 159 | 0.571, 0.571, 0.571, 160 | 0.714, 0.714, 0.714, 161 | 0.857, 0.857, 0.857, 162 | 0.000, 0.447, 0.741, 163 | 0.314, 0.717, 0.741, 164 | 0.50, 0.5, 0 165 | ] 166 | ).astype(np.float32).reshape(-1, 3) 167 | -------------------------------------------------------------------------------- /yolox/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | from yolox.utils.cluster_nms import giou_3d 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 29 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 30 | 31 | wh = (rb - lt).clamp(min=0) # [N,M,2] 32 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 33 | 34 | union = area1[:, None] + area2 - inter 35 | 36 | iou = inter / union 37 | return iou, union 38 | 39 | 40 | def generalized_box_iou(boxes1,boxes2,boxes3,boxes4): 41 | """ 42 | Generalized IoU from https://giou.stanford.edu/ 43 | 44 | The boxes should be in [x0, y0, x1, y1] format 45 | 46 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 47 | and M = len(boxes2) 48 | """ 49 | # degenerate boxes gives inf / nan results 50 | # so do an early check 51 | # boxes1=boxes1.float() 52 | # boxes2=boxes2.float() 53 | # boxes3=boxes3.float() 54 | # boxes4=boxes4.float() 55 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 56 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 57 | assert (boxes3[:, 2:] >= boxes3[:, :2]).all() 58 | assert (boxes4[:, 2:] >= boxes4[:, :2]).all() 59 | # iou1, union1 = box_iou(boxes1, boxes3) 60 | # iou2, union2 = box_iou(boxes2, boxes4) 61 | # lt = torch.min(boxes1[:, None, :2], boxes3[:, :2]) 62 | # rb = torch.max(boxes1[:, None, 2:], boxes3[:, 2:]) 63 | 64 | # wh = (rb - lt).clamp(min=0) # [N,M,2] 65 | # area1 = wh[:, :, 0] * wh[:, :, 1] 66 | 67 | # lt = torch.min(boxes2[:, None, :2], boxes4[:, :2]) 68 | # rb = torch.max(boxes2[:, None, 2:], boxes4[:, 2:]) 69 | 70 | # wh = (rb - lt).clamp(min=0) # [N,M,2] 71 | # area2 = wh[:, :, 0] * wh[:, :, 1] 72 | # uiou=(iou1*union1+iou2*union2)/(union1+union2) 73 | # uunion=union1+union2 74 | # uarea=area1+area2 75 | # return uiou- (uarea - uunion) / uarea 76 | 77 | return giou_3d(boxes1,boxes3,boxes2,boxes4) 78 | 79 | 80 | def masks_to_boxes(masks): 81 | """Compute the bounding boxes around the provided masks 82 | 83 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
84 | 85 | Returns a [N, 4] tensors, with the boxes in xyxy format 86 | """ 87 | if masks.numel() == 0: 88 | return torch.zeros((0, 4), device=masks.device) 89 | 90 | h, w = masks.shape[-2:] 91 | 92 | y = torch.arange(0, h, dtype=torch.float) 93 | x = torch.arange(0, w, dtype=torch.float) 94 | y, x = torch.meshgrid(y, x) 95 | 96 | x_mask = (masks * x.unsqueeze(0)) 97 | x_max = x_mask.flatten(1).max(-1)[0] 98 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 99 | 100 | y_mask = (masks * y.unsqueeze(0)) 101 | y_max = y_mask.flatten(1).max(-1)[0] 102 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 103 | 104 | return torch.stack([x_min, y_min, x_max, y_max], 1) 105 | 106 | 107 | 108 | # boxes = targets[:, :4].copy() 109 | # labels = targets[:, 4].copy() 110 | # ids = targets[:, 5].copy() 111 | # if len(boxes) == 0: 112 | # targets = np.zeros((self.max_labels, 6), dtype=np.float32) 113 | # image, r_o = preproc(image, input_dim, self.means, self.std) 114 | # image = np.ascontiguousarray(image, dtype=np.float32) 115 | # return image, targets 116 | 117 | # image_o = image.copy() 118 | # targets_o = targets.copy() 119 | # height_o, width_o, _ = image_o.shape 120 | # boxes_o = targets_o[:, :4] 121 | # labels_o = targets_o[:, 4] 122 | # ids_o = targets_o[:, 5] 123 | # # bbox_o: [xyxy] to [c_x,c_y,w,h] 124 | # boxes_o = xyxy2cxcywh(boxes_o) 125 | 126 | # image_t = _distort(image) 127 | # image_t, boxes_t ,image_r,boxes_r= _mirror(image_t, boxes) 128 | # height, width, _ = image_t.shape 129 | # image_t, r_t = preproc(image_t, input_dim, self.means, self.std) 130 | # image_t, r_r = preproc(image_r, input_dim, self.means, self.std) 131 | # # boxes [xyxy] 2 [cx,cy,w,h] 132 | # boxes_t = xyxy2cxcywh(boxes_t) 133 | # boxes_t *= r_t 134 | 135 | # boxes_r = xyxy2cxcywh(boxes_r) 136 | # boxes_r *= r_r 137 | 138 | # mask_b = np.minimum(boxes_t[:, 2], boxes_t[:, 3]) > 1 139 | # boxes_t = boxes_t[mask_b] 140 | # boxes_r = boxes_r[mask_b] 141 | 142 | # labels_t = labels[mask_b] 143 | # ids_t = ids[mask_b] 144 | 145 | # if len(boxes_t) == 0: 146 | # image_t, r_o = preproc(image_o, input_dim, self.means, self.std) 147 | # boxes_o *= r_o 148 | # boxes_t = boxes_o 149 | # image_r=image_t 150 | # boxes_r=boxes_t 151 | # labels_t = labels_o 152 | # ids_t = ids_o 153 | 154 | # labels_t = np.expand_dims(labels_t, 1) 155 | # ids_t = np.expand_dims(ids_t, 1) 156 | 157 | # targets_t = np.hstack((labels_t, boxes_t, ids_t)) 158 | # padded_labels = np.zeros((self.max_labels, 6)) 159 | # padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[ 160 | # : self.max_labels 161 | # ] 162 | 163 | # targets_r = np.hstack((labels_t, boxes_r, ids_t)) 164 | # padded_labels_r = np.zeros((self.max_labels, 6)) 165 | # padded_labels_r[range(len(targets_r))[: self.max_labels]] = targets_r[ 166 | # : self.max_labels 167 | # ] 168 | # padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32) 169 | # image_t = np.ascontiguousarray(image_t, dtype=np.float32) 170 | # return image_t, padded_labels 171 | -------------------------------------------------------------------------------- /yolox/layers/fast_coco_eval_api.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # This file comes from 4 | # https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py 5 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 6 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 7 | 8 | import numpy as np 9 | from pycocotools.cocoeval import COCOeval 10 | 11 | # import torch first to make yolox._C work without ImportError of libc10.so 12 | # in YOLOX, env is already set in __init__.py. 13 | from yolox import _C 14 | 15 | import copy 16 | import time 17 | 18 | 19 | class COCOeval_opt(COCOeval): 20 | """ 21 | This is a slightly modified version of the original COCO API, where the functions evaluateImg() 22 | and accumulate() are implemented in C++ to speedup evaluation 23 | """ 24 | 25 | def evaluate(self): 26 | """ 27 | Run per image evaluation on given images and store results in self.evalImgs_cpp, a 28 | datastructure that isn't readable from Python but is used by a c++ implementation of 29 | accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure 30 | self.evalImgs because this datastructure is a computational bottleneck. 31 | :return: None 32 | """ 33 | tic = time.time() 34 | 35 | print("Running per image evaluation...") 36 | p = self.params 37 | # add backward compatibility if useSegm is specified in params 38 | if p.useSegm is not None: 39 | p.iouType = "segm" if p.useSegm == 1 else "bbox" 40 | print( 41 | "useSegm (deprecated) is not None. Running {} evaluation".format( 42 | p.iouType 43 | ) 44 | ) 45 | print("Evaluate annotation type *{}*".format(p.iouType)) 46 | p.imgIds = list(np.unique(p.imgIds)) 47 | if p.useCats: 48 | p.catIds = list(np.unique(p.catIds)) 49 | p.maxDets = sorted(p.maxDets) 50 | self.params = p 51 | 52 | self._prepare() 53 | 54 | # loop through images, area range, max detection number 55 | catIds = p.catIds if p.useCats else [-1] 56 | 57 | if p.iouType == "segm" or p.iouType == "bbox": 58 | computeIoU = self.computeIoU 59 | elif p.iouType == "keypoints": 60 | computeIoU = self.computeOks 61 | self.ious = { 62 | (imgId, catId): computeIoU(imgId, catId) 63 | for imgId in p.imgIds 64 | for catId in catIds 65 | } 66 | 67 | maxDet = p.maxDets[-1] 68 | 69 | # <<<< Beginning of code differences with original COCO API 70 | def convert_instances_to_cpp(instances, is_det=False): 71 | # Convert annotations for a list of instances in an image to a format that's fast 72 | # to access in C++ 73 | instances_cpp = [] 74 | for instance in instances: 75 | instance_cpp = _C.InstanceAnnotation( 76 | int(instance["id"]), 77 | instance["score"] if is_det else instance.get("score", 0.0), 78 | instance["area"], 79 | bool(instance.get("iscrowd", 0)), 80 | bool(instance.get("ignore", 0)), 81 | ) 82 | instances_cpp.append(instance_cpp) 83 | return instances_cpp 84 | 85 | # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++ 86 | ground_truth_instances = [ 87 | [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds] 88 | for imgId in p.imgIds 89 | ] 90 | detected_instances = [ 91 | [ 92 | convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) 93 | for catId in p.catIds 94 | ] 95 | for imgId in p.imgIds 96 | ] 97 | ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds] 98 | 99 | if not p.useCats: 100 | # For each image, flatten per-category lists into a single list 101 | ground_truth_instances = [ 102 | [[o for c in i for o in c]] for i in ground_truth_instances 103 | ] 104 | detected_instances = [ 105 | [[o for c in i for o in c]] for i in detected_instances 106 | ] 107 | 108 | # Call C++ implementation of self.evaluateImgs() 109 | self._evalImgs_cpp = 
_C.COCOevalEvaluateImages( 110 | p.areaRng, 111 | maxDet, 112 | p.iouThrs, 113 | ious, 114 | ground_truth_instances, 115 | detected_instances, 116 | ) 117 | self._evalImgs = None 118 | 119 | self._paramsEval = copy.deepcopy(self.params) 120 | toc = time.time() 121 | print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic)) 122 | # >>>> End of code differences with original COCO API 123 | 124 | def accumulate(self): 125 | """ 126 | Accumulate per image evaluation results and store the result in self.eval. Does not 127 | support changing parameter settings from those used by self.evaluate() 128 | """ 129 | print("Accumulating evaluation results...") 130 | tic = time.time() 131 | if not hasattr(self, "_evalImgs_cpp"): 132 | print("Please run evaluate() first") 133 | 134 | self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp) 135 | 136 | # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections 137 | self.eval["recall"] = np.array(self.eval["recall"]).reshape( 138 | self.eval["counts"][:1] + self.eval["counts"][2:] 139 | ) 140 | 141 | # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X 142 | # num_area_ranges X num_max_detections 143 | self.eval["precision"] = np.array(self.eval["precision"]).reshape( 144 | self.eval["counts"] 145 | ) 146 | self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"]) 147 | toc = time.time() 148 | print( 149 | "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic) 150 | ) 151 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_det_mot17.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "val_half.json" 20 | self.input_size = (800, 1440) 21 | self.test_size = (800, 1440) 22 | self.random_size = (18, 32) 23 | self.max_epoch = 30 24 | self.print_interval = 20 25 | self.eval_interval = 5 26 | self.no_aug_epochs = 10 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="detection" 30 | self.enable_mixup = True 31 | self.seed=8823 32 | self.conf_thresh=0.4 33 | self.det_thresh=0.7 34 | self.nms_thresh2d=0.75 35 | self.nms_thresh3d=0.7 36 | self.interval=5 37 | 38 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 39 | from yolox.data import ( 40 | MOTDataset, 41 | TrainTransform, 42 | YoloBatchSampler, 43 | DataLoader, 44 | InfiniteSampler, 45 | MosaicDetection, 46 | ) 47 | 48 | dataset = MOTDataset( 49 | data_dir=os.path.join(get_yolox_datadir(), "mix_det"), 50 | json_file=self.train_ann, 51 | name='', 52 | img_size=self.input_size, 53 | preproc=TrainTransform( 54 | rgb_means=(0.485, 0.456, 0.406), 55 | std=(0.229, 0.224, 0.225), 56 | max_labels=500, 57 | ), 58 | ) 59 | 60 | dataset = MosaicDetection( 61 | dataset, 62 | mosaic=not no_aug, 63 | img_size=self.input_size, 64 | preproc=TrainTransform( 65 | rgb_means=(0.485, 0.456, 0.406), 66 
| std=(0.229, 0.224, 0.225), 67 | max_labels=1000, 68 | ), 69 | degrees=self.degrees, 70 | translate=self.translate, 71 | scale=self.scale, 72 | shear=self.shear, 73 | perspective=self.perspective, 74 | enable_mixup=self.enable_mixup, 75 | ) 76 | 77 | self.dataset = dataset 78 | 79 | if is_distributed: 80 | batch_size = batch_size // dist.get_world_size() 81 | 82 | sampler = InfiniteSampler( 83 | len(self.dataset), seed=self.seed if self.seed else 0 84 | ) 85 | 86 | batch_sampler = YoloBatchSampler( 87 | sampler=sampler, 88 | batch_size=batch_size, 89 | drop_last=False, 90 | input_dimension=self.input_size, 91 | mosaic=not no_aug, 92 | ) 93 | 94 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 95 | dataloader_kwargs["batch_sampler"] = batch_sampler 96 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 97 | 98 | return train_loader 99 | 100 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 101 | from yolox.data import MOTDataset,DiffusionValTransform 102 | 103 | valdataset = MOTDataset( 104 | data_dir=os.path.join(get_yolox_datadir(), "mot"), 105 | json_file=self.val_ann, 106 | img_size=self.test_size, 107 | name='train', 108 | preproc=DiffusionValTransform( 109 | rgb_means=(0.485, 0.456, 0.406), 110 | std=(0.229, 0.224, 0.225), 111 | max_labels=1000, 112 | ) 113 | ) 114 | 115 | if is_distributed: 116 | batch_size = batch_size // dist.get_world_size() 117 | sampler = torch.utils.data.distributed.DistributedSampler( 118 | valdataset, shuffle=False 119 | ) 120 | else: 121 | sampler = torch.utils.data.SequentialSampler(valdataset) 122 | 123 | dataloader_kwargs = { 124 | "num_workers": self.data_num_workers, 125 | "pin_memory": True, 126 | "sampler": sampler, 127 | } 128 | dataloader_kwargs["batch_size"] = batch_size 129 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 130 | 131 | return val_loader 132 | 133 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 134 | from yolox.evaluators import COCOEvaluator 135 | 136 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 137 | evaluator = COCOEvaluator( 138 | dataloader=val_loader, 139 | img_size=self.test_size, 140 | confthre=self.conf_thresh, 141 | nmsthre3d=self.nms_thresh3d, 142 | detthre=self.det_thresh, 143 | nmsthre2d=self.nms_thresh2d, 144 | num_classes=self.num_classes, 145 | testdev=testdev, 146 | ) 147 | return evaluator 148 | 149 | def get_model(self): 150 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 151 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 152 | 153 | def init_yolo(M): 154 | for m in M.modules(): 155 | if isinstance(m, nn.BatchNorm2d): 156 | m.eps = 1e-3 157 | m.momentum = 0.03 158 | 159 | if getattr(self, "model", None) is None: 160 | in_channels = [256, 512, 1024] 161 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 162 | for value in backbone.parameters(): 163 | value.requires_grad=False 164 | head=DiffusionHead(self.num_classes,self.width) 165 | self.model = DiffusionNet(backbone, head) 166 | 167 | self.model.apply(init_yolo) 168 | # self.model.head.initialize_biases(1e-2) 169 | return self.model 170 | 171 | def get_optimizer(self, batch_size): 172 | lr=2.5e-05 173 | weight_decay = 0.0001 174 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 175 | return self.optimizer 176 | -------------------------------------------------------------------------------- 
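A minimal usage sketch for the experiment file above (not part of the repository sources): it assumes the standard YOLOX-style yolox.exp.get_exp helper is available and that the mix_det dataset has been prepared under the YOLOX data directory; the file path and batch size are illustrative only.

from yolox.exp import get_exp

# Load the MOT17 diffusion-detection experiment defined above (path assumed relative to the repo root).
exp = get_exp("exps/example/mot/yolox_x_diffusion_det_mot17.py", None)

model = exp.get_model()                      # frozen YOLOPAFPN backbone + DiffusionHead
optimizer = exp.get_optimizer(batch_size=4)  # AdamW, lr=2.5e-5, weight_decay=1e-4 (get_model must run first)
train_loader = exp.get_data_loader(batch_size=4, is_distributed=False, no_aug=False)
evaluator = exp.get_evaluator(batch_size=4, is_distributed=False)
print(exp.exp_name, exp.input_size, exp.task)

In practice tools/train.py presumably drives these Exp objects end to end; the snippet only illustrates the per-experiment API that the following configs repeat with different datasets, input sizes, and task settings.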
/exps/example/mot/yolox_x_diffusion_det_mot20.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "val_half.json" 20 | self.input_size = (896, 1600) 21 | self.test_size = (896, 1600) 22 | self.random_size = (20, 36) 23 | self.max_epoch = 30 24 | self.print_interval = 20 25 | self.eval_interval = 5 26 | self.no_aug_epochs = 10 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="detection" 30 | self.enable_mixup = True 31 | self.seed=8823 32 | self.conf_thresh=0.4 33 | self.det_thresh=0.7 34 | self.nms_thresh2d=0.75 35 | self.nms_thresh3d=0.7 36 | self.interval=5 37 | 38 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 39 | from yolox.data import ( 40 | MOTDataset, 41 | TrainTransform, 42 | YoloBatchSampler, 43 | DataLoader, 44 | InfiniteSampler, 45 | MosaicDetection, 46 | ) 47 | 48 | dataset = MOTDataset( 49 | data_dir=os.path.join(get_yolox_datadir(), "mix_mot20_ch"), 50 | json_file=self.train_ann, 51 | name='', 52 | img_size=self.input_size, 53 | preproc=TrainTransform( 54 | rgb_means=(0.485, 0.456, 0.406), 55 | std=(0.229, 0.224, 0.225), 56 | max_labels=500, 57 | ), 58 | ) 59 | 60 | dataset = MosaicDetection( 61 | dataset, 62 | mosaic=not no_aug, 63 | img_size=self.input_size, 64 | preproc=TrainTransform( 65 | rgb_means=(0.485, 0.456, 0.406), 66 | std=(0.229, 0.224, 0.225), 67 | max_labels=1200, 68 | ), 69 | degrees=self.degrees, 70 | translate=self.translate, 71 | scale=self.scale, 72 | shear=self.shear, 73 | perspective=self.perspective, 74 | enable_mixup=self.enable_mixup, 75 | ) 76 | 77 | self.dataset = dataset 78 | 79 | if is_distributed: 80 | batch_size = batch_size // dist.get_world_size() 81 | 82 | sampler = InfiniteSampler( 83 | len(self.dataset), seed=self.seed if self.seed else 0 84 | ) 85 | 86 | batch_sampler = YoloBatchSampler( 87 | sampler=sampler, 88 | batch_size=batch_size, 89 | drop_last=False, 90 | input_dimension=self.input_size, 91 | mosaic=not no_aug, 92 | ) 93 | 94 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 95 | dataloader_kwargs["batch_sampler"] = batch_sampler 96 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 97 | 98 | return train_loader 99 | 100 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 101 | from yolox.data import MOTDataset,DiffusionValTransform 102 | 103 | valdataset = MOTDataset( 104 | data_dir=os.path.join(get_yolox_datadir(), "MOT20"), 105 | json_file=self.val_ann, 106 | img_size=self.test_size, 107 | name='train', 108 | preproc=DiffusionValTransform( 109 | rgb_means=(0.485, 0.456, 0.406), 110 | std=(0.229, 0.224, 0.225), 111 | max_labels=1200, 112 | ) 113 | ) 114 | 115 | if is_distributed: 116 | batch_size = batch_size // dist.get_world_size() 117 | sampler = torch.utils.data.distributed.DistributedSampler( 118 | valdataset, shuffle=False 119 | ) 120 | else: 121 | sampler = torch.utils.data.SequentialSampler(valdataset) 122 | 123 | dataloader_kwargs = 
{ 124 | "num_workers": self.data_num_workers, 125 | "pin_memory": True, 126 | "sampler": sampler, 127 | } 128 | dataloader_kwargs["batch_size"] = batch_size 129 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 130 | 131 | return val_loader 132 | 133 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 134 | from yolox.evaluators import COCOEvaluator 135 | 136 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 137 | evaluator = COCOEvaluator( 138 | dataloader=val_loader, 139 | img_size=self.test_size, 140 | confthre=self.conf_thresh, 141 | nmsthre3d=self.nms_thresh3d, 142 | detthre=self.det_thresh, 143 | nmsthre2d=self.nms_thresh2d, 144 | num_classes=self.num_classes, 145 | testdev=testdev, 146 | ) 147 | return evaluator 148 | 149 | def get_model(self): 150 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 151 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 152 | 153 | def init_yolo(M): 154 | for m in M.modules(): 155 | if isinstance(m, nn.BatchNorm2d): 156 | m.eps = 1e-3 157 | m.momentum = 0.03 158 | 159 | if getattr(self, "model", None) is None: 160 | in_channels = [256, 512, 1024] 161 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 162 | for value in backbone.parameters(): 163 | value.requires_grad=False 164 | head=DiffusionHead(self.num_classes,self.width) 165 | self.model = DiffusionNet(backbone, head) 166 | 167 | self.model.apply(init_yolo) 168 | # self.model.head.initialize_biases(1e-2) 169 | return self.model 170 | 171 | def get_optimizer(self, batch_size): 172 | lr=2.5e-05 173 | weight_decay = 0.0001 174 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 175 | return self.optimizer 176 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_det_dancetrack.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "train.json" 20 | self.input_size = (896, 1600) 21 | self.test_size = (896, 1600) 22 | self.random_size = (18, 32) 23 | self.max_epoch = 20 24 | self.print_interval = 20 25 | self.eval_interval = 40 26 | self.no_aug_epochs = 5 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="detection" 30 | self.enable_mixup = True 31 | self.seed=8823 32 | self.conf_thresh=0.4 33 | self.det_thresh=0.7 34 | self.nms_thresh2d=0.75 35 | self.nms_thresh3d=0.7 36 | self.interval=5 37 | 38 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 39 | from yolox.data import ( 40 | MOTDataset, 41 | TrainTransform, 42 | YoloBatchSampler, 43 | DataLoader, 44 | InfiniteSampler, 45 | MosaicDetection, 46 | ) 47 | 48 | dataset = MOTDataset( 49 | data_dir=os.path.join(get_yolox_datadir(), "dancetrack"), 50 | json_file=self.train_ann, 51 | name='train', 52 | img_size=self.input_size, 53 | preproc=TrainTransform( 54 | rgb_means=(0.485, 0.456, 0.406), 55 | std=(0.229, 0.224, 0.225), 
56 | max_labels=500, 57 | ), 58 | ) 59 | 60 | dataset = MosaicDetection( 61 | dataset, 62 | mosaic=not no_aug, 63 | img_size=self.input_size, 64 | preproc=TrainTransform( 65 | rgb_means=(0.485, 0.456, 0.406), 66 | std=(0.229, 0.224, 0.225), 67 | max_labels=1000, 68 | ), 69 | degrees=self.degrees, 70 | translate=self.translate, 71 | scale=self.scale, 72 | shear=self.shear, 73 | perspective=self.perspective, 74 | enable_mixup=self.enable_mixup, 75 | ) 76 | 77 | self.dataset = dataset 78 | 79 | if is_distributed: 80 | batch_size = batch_size // dist.get_world_size() 81 | 82 | sampler = InfiniteSampler( 83 | len(self.dataset), seed=self.seed if self.seed else 0 84 | ) 85 | 86 | batch_sampler = YoloBatchSampler( 87 | sampler=sampler, 88 | batch_size=batch_size, 89 | drop_last=False, 90 | input_dimension=self.input_size, 91 | mosaic=not no_aug, 92 | ) 93 | 94 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 95 | dataloader_kwargs["batch_sampler"] = batch_sampler 96 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 97 | 98 | return train_loader 99 | 100 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 101 | from yolox.data import MOTDataset,DiffusionValTransform 102 | 103 | valdataset = MOTDataset( 104 | data_dir=os.path.join(get_yolox_datadir(), "dancetrack"), 105 | json_file=self.val_ann, 106 | img_size=self.test_size, 107 | name='train', 108 | preproc=DiffusionValTransform( 109 | rgb_means=(0.485, 0.456, 0.406), 110 | std=(0.229, 0.224, 0.225), 111 | max_labels=1000, 112 | ) 113 | ) 114 | 115 | if is_distributed: 116 | batch_size = batch_size // dist.get_world_size() 117 | sampler = torch.utils.data.distributed.DistributedSampler( 118 | valdataset, shuffle=False 119 | ) 120 | else: 121 | sampler = torch.utils.data.SequentialSampler(valdataset) 122 | 123 | dataloader_kwargs = { 124 | "num_workers": self.data_num_workers, 125 | "pin_memory": True, 126 | "sampler": sampler, 127 | } 128 | dataloader_kwargs["batch_size"] = batch_size 129 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 130 | 131 | return val_loader 132 | 133 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 134 | from yolox.evaluators import COCOEvaluator 135 | 136 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 137 | evaluator = COCOEvaluator( 138 | dataloader=val_loader, 139 | img_size=self.test_size, 140 | confthre=self.conf_thresh, 141 | nmsthre3d=self.nms_thresh3d, 142 | detthre=self.det_thresh, 143 | nmsthre2d=self.nms_thresh2d, 144 | num_classes=self.num_classes, 145 | testdev=testdev, 146 | ) 147 | return evaluator 148 | 149 | def get_model(self): 150 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 151 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 152 | 153 | def init_yolo(M): 154 | for m in M.modules(): 155 | if isinstance(m, nn.BatchNorm2d): 156 | m.eps = 1e-3 157 | m.momentum = 0.03 158 | 159 | if getattr(self, "model", None) is None: 160 | in_channels = [256, 512, 1024] 161 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 162 | for value in backbone.parameters(): 163 | value.requires_grad=False 164 | head=DiffusionHead(self.num_classes,self.width) 165 | self.model = DiffusionNet(backbone, head) 166 | 167 | self.model.apply(init_yolo) 168 | # self.model.head.initialize_biases(1e-2) 169 | return self.model 170 | 171 | def get_optimizer(self, batch_size): 172 | lr=2.5e-05 173 | weight_decay = 0.0001 174 | 
self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 175 | return self.optimizer 176 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_det_mot17_ablation.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train_half.json" 19 | self.val_ann = "val_half.json" 20 | self.input_size = (800, 1440) 21 | self.test_size = (800, 1440) 22 | self.random_size = (18, 32) 23 | self.max_epoch = 30 24 | self.print_interval = 20 25 | self.eval_interval = 5 26 | self.no_aug_epochs = 10 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="detection" 30 | self.enable_mixup = True 31 | self.seed=8823 32 | self.conf_thresh=0.4 33 | self.det_thresh=0.7 34 | self.nms_thresh2d=0.75 35 | self.nms_thresh3d=0.7 36 | self.interval=5 37 | 38 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 39 | from yolox.data import ( 40 | MOTDataset, 41 | TrainTransform, 42 | YoloBatchSampler, 43 | DataLoader, 44 | InfiniteSampler, 45 | MosaicDetection, 46 | ) 47 | 48 | dataset = MOTDataset( 49 | data_dir=os.path.join(get_yolox_datadir(), "mix_mot_ch"), 50 | json_file=self.train_ann, 51 | name='', 52 | img_size=self.input_size, 53 | preproc=TrainTransform( 54 | rgb_means=(0.485, 0.456, 0.406), 55 | std=(0.229, 0.224, 0.225), 56 | max_labels=500, 57 | ), 58 | ) 59 | 60 | dataset = MosaicDetection( 61 | dataset, 62 | mosaic=not no_aug, 63 | img_size=self.input_size, 64 | preproc=TrainTransform( 65 | rgb_means=(0.485, 0.456, 0.406), 66 | std=(0.229, 0.224, 0.225), 67 | max_labels=1000, 68 | ), 69 | degrees=self.degrees, 70 | translate=self.translate, 71 | scale=self.scale, 72 | shear=self.shear, 73 | perspective=self.perspective, 74 | enable_mixup=self.enable_mixup, 75 | ) 76 | 77 | self.dataset = dataset 78 | 79 | if is_distributed: 80 | batch_size = batch_size // dist.get_world_size() 81 | 82 | sampler = InfiniteSampler( 83 | len(self.dataset), seed=self.seed if self.seed else 0 84 | ) 85 | 86 | batch_sampler = YoloBatchSampler( 87 | sampler=sampler, 88 | batch_size=batch_size, 89 | drop_last=False, 90 | input_dimension=self.input_size, 91 | mosaic=not no_aug, 92 | ) 93 | 94 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 95 | dataloader_kwargs["batch_sampler"] = batch_sampler 96 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 97 | 98 | return train_loader 99 | 100 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 101 | from yolox.data import MOTDataset,DiffusionValTransform 102 | 103 | valdataset = MOTDataset( 104 | data_dir=os.path.join(get_yolox_datadir(), "mot"), 105 | json_file=self.val_ann, 106 | img_size=self.test_size, 107 | name='train', 108 | preproc=DiffusionValTransform( 109 | rgb_means=(0.485, 0.456, 0.406), 110 | std=(0.229, 0.224, 0.225), 111 | max_labels=1000, 112 | ) 113 | ) 114 | 115 | if is_distributed: 116 | batch_size = batch_size // dist.get_world_size() 117 | 
sampler = torch.utils.data.distributed.DistributedSampler( 118 | valdataset, shuffle=False 119 | ) 120 | else: 121 | sampler = torch.utils.data.SequentialSampler(valdataset) 122 | 123 | dataloader_kwargs = { 124 | "num_workers": self.data_num_workers, 125 | "pin_memory": True, 126 | "sampler": sampler, 127 | } 128 | dataloader_kwargs["batch_size"] = batch_size 129 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 130 | 131 | return val_loader 132 | 133 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 134 | from yolox.evaluators import COCOEvaluator 135 | 136 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 137 | evaluator = COCOEvaluator( 138 | dataloader=val_loader, 139 | img_size=self.test_size, 140 | confthre=self.conf_thresh, 141 | nmsthre3d=self.nms_thresh3d, 142 | detthre=self.det_thresh, 143 | nmsthre2d=self.nms_thresh2d, 144 | num_classes=self.num_classes, 145 | testdev=testdev, 146 | ) 147 | return evaluator 148 | 149 | def get_model(self): 150 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 151 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 152 | 153 | def init_yolo(M): 154 | for m in M.modules(): 155 | if isinstance(m, nn.BatchNorm2d): 156 | m.eps = 1e-3 157 | m.momentum = 0.03 158 | 159 | if getattr(self, "model", None) is None: 160 | in_channels = [256, 512, 1024] 161 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 162 | for value in backbone.parameters(): 163 | value.requires_grad=False 164 | head=DiffusionHead(self.num_classes,self.width) 165 | self.model = DiffusionNet(backbone, head) 166 | 167 | self.model.apply(init_yolo) 168 | # self.model.head.initialize_biases(1e-2) 169 | return self.model 170 | 171 | def get_optimizer(self, batch_size): 172 | lr=2.5e-05 173 | weight_decay = 0.0001 174 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 175 | return self.optimizer 176 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_track_mot17.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "val_half.json" 20 | self.input_size = (800, 1440) 21 | self.test_size = (800, 1440) 22 | self.random_size = (18, 32) 23 | self.max_epoch = 30 24 | self.print_interval = 20 25 | self.eval_interval = 5 26 | self.no_aug_epochs = 10 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="tracking" 30 | self.seed=8823 31 | self.conf_thresh=0.4 32 | self.det_thresh=0.7 33 | self.nms_thresh2d=0.75 34 | self.nms_thresh3d=0.7 35 | self.interval=5 36 | 37 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 38 | from yolox.data import ( 39 | MOTDataset, 40 | TrainTransform, 41 | YoloBatchSampler, 42 | DataLoader, 43 | InfiniteSampler, 44 | MosaicDetection, 45 | DiffusionMosaicDetection, 46 | DiffusionTrainTransform 47 | ) 48 | 49 | dataset = MOTDataset( 50 | 
data_dir=os.path.join(get_yolox_datadir(), "mot"), 51 | json_file=self.train_ann, 52 | name='train', 53 | img_size=self.input_size, 54 | preproc=TrainTransform( 55 | rgb_means=(0.485, 0.456, 0.406), 56 | std=(0.229, 0.224, 0.225), 57 | max_labels=500, 58 | ), 59 | ) 60 | 61 | dataset = DiffusionMosaicDetection( 62 | dataset, 63 | mosaic=not no_aug, 64 | img_size=self.input_size, 65 | preproc=DiffusionTrainTransform( 66 | rgb_means=(0.485, 0.456, 0.406), 67 | std=(0.229, 0.224, 0.225), 68 | max_labels=1000, 69 | ), 70 | degrees=self.degrees, 71 | translate=self.translate, 72 | scale=self.scale, 73 | shear=self.shear, 74 | perspective=self.perspective, 75 | enable_mixup=self.enable_mixup, 76 | ) 77 | 78 | self.dataset = dataset 79 | 80 | if is_distributed: 81 | batch_size = batch_size // dist.get_world_size() 82 | 83 | sampler = InfiniteSampler( 84 | len(self.dataset), seed=self.seed if self.seed else 0 85 | ) 86 | 87 | batch_sampler = YoloBatchSampler( 88 | sampler=sampler, 89 | batch_size=batch_size, 90 | drop_last=False, 91 | input_dimension=self.input_size, 92 | mosaic=not no_aug, 93 | ) 94 | 95 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 96 | dataloader_kwargs["batch_sampler"] = batch_sampler 97 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 98 | 99 | return train_loader 100 | 101 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 102 | from yolox.data import MOTDataset,DiffusionValTransform 103 | 104 | valdataset = MOTDataset( 105 | data_dir=os.path.join(get_yolox_datadir(), "mot"), 106 | json_file=self.val_ann, 107 | img_size=self.test_size, 108 | name='train', 109 | preproc=DiffusionValTransform( 110 | rgb_means=(0.485, 0.456, 0.406), 111 | std=(0.229, 0.224, 0.225), 112 | max_labels=1000, 113 | ) 114 | ) 115 | 116 | if is_distributed: 117 | batch_size = batch_size // dist.get_world_size() 118 | sampler = torch.utils.data.distributed.DistributedSampler( 119 | valdataset, shuffle=False 120 | ) 121 | else: 122 | sampler = torch.utils.data.SequentialSampler(valdataset) 123 | 124 | dataloader_kwargs = { 125 | "num_workers": self.data_num_workers, 126 | "pin_memory": True, 127 | "sampler": sampler, 128 | } 129 | dataloader_kwargs["batch_size"] = batch_size 130 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 131 | 132 | return val_loader 133 | 134 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 135 | from yolox.evaluators import COCOEvaluator 136 | 137 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 138 | evaluator = COCOEvaluator( 139 | dataloader=val_loader, 140 | img_size=self.test_size, 141 | confthre=self.conf_thresh, 142 | nmsthre3d=self.nms_thresh3d, 143 | detthre=self.det_thresh, 144 | nmsthre2d=self.nms_thresh2d, 145 | num_classes=self.num_classes, 146 | testdev=testdev, 147 | ) 148 | return evaluator 149 | 150 | def get_model(self): 151 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 152 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 153 | 154 | def init_yolo(M): 155 | for m in M.modules(): 156 | if isinstance(m, nn.BatchNorm2d): 157 | m.eps = 1e-3 158 | m.momentum = 0.03 159 | 160 | if getattr(self, "model", None) is None: 161 | in_channels = [256, 512, 1024] 162 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 163 | for value in backbone.parameters(): 164 | value.requires_grad=False 165 | head=DiffusionHead(self.num_classes,self.width) 166 | self.model = 
DiffusionNet(backbone, head) 167 | 168 | self.model.apply(init_yolo) 169 | # self.model.head.initialize_biases(1e-2) 170 | return self.model 171 | 172 | def get_optimizer(self, batch_size): 173 | lr=2.5e-05 174 | weight_decay = 0.0001 175 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 176 | return self.optimizer 177 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_track_mot17_baseline.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "test.json" 20 | self.input_size = (800, 1440) 21 | self.test_size = (800, 1440) 22 | self.random_size = (18, 32) 23 | self.max_epoch = 30 24 | self.print_interval = 20 25 | self.eval_interval = 5 26 | self.no_aug_epochs = 10 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="tracking" 30 | self.seed=8823 31 | self.conf_thresh=0.4 32 | self.det_thresh=0.7 33 | self.nms_thresh2d=0.75 34 | self.nms_thresh3d=0.7 35 | self.interval=5 36 | 37 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 38 | from yolox.data import ( 39 | MOTDataset, 40 | TrainTransform, 41 | YoloBatchSampler, 42 | DataLoader, 43 | InfiniteSampler, 44 | MosaicDetection, 45 | DiffusionMosaicDetection, 46 | DiffusionTrainTransform 47 | ) 48 | 49 | dataset = MOTDataset( 50 | data_dir=os.path.join(get_yolox_datadir(), "mot"), 51 | json_file=self.train_ann, 52 | name='train', 53 | img_size=self.input_size, 54 | preproc=TrainTransform( 55 | rgb_means=(0.485, 0.456, 0.406), 56 | std=(0.229, 0.224, 0.225), 57 | max_labels=500, 58 | ), 59 | ) 60 | 61 | dataset = DiffusionMosaicDetection( 62 | dataset, 63 | mosaic=not no_aug, 64 | img_size=self.input_size, 65 | preproc=DiffusionTrainTransform( 66 | rgb_means=(0.485, 0.456, 0.406), 67 | std=(0.229, 0.224, 0.225), 68 | max_labels=1000, 69 | ), 70 | degrees=self.degrees, 71 | translate=self.translate, 72 | scale=self.scale, 73 | shear=self.shear, 74 | perspective=self.perspective, 75 | enable_mixup=self.enable_mixup, 76 | ) 77 | 78 | self.dataset = dataset 79 | 80 | if is_distributed: 81 | batch_size = batch_size // dist.get_world_size() 82 | 83 | sampler = InfiniteSampler( 84 | len(self.dataset), seed=self.seed if self.seed else 0 85 | ) 86 | 87 | batch_sampler = YoloBatchSampler( 88 | sampler=sampler, 89 | batch_size=batch_size, 90 | drop_last=False, 91 | input_dimension=self.input_size, 92 | mosaic=not no_aug, 93 | ) 94 | 95 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 96 | dataloader_kwargs["batch_sampler"] = batch_sampler 97 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 98 | 99 | return train_loader 100 | 101 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 102 | from yolox.data import MOTDataset,DiffusionValTransform 103 | 104 | valdataset = MOTDataset( 105 | data_dir=os.path.join(get_yolox_datadir(), "mot"), 106 | json_file=self.val_ann, 107 | 
img_size=self.test_size, 108 | name='test', 109 | preproc=DiffusionValTransform( 110 | rgb_means=(0.485, 0.456, 0.406), 111 | std=(0.229, 0.224, 0.225), 112 | max_labels=1000, 113 | ) 114 | ) 115 | 116 | if is_distributed: 117 | batch_size = batch_size // dist.get_world_size() 118 | sampler = torch.utils.data.distributed.DistributedSampler( 119 | valdataset, shuffle=False 120 | ) 121 | else: 122 | sampler = torch.utils.data.SequentialSampler(valdataset) 123 | 124 | dataloader_kwargs = { 125 | "num_workers": self.data_num_workers, 126 | "pin_memory": True, 127 | "sampler": sampler, 128 | } 129 | dataloader_kwargs["batch_size"] = batch_size 130 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 131 | 132 | return val_loader 133 | 134 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 135 | from yolox.evaluators import COCOEvaluator 136 | 137 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 138 | evaluator = COCOEvaluator( 139 | dataloader=val_loader, 140 | img_size=self.test_size, 141 | confthre=self.conf_thresh, 142 | nmsthre3d=self.nms_thresh3d, 143 | detthre=self.det_thresh, 144 | nmsthre2d=self.nms_thresh2d, 145 | num_classes=self.num_classes, 146 | testdev=testdev, 147 | ) 148 | return evaluator 149 | 150 | def get_model(self): 151 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 152 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 153 | 154 | def init_yolo(M): 155 | for m in M.modules(): 156 | if isinstance(m, nn.BatchNorm2d): 157 | m.eps = 1e-3 158 | m.momentum = 0.03 159 | 160 | if getattr(self, "model", None) is None: 161 | in_channels = [256, 512, 1024] 162 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 163 | for value in backbone.parameters(): 164 | value.requires_grad=False 165 | head=DiffusionHead(self.num_classes,self.width) 166 | self.model = DiffusionNet(backbone, head) 167 | 168 | self.model.apply(init_yolo) 169 | # self.model.head.initialize_biases(1e-2) 170 | return self.model 171 | 172 | def get_optimizer(self, batch_size): 173 | lr=2.5e-05 174 | weight_decay = 0.0001 175 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 176 | return self.optimizer 177 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_track_mot20.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "val_half.json" 20 | self.input_size = (896, 1600) 21 | self.test_size = (896, 1600) 22 | self.random_size = (20, 36) 23 | self.max_epoch = 30 24 | self.print_interval = 20 25 | self.eval_interval = 5 26 | self.no_aug_epochs = 10 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="tracking" 30 | self.seed=8823 31 | self.conf_thresh=0.4 32 | self.det_thresh=0.7 33 | self.nms_thresh2d=0.75 34 | self.nms_thresh3d=0.7 35 | self.interval=5 36 | 37 | def get_data_loader(self, batch_size, 
is_distributed, no_aug=False): 38 | from yolox.data import ( 39 | MOTDataset, 40 | TrainTransform, 41 | YoloBatchSampler, 42 | DataLoader, 43 | InfiniteSampler, 44 | MosaicDetection, 45 | DiffusionMosaicDetection, 46 | DiffusionTrainTransform 47 | ) 48 | 49 | dataset = MOTDataset( 50 | data_dir=os.path.join(get_yolox_datadir(), "MOT20"), 51 | json_file=self.train_ann, 52 | name='train', 53 | img_size=self.input_size, 54 | preproc=TrainTransform( 55 | rgb_means=(0.485, 0.456, 0.406), 56 | std=(0.229, 0.224, 0.225), 57 | max_labels=500, 58 | ), 59 | ) 60 | 61 | dataset = DiffusionMosaicDetection( 62 | dataset, 63 | mosaic=not no_aug, 64 | img_size=self.input_size, 65 | preproc=DiffusionTrainTransform( 66 | rgb_means=(0.485, 0.456, 0.406), 67 | std=(0.229, 0.224, 0.225), 68 | max_labels=1200, 69 | ), 70 | degrees=self.degrees, 71 | translate=self.translate, 72 | scale=self.scale, 73 | shear=self.shear, 74 | perspective=self.perspective, 75 | enable_mixup=self.enable_mixup, 76 | ) 77 | 78 | self.dataset = dataset 79 | 80 | if is_distributed: 81 | batch_size = batch_size // dist.get_world_size() 82 | 83 | sampler = InfiniteSampler( 84 | len(self.dataset), seed=self.seed if self.seed else 0 85 | ) 86 | 87 | batch_sampler = YoloBatchSampler( 88 | sampler=sampler, 89 | batch_size=batch_size, 90 | drop_last=False, 91 | input_dimension=self.input_size, 92 | mosaic=not no_aug, 93 | ) 94 | 95 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 96 | dataloader_kwargs["batch_sampler"] = batch_sampler 97 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 98 | 99 | return train_loader 100 | 101 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 102 | from yolox.data import MOTDataset,DiffusionValTransform 103 | 104 | valdataset = MOTDataset( 105 | data_dir=os.path.join(get_yolox_datadir(), "MOT20"), 106 | json_file=self.val_ann, 107 | img_size=self.test_size, 108 | name='train', 109 | preproc=DiffusionValTransform( 110 | rgb_means=(0.485, 0.456, 0.406), 111 | std=(0.229, 0.224, 0.225), 112 | max_labels=1200, 113 | ) 114 | ) 115 | 116 | if is_distributed: 117 | batch_size = batch_size // dist.get_world_size() 118 | sampler = torch.utils.data.distributed.DistributedSampler( 119 | valdataset, shuffle=False 120 | ) 121 | else: 122 | sampler = torch.utils.data.SequentialSampler(valdataset) 123 | 124 | dataloader_kwargs = { 125 | "num_workers": self.data_num_workers, 126 | "pin_memory": True, 127 | "sampler": sampler, 128 | } 129 | dataloader_kwargs["batch_size"] = batch_size 130 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 131 | 132 | return val_loader 133 | 134 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 135 | from yolox.evaluators import COCOEvaluator 136 | 137 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 138 | evaluator = COCOEvaluator( 139 | dataloader=val_loader, 140 | img_size=self.test_size, 141 | confthre=self.conf_thresh, 142 | nmsthre3d=self.nms_thresh3d, 143 | detthre=self.det_thresh, 144 | nmsthre2d=self.nms_thresh2d, 145 | num_classes=self.num_classes, 146 | testdev=testdev, 147 | ) 148 | return evaluator 149 | 150 | def get_model(self): 151 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 152 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 153 | 154 | def init_yolo(M): 155 | for m in M.modules(): 156 | if isinstance(m, nn.BatchNorm2d): 157 | m.eps = 1e-3 158 | m.momentum = 0.03 159 | 160 | if getattr(self, "model", 
None) is None: 161 | in_channels = [256, 512, 1024] 162 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 163 | for value in backbone.parameters(): 164 | value.requires_grad=False 165 | head=DiffusionHead(self.num_classes,self.width) 166 | self.model = DiffusionNet(backbone, head) 167 | 168 | self.model.apply(init_yolo) 169 | # self.model.head.initialize_biases(1e-2) 170 | return self.model 171 | 172 | def get_optimizer(self, batch_size): 173 | lr=2.5e-05 174 | weight_decay = 0.0001 175 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 176 | return self.optimizer 177 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_track_dancetrack.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "train.json" 20 | self.input_size = (896, 1600) 21 | self.test_size = (896, 1600) 22 | self.random_size = (18, 32) 23 | self.max_epoch = 20 24 | self.print_interval = 20 25 | self.eval_interval = 40 26 | self.no_aug_epochs = 5 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="tracking" 30 | self.seed=8823 31 | self.conf_thresh=0.4 32 | self.det_thresh=0.7 33 | self.nms_thresh2d=0.75 34 | self.nms_thresh3d=0.7 35 | self.interval=5 36 | 37 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 38 | from yolox.data import ( 39 | MOTDataset, 40 | TrainTransform, 41 | YoloBatchSampler, 42 | DataLoader, 43 | InfiniteSampler, 44 | MosaicDetection, 45 | DiffusionMosaicDetection, 46 | DiffusionTrainTransform 47 | ) 48 | 49 | dataset = MOTDataset( 50 | data_dir=os.path.join(get_yolox_datadir(), "dancetrack"), 51 | json_file=self.train_ann, 52 | name='train', 53 | img_size=self.input_size, 54 | preproc=TrainTransform( 55 | rgb_means=(0.485, 0.456, 0.406), 56 | std=(0.229, 0.224, 0.225), 57 | max_labels=500, 58 | ), 59 | ) 60 | 61 | dataset = DiffusionMosaicDetection( 62 | dataset, 63 | mosaic=not no_aug, 64 | img_size=self.input_size, 65 | preproc=DiffusionTrainTransform( 66 | rgb_means=(0.485, 0.456, 0.406), 67 | std=(0.229, 0.224, 0.225), 68 | max_labels=1000, 69 | ), 70 | degrees=self.degrees, 71 | translate=self.translate, 72 | scale=self.scale, 73 | shear=self.shear, 74 | perspective=self.perspective, 75 | enable_mixup=self.enable_mixup, 76 | ) 77 | 78 | self.dataset = dataset 79 | 80 | if is_distributed: 81 | batch_size = batch_size // dist.get_world_size() 82 | 83 | sampler = InfiniteSampler( 84 | len(self.dataset), seed=self.seed if self.seed else 0 85 | ) 86 | 87 | batch_sampler = YoloBatchSampler( 88 | sampler=sampler, 89 | batch_size=batch_size, 90 | drop_last=False, 91 | input_dimension=self.input_size, 92 | mosaic=not no_aug, 93 | ) 94 | 95 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 96 | dataloader_kwargs["batch_sampler"] = batch_sampler 97 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 98 | 99 | return train_loader 
100 | 101 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 102 | from yolox.data import MOTDataset,DiffusionValTransform 103 | 104 | valdataset = MOTDataset( 105 | data_dir=os.path.join(get_yolox_datadir(), "dancetrack"), 106 | json_file=self.val_ann, 107 | img_size=self.test_size, 108 | name='train', 109 | preproc=DiffusionValTransform( 110 | rgb_means=(0.485, 0.456, 0.406), 111 | std=(0.229, 0.224, 0.225), 112 | max_labels=1000, 113 | ) 114 | ) 115 | 116 | if is_distributed: 117 | batch_size = batch_size // dist.get_world_size() 118 | sampler = torch.utils.data.distributed.DistributedSampler( 119 | valdataset, shuffle=False 120 | ) 121 | else: 122 | sampler = torch.utils.data.SequentialSampler(valdataset) 123 | 124 | dataloader_kwargs = { 125 | "num_workers": self.data_num_workers, 126 | "pin_memory": True, 127 | "sampler": sampler, 128 | } 129 | dataloader_kwargs["batch_size"] = batch_size 130 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 131 | 132 | return val_loader 133 | 134 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 135 | from yolox.evaluators import COCOEvaluator 136 | 137 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 138 | evaluator = COCOEvaluator( 139 | dataloader=val_loader, 140 | img_size=self.test_size, 141 | confthre=self.conf_thresh, 142 | nmsthre3d=self.nms_thresh3d, 143 | detthre=self.det_thresh, 144 | nmsthre2d=self.nms_thresh2d, 145 | num_classes=self.num_classes, 146 | testdev=testdev, 147 | ) 148 | return evaluator 149 | 150 | def get_model(self): 151 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 152 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 153 | 154 | def init_yolo(M): 155 | for m in M.modules(): 156 | if isinstance(m, nn.BatchNorm2d): 157 | m.eps = 1e-3 158 | m.momentum = 0.03 159 | 160 | if getattr(self, "model", None) is None: 161 | in_channels = [256, 512, 1024] 162 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 163 | for value in backbone.parameters(): 164 | value.requires_grad=False 165 | head=DiffusionHead(self.num_classes,self.width) 166 | self.model = DiffusionNet(backbone, head) 167 | 168 | self.model.apply(init_yolo) 169 | # self.model.head.initialize_biases(1e-2) 170 | return self.model 171 | 172 | def get_optimizer(self, batch_size): 173 | lr=2.5e-05 174 | weight_decay = 0.0001 175 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 176 | return self.optimizer 177 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_track_mot20_baseline.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "val_half.json" 20 | self.input_size = (896, 1600) 21 | self.test_size = (896, 1600) 22 | self.random_size = (20, 36) 23 | self.max_epoch = 30 24 | self.print_interval = 20 25 | self.eval_interval = 5 26 | self.no_aug_epochs = 10 27 | 
self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="tracking" 30 | self.seed=8823 31 | self.conf_thresh=0.4 32 | self.det_thresh=0.7 33 | self.nms_thresh2d=0.75 34 | self.nms_thresh3d=0.7 35 | self.interval=5 36 | 37 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 38 | from yolox.data import ( 39 | MOTDataset, 40 | TrainTransform, 41 | YoloBatchSampler, 42 | DataLoader, 43 | InfiniteSampler, 44 | MosaicDetection, 45 | DiffusionMosaicDetection, 46 | DiffusionTrainTransform 47 | ) 48 | 49 | dataset = MOTDataset( 50 | data_dir=os.path.join(get_yolox_datadir(), "MOT20"), 51 | json_file=self.train_ann, 52 | name='train', 53 | img_size=self.input_size, 54 | preproc=TrainTransform( 55 | rgb_means=(0.485, 0.456, 0.406), 56 | std=(0.229, 0.224, 0.225), 57 | max_labels=500, 58 | ), 59 | ) 60 | 61 | dataset = DiffusionMosaicDetection( 62 | dataset, 63 | mosaic=not no_aug, 64 | img_size=self.input_size, 65 | preproc=DiffusionTrainTransform( 66 | rgb_means=(0.485, 0.456, 0.406), 67 | std=(0.229, 0.224, 0.225), 68 | max_labels=1200, 69 | ), 70 | degrees=self.degrees, 71 | translate=self.translate, 72 | scale=self.scale, 73 | shear=self.shear, 74 | perspective=self.perspective, 75 | enable_mixup=self.enable_mixup, 76 | ) 77 | 78 | self.dataset = dataset 79 | 80 | if is_distributed: 81 | batch_size = batch_size // dist.get_world_size() 82 | 83 | sampler = InfiniteSampler( 84 | len(self.dataset), seed=self.seed if self.seed else 0 85 | ) 86 | 87 | batch_sampler = YoloBatchSampler( 88 | sampler=sampler, 89 | batch_size=batch_size, 90 | drop_last=False, 91 | input_dimension=self.input_size, 92 | mosaic=not no_aug, 93 | ) 94 | 95 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 96 | dataloader_kwargs["batch_sampler"] = batch_sampler 97 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 98 | 99 | return train_loader 100 | 101 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 102 | from yolox.data import MOTDataset,DiffusionValTransform 103 | 104 | valdataset = MOTDataset( 105 | data_dir=os.path.join(get_yolox_datadir(), "MOT20"), 106 | json_file=self.val_ann, 107 | img_size=self.test_size, 108 | name='train', 109 | preproc=DiffusionValTransform( 110 | rgb_means=(0.485, 0.456, 0.406), 111 | std=(0.229, 0.224, 0.225), 112 | max_labels=1200, 113 | ) 114 | ) 115 | 116 | if is_distributed: 117 | batch_size = batch_size // dist.get_world_size() 118 | sampler = torch.utils.data.distributed.DistributedSampler( 119 | valdataset, shuffle=False 120 | ) 121 | else: 122 | sampler = torch.utils.data.SequentialSampler(valdataset) 123 | 124 | dataloader_kwargs = { 125 | "num_workers": self.data_num_workers, 126 | "pin_memory": True, 127 | "sampler": sampler, 128 | } 129 | dataloader_kwargs["batch_size"] = batch_size 130 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 131 | 132 | return val_loader 133 | 134 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 135 | from yolox.evaluators import COCOEvaluator 136 | 137 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 138 | evaluator = COCOEvaluator( 139 | dataloader=val_loader, 140 | img_size=self.test_size, 141 | confthre=self.conf_thresh, 142 | nmsthre3d=self.nms_thresh3d, 143 | detthre=self.det_thresh, 144 | nmsthre2d=self.nms_thresh2d, 145 | num_classes=self.num_classes, 146 | testdev=testdev, 147 | ) 148 | return evaluator 149 | 150 | def get_model(self): 151 | from yolox.models 
import YOLOPAFPN, YOLOX, YOLOXHead 152 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 153 | 154 | def init_yolo(M): 155 | for m in M.modules(): 156 | if isinstance(m, nn.BatchNorm2d): 157 | m.eps = 1e-3 158 | m.momentum = 0.03 159 | 160 | if getattr(self, "model", None) is None: 161 | in_channels = [256, 512, 1024] 162 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 163 | for value in backbone.parameters(): 164 | value.requires_grad=False 165 | head=DiffusionHead(self.num_classes,self.width) 166 | self.model = DiffusionNet(backbone, head) 167 | 168 | self.model.apply(init_yolo) 169 | # self.model.head.initialize_biases(1e-2) 170 | return self.model 171 | 172 | def get_optimizer(self, batch_size): 173 | lr=2.5e-05 174 | weight_decay = 0.0001 175 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 176 | return self.optimizer 177 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_track_dancetrack_baseline.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train.json" 19 | self.val_ann = "test.json" 20 | self.input_size = (896, 1600) 21 | self.test_size = (896, 1600) 22 | self.random_size = (18, 32) 23 | self.max_epoch = 20 24 | self.print_interval = 20 25 | self.eval_interval = 40 26 | self.no_aug_epochs = 5 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="tracking" 30 | self.seed=8823 31 | self.conf_thresh=0.4 32 | self.det_thresh=0.7 33 | self.nms_thresh2d=0.75 34 | self.nms_thresh3d=0.7 35 | self.interval=5 36 | 37 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 38 | from yolox.data import ( 39 | MOTDataset, 40 | TrainTransform, 41 | YoloBatchSampler, 42 | DataLoader, 43 | InfiniteSampler, 44 | MosaicDetection, 45 | DiffusionMosaicDetection, 46 | DiffusionTrainTransform 47 | ) 48 | 49 | dataset = MOTDataset( 50 | data_dir=os.path.join(get_yolox_datadir(), "dancetrack"), 51 | json_file=self.train_ann, 52 | name='train', 53 | img_size=self.input_size, 54 | preproc=TrainTransform( 55 | rgb_means=(0.485, 0.456, 0.406), 56 | std=(0.229, 0.224, 0.225), 57 | max_labels=500, 58 | ), 59 | ) 60 | 61 | dataset = DiffusionMosaicDetection( 62 | dataset, 63 | mosaic=not no_aug, 64 | img_size=self.input_size, 65 | preproc=DiffusionTrainTransform( 66 | rgb_means=(0.485, 0.456, 0.406), 67 | std=(0.229, 0.224, 0.225), 68 | max_labels=1000, 69 | ), 70 | degrees=self.degrees, 71 | translate=self.translate, 72 | scale=self.scale, 73 | shear=self.shear, 74 | perspective=self.perspective, 75 | enable_mixup=self.enable_mixup, 76 | ) 77 | 78 | self.dataset = dataset 79 | 80 | if is_distributed: 81 | batch_size = batch_size // dist.get_world_size() 82 | 83 | sampler = InfiniteSampler( 84 | len(self.dataset), seed=self.seed if self.seed else 0 85 | ) 86 | 87 | batch_sampler = YoloBatchSampler( 88 | sampler=sampler, 89 | batch_size=batch_size, 90 | drop_last=False, 91 | 
input_dimension=self.input_size, 92 | mosaic=not no_aug, 93 | ) 94 | 95 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 96 | dataloader_kwargs["batch_sampler"] = batch_sampler 97 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 98 | 99 | return train_loader 100 | 101 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 102 | from yolox.data import MOTDataset,DiffusionValTransform 103 | 104 | valdataset = MOTDataset( 105 | data_dir=os.path.join(get_yolox_datadir(), "dancetrack"), 106 | json_file=self.val_ann, 107 | img_size=self.test_size, 108 | name='test', 109 | preproc=DiffusionValTransform( 110 | rgb_means=(0.485, 0.456, 0.406), 111 | std=(0.229, 0.224, 0.225), 112 | max_labels=1000, 113 | ) 114 | ) 115 | 116 | if is_distributed: 117 | batch_size = batch_size // dist.get_world_size() 118 | sampler = torch.utils.data.distributed.DistributedSampler( 119 | valdataset, shuffle=False 120 | ) 121 | else: 122 | sampler = torch.utils.data.SequentialSampler(valdataset) 123 | 124 | dataloader_kwargs = { 125 | "num_workers": self.data_num_workers, 126 | "pin_memory": True, 127 | "sampler": sampler, 128 | } 129 | dataloader_kwargs["batch_size"] = batch_size 130 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 131 | 132 | return val_loader 133 | 134 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 135 | from yolox.evaluators import COCOEvaluator 136 | 137 | val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) 138 | evaluator = COCOEvaluator( 139 | dataloader=val_loader, 140 | img_size=self.test_size, 141 | confthre=self.conf_thresh, 142 | nmsthre3d=self.nms_thresh3d, 143 | detthre=self.det_thresh, 144 | nmsthre2d=self.nms_thresh2d, 145 | num_classes=self.num_classes, 146 | testdev=testdev, 147 | ) 148 | return evaluator 149 | 150 | def get_model(self): 151 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 152 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 153 | 154 | def init_yolo(M): 155 | for m in M.modules(): 156 | if isinstance(m, nn.BatchNorm2d): 157 | m.eps = 1e-3 158 | m.momentum = 0.03 159 | 160 | if getattr(self, "model", None) is None: 161 | in_channels = [256, 512, 1024] 162 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 163 | for value in backbone.parameters(): 164 | value.requires_grad=False 165 | head=DiffusionHead(self.num_classes,self.width) 166 | self.model = DiffusionNet(backbone, head) 167 | 168 | self.model.apply(init_yolo) 169 | # self.model.head.initialize_biases(1e-2) 170 | return self.model 171 | 172 | def get_optimizer(self, batch_size): 173 | lr=2.5e-05 174 | weight_decay = 0.0001 175 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 176 | return self.optimizer 177 | -------------------------------------------------------------------------------- /exps/example/mot/yolox_x_diffusion_track_mot17_ablation.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.distributed as dist 7 | from torch.optim import AdamW 8 | from yolox.exp import Exp as MyExp 9 | from yolox.data import get_yolox_datadir 10 | 11 | class Exp(MyExp): 12 | def __init__(self): 13 | super(Exp, self).__init__() 14 | self.num_classes = 1 15 | self.depth = 1.33 16 | self.width = 1.25 17 | self.exp_name = 
os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | self.train_ann = "train_half.json" 19 | self.val_ann = "val_half.json" 20 | self.input_size = (800, 1440) 21 | self.test_size = (800, 1440) 22 | self.random_size = (18, 32) 23 | self.max_epoch = 30 24 | self.print_interval = 20 25 | self.eval_interval = 5 26 | self.no_aug_epochs = 10 27 | self.basic_lr_per_img = 0.001 / 64.0 28 | self.warmup_epochs = 1 29 | self.task="tracking" 30 | self.enable_mixup = True 31 | self.seed=8823 32 | self.conf_thresh=0.25 33 | self.det_thresh=0.7 34 | self.nms_thresh2d=0.75 35 | self.nms_thresh3d=0.7 36 | self.interval=5 37 | 38 | def get_data_loader(self, batch_size, is_distributed, no_aug=False): 39 | from yolox.data import ( 40 | MOTDataset, 41 | TrainTransform, 42 | YoloBatchSampler, 43 | DataLoader, 44 | InfiniteSampler, 45 | MosaicDetection, 46 | DiffusionMosaicDetection, 47 | DiffusionTrainTransform 48 | ) 49 | 50 | dataset = MOTDataset( 51 | data_dir=os.path.join(get_yolox_datadir(), "mot"), 52 | json_file=self.train_ann, 53 | name='train', 54 | img_size=self.input_size, 55 | preproc=TrainTransform( 56 | rgb_means=(0.485, 0.456, 0.406), 57 | std=(0.229, 0.224, 0.225), 58 | max_labels=500, 59 | ), 60 | ) 61 | 62 | dataset = DiffusionMosaicDetection( 63 | dataset, 64 | mosaic=not no_aug, 65 | img_size=self.input_size, 66 | preproc=DiffusionTrainTransform( 67 | rgb_means=(0.485, 0.456, 0.406), 68 | std=(0.229, 0.224, 0.225), 69 | max_labels=1000, 70 | ), 71 | degrees=self.degrees, 72 | translate=self.translate, 73 | scale=self.scale, 74 | shear=self.shear, 75 | perspective=self.perspective, 76 | enable_mixup=self.enable_mixup, 77 | ) 78 | 79 | self.dataset = dataset 80 | 81 | if is_distributed: 82 | batch_size = batch_size // dist.get_world_size() 83 | 84 | sampler = InfiniteSampler( 85 | len(self.dataset), seed=self.seed if self.seed else 0 86 | ) 87 | 88 | batch_sampler = YoloBatchSampler( 89 | sampler=sampler, 90 | batch_size=batch_size, 91 | drop_last=False, 92 | input_dimension=self.input_size, 93 | mosaic=not no_aug, 94 | ) 95 | 96 | dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} 97 | dataloader_kwargs["batch_sampler"] = batch_sampler 98 | train_loader = DataLoader(self.dataset, **dataloader_kwargs) 99 | 100 | return train_loader 101 | 102 | def get_eval_loader(self, batch_size, is_distributed, testdev=False): 103 | from yolox.data import MOTDataset,DiffusionValTransform 104 | 105 | valdataset = MOTDataset( 106 | data_dir=os.path.join(get_yolox_datadir(), "mot"), 107 | json_file=self.val_ann, 108 | img_size=self.test_size, 109 | name='train', 110 | preproc=DiffusionValTransform( 111 | rgb_means=(0.485, 0.456, 0.406), 112 | std=(0.229, 0.224, 0.225), 113 | max_labels=1000, 114 | ) 115 | ) 116 | 117 | if is_distributed: 118 | batch_size = batch_size // dist.get_world_size() 119 | sampler = torch.utils.data.distributed.DistributedSampler( 120 | valdataset, shuffle=False 121 | ) 122 | else: 123 | sampler = torch.utils.data.SequentialSampler(valdataset) 124 | 125 | dataloader_kwargs = { 126 | "num_workers": self.data_num_workers, 127 | "pin_memory": True, 128 | "sampler": sampler, 129 | } 130 | dataloader_kwargs["batch_size"] = batch_size 131 | val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) 132 | 133 | return val_loader 134 | 135 | def get_evaluator(self, batch_size, is_distributed, testdev=False): 136 | from yolox.evaluators import COCOEvaluator 137 | 138 | val_loader = self.get_eval_loader(batch_size, is_distributed, 
testdev=testdev) 139 | evaluator = COCOEvaluator( 140 | dataloader=val_loader, 141 | img_size=self.test_size, 142 | confthre=self.conf_thresh, 143 | nmsthre3d=self.nms_thresh3d, 144 | detthre=self.det_thresh, 145 | nmsthre2d=self.nms_thresh2d, 146 | num_classes=self.num_classes, 147 | testdev=testdev, 148 | ) 149 | return evaluator 150 | 151 | def get_model(self): 152 | from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead 153 | from diffusion.models.diffusionnet import DiffusionNet,DiffusionHead 154 | 155 | def init_yolo(M): 156 | for m in M.modules(): 157 | if isinstance(m, nn.BatchNorm2d): 158 | m.eps = 1e-3 159 | m.momentum = 0.03 160 | 161 | if getattr(self, "model", None) is None: 162 | in_channels = [256, 512, 1024] 163 | backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) 164 | for value in backbone.parameters(): 165 | value.requires_grad=False 166 | head=DiffusionHead(self.num_classes,self.width) 167 | self.model = DiffusionNet(backbone, head) 168 | 169 | self.model.apply(init_yolo) 170 | # self.model.head.initialize_biases(1e-2) 171 | return self.model 172 | 173 | def get_optimizer(self, batch_size): 174 | lr=2.5e-05 175 | weight_decay = 0.0001 176 | self.optimizer=AdamW(self.model.parameters(),lr=lr,weight_decay=weight_decay) 177 | return self.optimizer 178 | --------------------------------------------------------------------------------
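
All of the exps/example/mot experiment files above share one template: an Exp subclass that overrides get_data_loader, get_eval_loader, get_evaluator, get_model, and get_optimizer, differing only in dataset directory ("mot", "MOT20", "dancetrack"), annotation files, input/test size, per-image label budget, and the tracking thresholds. The short sketch below illustrates how such a config is typically consumed; it is a minimal, assumption-based example (the get_exp(exp_file=...) call follows the upstream YOLOX convention and the chosen exp path and batch size are placeholders, neither verified against this repository), and the actual training entry point is tools/train.py.

    # Sketch only: load one experiment config and build its components.
    # Assumptions: get_exp(exp_file=...) behaves as in upstream YOLOX's
    # yolox/exp/build.py, and the "mot" dataset folder already exists under
    # get_yolox_datadir().
    from yolox.exp import get_exp

    exp = get_exp(exp_file="exps/example/mot/yolox_x_diffusion_track_mot17_baseline.py")

    model = exp.get_model()           # frozen YOLOPAFPN backbone + DiffusionHead
    optimizer = exp.get_optimizer(4)  # AdamW, lr=2.5e-5, weight_decay=1e-4
    train_loader = exp.get_data_loader(batch_size=4, is_distributed=False, no_aug=False)
    evaluator = exp.get_evaluator(batch_size=4, is_distributed=False)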