├── Dockerfile ├── configs ├── config_quick.json └── config_refined.json ├── LICENSE ├── download.py ├── external ├── nms.py └── sort.py ├── train.py ├── utils.py ├── mask.py ├── .gitignore ├── render.py ├── test.py ├── detrk.py └── README.md /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime 2 | 3 | RUN apt-get update && apt-get install -y build-essential git ffmpeg libsm6 libxext6 fonts-freefont-ttf 4 | RUN pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html 5 | RUN pip install jupyter opencv-python scikit-image filterpy 6 | RUN cd /workspace && git clone https://github.com/facebookresearch/detectron2.git 7 | -------------------------------------------------------------------------------- /configs/config_quick.json: -------------------------------------------------------------------------------- 1 | { 2 | "config": "COCO-Detection/retinanet_R_101_FPN_3x.yaml", 3 | "dataloader_num_workers": 4, 4 | "batch_size_per_image": 8, 5 | "anchor_generator_sizes": [[8, 16, 32, 64, 128]], 6 | "ims_per_batch": 1, 7 | "base_lr": 1e-4, 8 | "max_iter": 5000, 9 | "weights": "output/model_final.pth", 10 | "num_classes": 1, 11 | "score_threshold": 0.45, 12 | "nms_threshold": 0.25, 13 | "detections_per_image": 2000 14 | } 15 | -------------------------------------------------------------------------------- /configs/config_refined.json: -------------------------------------------------------------------------------- 1 | { 2 | "config": "COCO-Detection/retinanet_R_101_FPN_3x.yaml", 3 | "dataloader_num_workers": 4, 4 | "batch_size_per_image": 128, 5 | "anchor_generator_sizes": [[8, 16, 32, 64, 128, 256]], 6 | "ims_per_batch": 2, 7 | "base_lr": 1e-4, 8 | "max_iter": 100000, 9 | "weights": "output/model_final.pth", 10 | "num_classes": 1, 11 | "score_threshold": 0.45, 12 | "nms_threshold": 0.25, 13 | "detections_per_image": 2000 14 | } 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 b3d-project 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | THIS SOFTWARE AND/OR DATA WAS DEPOSITED IN THE BAIR OPEN RESEARCH COMMONS 24 | REPOSITORY ON OCTOBER 6th, 2022. 
25 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gdown 3 | import subprocess 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser(description='Dataset download script') 8 | parser.add_argument( 9 | '--skip_videos', action='store_true', 10 | help='Skip downloading videos') 11 | parser.add_argument( 12 | '--skip_images', action='store_true', 13 | help='Skip downloading annotated images') 14 | parser.add_argument( 15 | '--pull_model', action='store_true', 16 | help='Download the model trained with config_refined.json') 17 | return parser.parse_args() 18 | 19 | 20 | def main(args): 21 | if not args.skip_videos: 22 | print('Downloading videos...') 23 | cmd = 'mkdir -p videos' 24 | subprocess.run(cmd.split(' ')) 25 | gdown.download_folder(id='1UcVuWcqHdxq4D5O8M02o4zZKSvDRtEd6') 26 | if not args.skip_images: 27 | print('Downloading annotated images...') 28 | gdown.download(id='1v2Go30iTtbNDnOcmoSPueF4Mp93P5Lbg') 29 | cmd = 'unzip vision.zip' 30 | subprocess.run(cmd.split(' ')) 31 | cmd = 'rm vision.zip' 32 | subprocess.run(cmd.split(' ')) 33 | if args.pull_model: 34 | print('Downloading model...') 35 | gdown.download(id='17ZiwW_11q5oLldTCXuXjCpd8FQ7MjKaD') 36 | cmd = 'mkdir -p output' 37 | subprocess.run(cmd.split(' ')) 38 | cmd = 'mv model_final.pth output/' 39 | subprocess.run(cmd.split(' ')) 40 | 41 | 42 | if __name__ == '__main__': 43 | main(parse_args()) 44 | -------------------------------------------------------------------------------- /external/nms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def nms(bounding_boxes, confidence_scores, threshold): 5 | # https://github.com/amusi/Non-Maximum-Suppression/blob/master/nms.py 6 | # If no bounding boxes, return empty list 7 | if len(bounding_boxes) == 0: 8 | return [], [] 9 | 10 | # Bounding boxes 11 | boxes = np.array(bounding_boxes) 12 | 13 | # coordinates of bounding boxes 14 | start_x = boxes[:, 0] 15 | start_y = boxes[:, 1] 16 | end_x = boxes[:, 2] 17 | end_y = boxes[:, 3] 18 | 19 | # Confidence scores of bounding boxes 20 | score = np.array(confidence_scores) 21 | 22 | # Picked bounding boxes 23 | picked_boxes = [] 24 | picked_scores = [] 25 | 26 | # Compute areas of bounding boxes 27 | areas = (end_x - start_x + 1) * (end_y - start_y + 1) 28 | 29 | # Sort by confidence score of bounding boxes 30 | order = np.argsort(score) 31 | 32 | # Iterate bounding boxes 33 | while order.size > 0: 34 | # The index of largest confidence score 35 | index = order[-1] 36 | 37 | # Pick the bounding box with largest confidence score 38 | picked_boxes.append(bounding_boxes[index]) 39 | picked_scores.append(confidence_scores[index]) 40 | 41 | # Compute ordinates of intersection-over-union(IOU) 42 | x1 = np.maximum(start_x[index], start_x[order[:-1]]) 43 | x2 = np.minimum(end_x[index], end_x[order[:-1]]) 44 | y1 = np.maximum(start_y[index], start_y[order[:-1]]) 45 | y2 = np.minimum(end_y[index], end_y[order[:-1]]) 46 | 47 | # Compute areas of intersection-over-union 48 | w = np.maximum(0.0, x2 - x1 + 1) 49 | h = np.maximum(0.0, y2 - y1 + 1) 50 | intersection = w * h 51 | 52 | # Compute the ratio between intersection and union 53 | ratio = intersection / (areas[index] + areas[order[:-1]] - intersection) 54 | 55 | left = np.where(ratio < threshold) 56 | order = order[left] 57 | 58 | return picked_boxes, 
picked_scores -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from detectron2 import model_zoo 3 | from detectron2.config import get_cfg 4 | from detectron2.data import MetadataCatalog 5 | from detectron2.data.datasets import register_coco_instances 6 | from detectron2.engine import DefaultTrainer 7 | from detectron2.utils.logger import setup_logger 8 | import json 9 | import os 10 | import torch 11 | setup_logger() 12 | print( 13 | 'Torch version:', torch.__version__, 14 | 'CUDA availability:', torch.cuda.is_available()) 15 | 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser(description='Example train script') 19 | parser.add_argument('-c', '--config', required=True, 20 | help='Detection model configuration') 21 | return parser.parse_args() 22 | 23 | 24 | def main(args): 25 | dataset_name = 'b3d_train' 26 | annotation_path = 'vision/annotations/train.json' 27 | image_path = 'vision/images/train' 28 | register_coco_instances(dataset_name, {}, annotation_path, image_path) 29 | MetadataCatalog.get(dataset_name).thing_classes = ['vehicle'] 30 | 31 | with open(args.config) as fp: 32 | config = json.load(fp) 33 | cfg = get_cfg() 34 | cfg.merge_from_file(model_zoo.get_config_file(config['config'])) 35 | cfg.DATASETS.TRAIN = ('b3d_train',) 36 | cfg.DATASETS.TEST = () 37 | cfg.DATALOADER.NUM_WORKERS = config['dataloader_num_workers'] 38 | cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(config['config']) 39 | cfg.MODEL.ROI_HEADS.NUM_CLASSES = config['num_classes'] 40 | cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = config['batch_size_per_image'] 41 | cfg.MODEL.ANCHOR_GENERATOR.SIZES = config['anchor_generator_sizes'] 42 | cfg.SOLVER.IMS_PER_BATCH = config['ims_per_batch'] 43 | cfg.SOLVER.BASE_LR = config['base_lr'] 44 | cfg.SOLVER.MAX_ITER = config['max_iter'] 45 | 46 | os.makedirs(cfg.OUTPUT_DIR, exist_ok=True) 47 | trainer = DefaultTrainer(cfg) 48 | trainer.resume_or_load(resume=False) 49 | trainer.train() 50 | 51 | 52 | if __name__ == '__main__': 53 | main(parse_args()) 54 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from matplotlib.path import Path 2 | import numpy as np 3 | 4 | 5 | def mask_frame(frame, mask): 6 | domain = mask.find('.//polygon[@label="domain"]').attrib['points'] 7 | domain = domain.replace(';', ',') 8 | domain = np.array([ 9 | float(pt) for pt in domain.split(',')]).reshape((-1, 2)) 10 | tl = (int(np.min(domain[:, 1])), int(np.min(domain[:, 0]))) 11 | br = (int(np.max(domain[:, 1])), int(np.max(domain[:, 0]))) 12 | domain_poly = Path(domain) 13 | width, height = int(frame.shape[1]), int(frame.shape[0]) 14 | x, y = np.meshgrid(np.arange(width), np.arange(height)) 15 | x, y = x.flatten(), y.flatten() 16 | pixel_points = np.vstack((x, y)).T 17 | bitmap = domain_poly.contains_points(pixel_points) 18 | bitmap = bitmap.reshape((height, width)) 19 | frame[bitmap == 0] = 0 20 | frame_masked = frame[tl[0]:br[0], tl[1]:br[1], :] 21 | return frame_masked 22 | 23 | 24 | def parse_outputs(outputs, offset): 25 | instances = outputs['instances'].to('cpu') 26 | bboxes = [] 27 | scores = [] 28 | classes = [] 29 | for bbox, score, pred_class in zip( 30 | instances.pred_boxes, instances.scores, instances.pred_classes): 31 | bbox[0] += offset[0] 32 | bbox[1] += offset[1] 33 | bbox[2] += offset[0] 34 | 
bbox[3] += offset[1] 35 | bboxes.append(bbox.numpy()) 36 | scores.append(score.numpy()) 37 | classes.append(pred_class.numpy()) 38 | return bboxes, scores, classes 39 | 40 | 41 | def regionize_image(image): 42 | height, width, _ = image.shape 43 | split_width = width 44 | while(split_width / height > 4): 45 | split_width = int(split_width / 2) 46 | batch = [] 47 | covered_width = 0 48 | while(covered_width < width): 49 | stop_width = min(covered_width + split_width, width) 50 | if (stop_width - covered_width < 0.75 * split_width): 51 | break 52 | batch.append( 53 | [image[:, covered_width:stop_width, :], (covered_width, 0)]) 54 | covered_width = min(covered_width + int(split_width / 2), width) 55 | return batch 56 | -------------------------------------------------------------------------------- /mask.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import matplotlib.patches as patches 4 | from matplotlib.path import Path 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import os 8 | from xml.etree import ElementTree 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(description='Example masking script') 13 | parser.add_argument('-i', '--image', required=True, 14 | help='Sample image') 15 | parser.add_argument('-m', '--mask', required=True, 16 | help='Specification of the mask') 17 | return parser.parse_args() 18 | 19 | 20 | def visualize_masking(image, domain_poly): 21 | image = image[:, :, ::-1] 22 | fig = plt.figure(dpi=300, frameon=False) 23 | ax = fig.add_subplot(1, 1, 1) 24 | ax.set_axis_off() 25 | ax.imshow(image) 26 | patch = patches.PathPatch( 27 | domain_poly, facecolor='r', alpha=0.5, edgecolor='none') 28 | ax.add_patch(patch) 29 | plt.savefig('output/mask_overlay.png', bbox_inches='tight', pad_inches=0) 30 | 31 | 32 | def main(args): 33 | tree = ElementTree.parse(args.mask) 34 | root = tree.getroot() 35 | domain = root.find('.//polygon[@label="domain"]').attrib['points'] 36 | domain = domain.replace(';', ',') 37 | domain = np.array([ 38 | float(pt) for pt in domain.split(',')]).reshape((-1, 2)) 39 | tl = (int(np.min(domain[:, 1])), int(np.min(domain[:, 0]))) 40 | br = (int(np.max(domain[:, 1])), int(np.max(domain[:, 0]))) 41 | domain_poly = Path(domain) 42 | 43 | image = cv2.imread(args.image) 44 | 45 | visualize_masking(image, domain_poly) 46 | 47 | width, height = int(image.shape[1]), int(image.shape[0]) 48 | x, y = np.meshgrid(np.arange(width), np.arange(height)) 49 | x, y = x.flatten(), y.flatten() 50 | pixel_points = np.vstack((x, y)).T 51 | bitmap = domain_poly.contains_points(pixel_points) 52 | bitmap = bitmap.reshape((height, width)) 53 | image[bitmap == 0] = 0 54 | image_masked = image[tl[0]:br[0], tl[1]:br[1], :] 55 | os.makedirs('output', exist_ok=True) 56 | cv2.imwrite('output/masked_image.png', image_masked) 57 | 58 | 59 | if __name__ == '__main__': 60 | main(parse_args()) 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | 
*.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Project specific folders 132 | .idea/ 133 | .vscode/ 134 | videos/ 135 | vision.zip 136 | vision/ 137 | output/ 138 | shared/ 139 | .DS_Store 140 | -------------------------------------------------------------------------------- /render.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import json 4 | from utils import mask_frame 5 | from xml.etree import ElementTree 6 | import os 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='Example rendering script') 11 | parser.add_argument('-v', '--video', required=True, 12 | help='Input video') 13 | parser.add_argument('-d', '--data', required=True, 14 | help='Data for rendering detection and tracking results') 15 | parser.add_argument('-m', '--mask', required=True, 16 | help='Mask for the video') 17 | return parser.parse_args() 18 | 19 | 20 | def main(args): 21 | tree = ElementTree.parse(args.mask) 22 | mask = tree.getroot() 23 | cap = cv2.VideoCapture(os.path.expanduser(args.video)) 24 | with open(args.data) as fp: 25 | rendering = json.load(fp) 26 | frame_index = 0 27 | frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 28 | out = None 29 | while cap.isOpened(): 30 | print('Parsing frame {:d} / {:d}...'.format(frame_index, frame_count)) 31 | success, frame = cap.read() 32 | if not success: 33 | break 34 | masked_frame = mask_frame(frame, mask) 35 | tracked_objects = rendering['{:d}'.format(frame_index)] 36 | for tracked_object in
tracked_objects: 37 | object_index = int(tracked_object[0]) 38 | tl = (int(tracked_object[1]), int(tracked_object[2])) 39 | br = (int(tracked_object[3]), int(tracked_object[4])) 40 | cv2.rectangle(masked_frame, tl, br, (255, 0, 0), 2) 41 | cv2.putText( 42 | masked_frame, '{:d}'.format(object_index), (br[0]+10, br[1]), 43 | cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2) 44 | 45 | display_width = int(masked_frame.shape[1] * 0.5) 46 | display_height = int(masked_frame.shape[0] * 0.5) 47 | resized_frame = cv2.resize( 48 | masked_frame, (display_width, display_height)) 49 | if out is None: 50 | scenario = args.video.replace('videos/', '').replace('.mp4', '') 51 | out = cv2.VideoWriter( 52 | 'output/{}.mp4'.format(scenario), 53 | cv2.VideoWriter_fourcc('m', 'p', '4', 'v'), 30, 54 | (display_width,display_height)) 55 | out.write(resized_frame) 56 | 57 | # cv2.imshow('Frame', resized_frame) 58 | frame_index = frame_index + 1 59 | if cv2.waitKey(1) & 0xFF == ord('q'): 60 | break 61 | cap.release() 62 | out.release() 63 | cv2.destroyAllWindows() 64 | 65 | 66 | if __name__ == '__main__': 67 | main(parse_args()) 68 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | from detectron2 import model_zoo 4 | from detectron2.config import get_cfg 5 | from detectron2.data.datasets import register_coco_instances 6 | from detectron2.data import MetadataCatalog 7 | from detectron2.engine import DefaultPredictor 8 | from external.nms import nms 9 | import json 10 | import matplotlib.pyplot as plt 11 | import matplotlib.patches as patches 12 | import numpy as np 13 | import os 14 | from utils import parse_outputs, regionize_image 15 | 16 | plt.rcParams['font.family'] = 'sans-serif' 17 | plt.rcParams['font.sans-serif'] = \ 18 | ['FreeSans'] + plt.rcParams['font.sans-serif'] 19 | 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser(description='Example test script') 23 | parser.add_argument('-i', '--image', required=True, 24 | help='Sample image') 25 | parser.add_argument('-c', '--config', required=True, 26 | help='Detection model configuration') 27 | return parser.parse_args() 28 | 29 | 30 | def visualize_outputs(image, bboxes, scores, save_path): 31 | fig = plt.figure(dpi=400, frameon=False) 32 | ax = fig.add_subplot(1, 1, 1) 33 | ax.set_axis_off() 34 | ax.imshow(image) 35 | cmap = plt.cm.get_cmap('terrain', len(bboxes)) 36 | for index, (bbox, score) in enumerate(zip(bboxes, scores)): 37 | origin = (bbox[0], bbox[1]) 38 | width = bbox[2] - bbox[0] 39 | length = bbox[3] - bbox[1] 40 | rect = patches.Rectangle( 41 | origin, width, length, 42 | linewidth=2, edgecolor=cmap(index), 43 | facecolor='w', alpha=0.5) 44 | ax.add_patch(rect) 45 | ax.text( 46 | bbox[0] + 2, bbox[3] - 5, 47 | '{:.2f}'.format(score), color='k', fontsize=3.0) 48 | plt.savefig(save_path, bbox_inches='tight', pad_inches=0) 49 | 50 | 51 | def main(args): 52 | dataset_name = 'b3d_test' 53 | annotations_path = 'vision/annotations/test.json' 54 | images_path = 'vision/images/test' 55 | register_coco_instances(dataset_name, {}, annotations_path, images_path) 56 | MetadataCatalog.get(dataset_name).thing_classes = ['vehicle'] 57 | 58 | with open(args.config) as fp: 59 | config = json.load(fp) 60 | cfg = get_cfg() 61 | cfg.merge_from_file(model_zoo.get_config_file(config['config'])) 62 | cfg.MODEL.WEIGHTS = config['weights'] 63 | cfg.MODEL.ROI_HEADS.NUM_CLASSES = config['num_classes'] 
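# Note: the next four lines set both the ROI_HEADS and RETINANET variants of the
# score and NMS thresholds, so the same JSON config works whether the chosen model
# zoo file is an R-CNN or a RetinaNet; each architecture only reads its own keys.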
64 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = config['score_threshold'] 65 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = config['score_threshold'] 66 | cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = config['nms_threshold'] 67 | cfg.MODEL.RETINANET.NMS_THRESH_TEST = config['nms_threshold'] 68 | cfg.TEST.DETECTIONS_PER_IMAGE = config['detections_per_image'] 69 | cfg.MODEL.ANCHOR_GENERATOR.SIZES = config['anchor_generator_sizes'] 70 | 71 | image_path = args.image 72 | image = cv2.imread(image_path) 73 | predictor = DefaultPredictor(cfg) 74 | image_regions = regionize_image(image) 75 | bboxes = [] 76 | scores = [] 77 | for _image, _offset in image_regions: 78 | _outputs = predictor(_image) 79 | _bboxes, _scores, _ = parse_outputs(_outputs, _offset) 80 | bboxes += _bboxes 81 | scores += _scores 82 | nms_threshold = config['nms_threshold'] 83 | nms_bboxes, nms_scores = nms(bboxes, scores, nms_threshold) 84 | 85 | save_path = os.path.join(cfg.OUTPUT_DIR, 'out.jpg') 86 | visualize_outputs(image, nms_bboxes, nms_scores, save_path) 87 | 88 | 89 | if __name__ == '__main__': 90 | main(parse_args()) 91 | -------------------------------------------------------------------------------- /detrk.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | from detectron2 import model_zoo 4 | from detectron2.config import get_cfg 5 | from detectron2.data.datasets import register_coco_instances 6 | from detectron2.engine import DefaultPredictor 7 | from external.nms import nms 8 | from external.sort import Sort 9 | import json 10 | import numpy as np 11 | import os 12 | from utils import mask_frame, parse_outputs, regionize_image 13 | from xml.etree import ElementTree 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser( 18 | description='Example detection and tracking script') 19 | parser.add_argument('-v', '--video', required=True, 20 | help='Input video') 21 | parser.add_argument('-c', '--config', required=True, 22 | help='Detection model configuration') 23 | parser.add_argument('-m', '--mask', required=True, 24 | help='Mask for the video') 25 | return parser.parse_args() 26 | 27 | 28 | def main(args): 29 | with open(args.config) as fp: 30 | config = json.load(fp) 31 | cfg = get_cfg() 32 | cfg.merge_from_file(model_zoo.get_config_file(config['config'])) 33 | cfg.MODEL.WEIGHTS = config['weights'] 34 | cfg.MODEL.ROI_HEADS.NUM_CLASSES = config['num_classes'] 35 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = config['score_threshold'] 36 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = config['score_threshold'] 37 | cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = config['nms_threshold'] 38 | cfg.MODEL.RETINANET.NMS_THRESH_TEST = config['nms_threshold'] 39 | cfg.TEST.DETECTIONS_PER_IMAGE = config['detections_per_image'] 40 | cfg.MODEL.ANCHOR_GENERATOR.SIZES = config['anchor_generator_sizes'] 41 | predictor = DefaultPredictor(cfg) 42 | tree = ElementTree.parse(args.mask) 43 | mask = tree.getroot() 44 | 45 | tracker = Sort(max_age=5) 46 | cap = cv2.VideoCapture(os.path.expanduser(args.video)) 47 | trajectories = {} 48 | rendering = {} 49 | frame_index = 0 50 | frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 51 | while cap.isOpened(): 52 | print('Parsing frame {:d} / {:d}...'.format(frame_index, frame_count)) 53 | success, frame = cap.read() 54 | if not success: 55 | break 56 | frame_masked = mask_frame(frame, mask) 57 | 58 | image_regions = regionize_image(frame_masked) 59 | bboxes = [] 60 | scores = [] 61 | for _image, _offset in image_regions: 62 | _outputs = 
predictor(_image) 63 | _bboxes, _scores, _ = parse_outputs(_outputs, _offset) 64 | bboxes += _bboxes 65 | scores += _scores 66 | nms_threshold = config['nms_threshold'] 67 | nms_bboxes, nms_scores = nms(bboxes, scores, nms_threshold) 68 | detections = np.zeros((len(nms_bboxes), 5)) 69 | detections[:, 0:4] = nms_bboxes 70 | detections[:, 4] = nms_scores 71 | 72 | tracked_objects = tracker.update(detections) 73 | rendering[frame_index] = [] 74 | for tracked_object in tracked_objects: 75 | tl = (int(tracked_object[0]), int(tracked_object[1])) 76 | br = (int(tracked_object[2]), int(tracked_object[3])) 77 | object_index = int(tracked_object[4]) 78 | if object_index not in trajectories: 79 | trajectories[object_index] = [] 80 | trajectories[object_index].append([ 81 | frame_index, tl[0], tl[1], br[0], br[1]]) 82 | rendering[frame_index].append([ 83 | object_index, tl[0], tl[1], br[0], br[1]]) 84 | 85 | frame_index = frame_index + 1 86 | if cv2.waitKey(1) & 0xFF == ord('q'): 87 | break 88 | cap.release() 89 | cv2.destroyAllWindows() 90 | 91 | scenario = args.video.replace('videos/', '').replace('.mp4', '') 92 | with open('output/{}_t.json'.format(scenario), 'w') as fp: 93 | json.dump(trajectories, fp) 94 | with open('output/{}_r.json'.format(scenario), 'w') as fp: 95 | json.dump(rendering, fp) 96 | 97 | 98 | if __name__ == '__main__': 99 | main(parse_args()) 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Berkeley DeepDrive Drone Dataset 2 | 3 | ## Introduction 4 | 5 | The Berkeley DeepDrive Drone (B3D) Dataset allows researchers to study implicit driving etiquette in *understructured* road environments. 6 | The dataset consists of: 7 | 1. A set of 20 aerial videos recording understructured driving, 8 | 2. A collection of 16002 images and annotations to train vehicle detection models, and 9 | 3. A few example scripts illustrating typical usage. 10 | 11 | To download the videos and annotated images, run 12 | ``` 13 | pip install gdown 14 | python download.py 15 | ``` 16 | 17 | After downloading, the *full* structure of the dataset repository should be as follows: 18 | ``` 19 | . 20 | ├── configs 21 | │ ├── config_quick.json 22 | │ └── config_refined.json 23 | ├── Dockerfile 24 | ├── download.py 25 | ├── LICENSE 26 | ├── README.md 27 | ├── test.py 28 | ├── train.py 29 | ├── videos 30 | │ └── <20 mp4 files> 31 | └── vision 32 | ├── annotations 33 | │ ├── test.json 34 | │ ├── train.json 35 | │ └── val.json 36 | └── images 37 | ├── test 38 | │ └── <1636 jpg files> 39 | ├── train 40 | │ └── <12700 jpg files> 41 | └── val 42 | └── <1666 jpg files> 43 | ``` 44 | 45 | ## Getting Started 46 | We recommend running the scripts in a Docker container. 47 | Please follow the instructions [here](https://docs.docker.com/engine/install/) to install Docker and 48 | instructions [here](https://github.com/NVIDIA/nvidia-container-toolkit) to install NVIDIA Container Toolkit. 49 | 50 | After installing Docker and NVIDIA Container Toolkit, build the required Docker image 51 | ``` 52 | docker build -t detectron2:latest . 53 | ``` 54 | 55 | ## Usage 56 | To inspect and edit the annotations, please use the open source image annotation tool CVAT. 57 | Note that the training dataset might need to be split into several smaller datasets for it to be properly parsed by CVAT. 58 | 59 | To train a vehicle detection model using the annotated images, one could use the Detectron2 library.
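For a quick sanity check before training, one can confirm that the COCO annotations load correctly. The snippet below is only an illustrative sketch (it is not one of the repository scripts); it assumes Detectron2 is installed and reuses the same dataset registration call as `train.py`:
```
from detectron2.data import DatasetCatalog
from detectron2.data.datasets import register_coco_instances

# Register the training split exactly as train.py does.
register_coco_instances(
    'b3d_train', {}, 'vision/annotations/train.json', 'vision/images/train')

# Load the parsed records and report basic counts.
records = DatasetCatalog.get('b3d_train')
num_boxes = sum(len(record['annotations']) for record in records)
print('{} images, {} vehicle boxes'.format(len(records), num_boxes))
```
If the reported image count differs from the 12700 training images listed above, the download or unzip step likely did not complete.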
60 | The example `train.py` script is provided to show how to use Detectron2 to train a vehicle detection model. 61 | 62 | To run the trainer script, start a Docker container and run 63 | ``` 64 | docker run --shm-size 16G -p 8899:8888 --rm --gpus all -it -v [path/to/b3d]:/data -w /data detectron2 bash 65 | # Use config_refined.json for better accuracy 66 | python train.py -c configs/config_quick.json 67 | ``` 68 | The trained model will be saved to `output/model_final.pth`. 69 | 70 | Alternatively, one can skip the training by downloading a pre-trained model as follows 71 | ``` 72 | python download.py --skip_videos --skip_images --pull_model 73 | ``` 74 | The trained model will be downloaded to `output/model_final.pth`. 75 | Note that this model is trained with `config_refined.json`. 76 | 77 | A test script `test.py` is provided to run the trained model on a sample image. 78 | 79 | For instance, to use the test script on the image `vision/images/test/01_034_01.jpg`, run 80 | ``` 81 | # Use config_refined.json if the model_final.pth was generated with it 82 | python test.py -i vision/images/test/01_034_01.jpg -c configs/config_quick.json 83 | ``` 84 | The result will be exported to `output/out.jpg`. 85 | 86 | Lastly, we provide a masking script `mask.py` that crops an image according to a pre-defined polygonal mask. 87 | The mask is expected to be made in CVAT by creating a polygon with the category name `domain`. 88 | For example, please download the 89 | [example mask](https://drive.google.com/file/d/1JdOlkYjYV_lI79tDA79WhXseun61E6SM/view?usp=sharing) with the corresponding 90 | [example image](https://drive.google.com/file/d/1xOHCyKPunfHpzbr64n5oB8rNS6vz-fpM/view?usp=sharing). 91 | Move those files into the `output/` directory and run 92 | ``` 93 | python mask.py --image output/example_masking.png --mask output/example_masking.xml 94 | ``` 95 | The masked image will be saved to `output/masked_image.png` and a visualization of the mask to `output/mask_overlay.png`. 96 | To mask a video, simply apply the same masking to every frame of the video. 97 | 98 | ## Citation 99 | If you find this dataset useful, please consider citing the accompanying paper below: 100 | ``` 101 | @article{wu2022b3d, 102 | title={Decentralized Vehicle Coordination: The Berkeley DeepDrive Drone Dataset}, 103 | author={Fangyu Wu and Dequan Wang and Minjune Hwang and Chenhui Hao and Jiawei Lu and Jiamu Zhang and Christopher Chou and Trevor Darrell and Alexandre Bayen}, 104 | journal={arXiv}, 105 | year={2022} 106 | } 107 | ``` 108 | -------------------------------------------------------------------------------- /external/sort.py: -------------------------------------------------------------------------------- 1 | """ 2 | SORT: A Simple, Online and Realtime Tracker 3 | Copyright (C) 2016-2020 Alex Bewley alex@bewley.ai 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | """ 18 | from __future__ import print_function 19 | 20 | import os 21 | import numpy as np 22 | import matplotlib 23 | matplotlib.use('TkAgg') 24 | import matplotlib.pyplot as plt 25 | import matplotlib.patches as patches 26 | from skimage import io 27 | 28 | import glob 29 | import time 30 | import argparse 31 | from filterpy.kalman import KalmanFilter 32 | 33 | np.random.seed(0) 34 | 35 | 36 | def linear_assignment(cost_matrix): 37 | try: 38 | import lap 39 | _, x, y = lap.lapjv(cost_matrix, extend_cost=True) 40 | return np.array([[y[i],i] for i in x if i >= 0]) # 41 | except ImportError: 42 | from scipy.optimize import linear_sum_assignment 43 | x, y = linear_sum_assignment(cost_matrix) 44 | return np.array(list(zip(x, y))) 45 | 46 | 47 | def iou_batch(bb_test, bb_gt): 48 | """ 49 | From SORT: Computes IOU between two bboxes in the form [x1,y1,x2,y2] 50 | """ 51 | bb_gt = np.expand_dims(bb_gt, 0) 52 | bb_test = np.expand_dims(bb_test, 1) 53 | 54 | xx1 = np.maximum(bb_test[..., 0], bb_gt[..., 0]) 55 | yy1 = np.maximum(bb_test[..., 1], bb_gt[..., 1]) 56 | xx2 = np.minimum(bb_test[..., 2], bb_gt[..., 2]) 57 | yy2 = np.minimum(bb_test[..., 3], bb_gt[..., 3]) 58 | w = np.maximum(0., xx2 - xx1) 59 | h = np.maximum(0., yy2 - yy1) 60 | wh = w * h 61 | o = wh / ((bb_test[..., 2] - bb_test[..., 0]) * (bb_test[..., 3] - bb_test[..., 1]) 62 | + (bb_gt[..., 2] - bb_gt[..., 0]) * (bb_gt[..., 3] - bb_gt[..., 1]) - wh) 63 | return(o) 64 | 65 | 66 | def convert_bbox_to_z(bbox): 67 | """ 68 | Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form 69 | [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is 70 | the aspect ratio 71 | """ 72 | w = bbox[2] - bbox[0] 73 | h = bbox[3] - bbox[1] 74 | x = bbox[0] + w/2. 75 | y = bbox[1] + h/2. 76 | s = w * h #scale is just area 77 | r = w / float(h) 78 | return np.array([x, y, s, r]).reshape((4, 1)) 79 | 80 | 81 | def convert_x_to_bbox(x,score=None): 82 | """ 83 | Takes a bounding box in the centre form [x,y,s,r] and returns it in the form 84 | [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right 85 | """ 86 | w = np.sqrt(x[2] * x[3]) 87 | h = x[2] / w 88 | if(score==None): 89 | return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.]).reshape((1,4)) 90 | else: 91 | return np.array([x[0]-w/2.,x[1]-h/2.,x[0]+w/2.,x[1]+h/2.,score]).reshape((1,5)) 92 | 93 | 94 | class KalmanBoxTracker(object): 95 | """ 96 | This class represents the internal state of individual tracked objects observed as bbox. 97 | """ 98 | count = 0 99 | def __init__(self,bbox): 100 | """ 101 | Initialises a tracker using initial bounding box. 102 | """ 103 | #define constant velocity model 104 | self.kf = KalmanFilter(dim_x=7, dim_z=4) 105 | self.kf.F = np.array([[1,0,0,0,1,0,0],[0,1,0,0,0,1,0],[0,0,1,0,0,0,1],[0,0,0,1,0,0,0], [0,0,0,0,1,0,0],[0,0,0,0,0,1,0],[0,0,0,0,0,0,1]]) 106 | self.kf.H = np.array([[1,0,0,0,0,0,0],[0,1,0,0,0,0,0],[0,0,1,0,0,0,0],[0,0,0,1,0,0,0]]) 107 | 108 | self.kf.R[2:,2:] *= 10. 109 | self.kf.P[4:,4:] *= 1000. #give high uncertainty to the unobservable initial velocities 110 | self.kf.P *= 10. 111 | self.kf.Q[-1,-1] *= 0.01 112 | self.kf.Q[4:,4:] *= 0.01 113 | 114 | self.kf.x[:4] = convert_bbox_to_z(bbox) 115 | self.time_since_update = 0 116 | self.id = KalmanBoxTracker.count 117 | KalmanBoxTracker.count += 1 118 | self.history = [] 119 | self.hits = 0 120 | self.hit_streak = 0 121 | self.age = 0 122 | 123 | def update(self,bbox): 124 | """ 125 | Updates the state vector with observed bbox. 
126 | """ 127 | self.time_since_update = 0 128 | self.history = [] 129 | self.hits += 1 130 | self.hit_streak += 1 131 | self.kf.update(convert_bbox_to_z(bbox)) 132 | 133 | def predict(self): 134 | """ 135 | Advances the state vector and returns the predicted bounding box estimate. 136 | """ 137 | if((self.kf.x[6]+self.kf.x[2])<=0): 138 | self.kf.x[6] *= 0.0 139 | self.kf.predict() 140 | self.age += 1 141 | if(self.time_since_update>0): 142 | self.hit_streak = 0 143 | self.time_since_update += 1 144 | self.history.append(convert_x_to_bbox(self.kf.x)) 145 | return self.history[-1] 146 | 147 | def get_state(self): 148 | """ 149 | Returns the current bounding box estimate. 150 | """ 151 | return convert_x_to_bbox(self.kf.x) 152 | 153 | 154 | def associate_detections_to_trackers(detections,trackers,iou_threshold = 0.3): 155 | """ 156 | Assigns detections to tracked object (both represented as bounding boxes) 157 | 158 | Returns 3 lists of matches, unmatched_detections and unmatched_trackers 159 | """ 160 | if(len(trackers)==0): 161 | return np.empty((0,2),dtype=int), np.arange(len(detections)), np.empty((0,5),dtype=int) 162 | 163 | iou_matrix = iou_batch(detections, trackers) 164 | 165 | if min(iou_matrix.shape) > 0: 166 | a = (iou_matrix > iou_threshold).astype(np.int32) 167 | if a.sum(1).max() == 1 and a.sum(0).max() == 1: 168 | matched_indices = np.stack(np.where(a), axis=1) 169 | else: 170 | matched_indices = linear_assignment(-iou_matrix) 171 | else: 172 | matched_indices = np.empty(shape=(0,2)) 173 | 174 | unmatched_detections = [] 175 | for d, det in enumerate(detections): 176 | if(d not in matched_indices[:,0]): 177 | unmatched_detections.append(d) 178 | unmatched_trackers = [] 179 | for t, trk in enumerate(trackers): 180 | if(t not in matched_indices[:,1]): 181 | unmatched_trackers.append(t) 182 | 183 | #filter out matched with low IOU 184 | matches = [] 185 | for m in matched_indices: 186 | if(iou_matrix[m[0], m[1]]= self.min_hits or self.frame_count <= self.min_hits): 246 | ret.append(np.concatenate((d,[trk.id+1])).reshape(1,-1)) # +1 as MOT benchmark requires positive 247 | i -= 1 248 | # remove dead tracklet 249 | if(trk.time_since_update > self.max_age): 250 | self.trackers.pop(i) 251 | if(len(ret)>0): 252 | return np.concatenate(ret) 253 | return np.empty((0,5)) 254 | 255 | def parse_args(): 256 | """Parse input arguments.""" 257 | parser = argparse.ArgumentParser(description='SORT demo') 258 | parser.add_argument('--display', dest='display', help='Display online tracker output (slow) [False]',action='store_true') 259 | parser.add_argument("--seq_path", help="Path to detections.", type=str, default='data') 260 | parser.add_argument("--phase", help="Subdirectory in seq_path.", type=str, default='train') 261 | parser.add_argument("--max_age", 262 | help="Maximum number of frames to keep alive a track without associated detections.", 263 | type=int, default=1) 264 | parser.add_argument("--min_hits", 265 | help="Minimum number of associated detections before track is initialised.", 266 | type=int, default=3) 267 | parser.add_argument("--iou_threshold", help="Minimum IOU for match.", type=float, default=0.3) 268 | args = parser.parse_args() 269 | return args 270 | 271 | if __name__ == '__main__': 272 | # all train 273 | args = parse_args() 274 | display = args.display 275 | phase = args.phase 276 | total_time = 0.0 277 | total_frames = 0 278 | colours = np.random.rand(32, 3) #used only for display 279 | if(display): 280 | if not os.path.exists('mot_benchmark'): 281 | 
print('\n\tERROR: mot_benchmark link not found!\n\n Create a symbolic link to the MOT benchmark\n (https://motchallenge.net/data/2D_MOT_2015/#download). E.g.:\n\n $ ln -s /path/to/MOT2015_challenge/2DMOT2015 mot_benchmark\n\n') 282 | exit() 283 | plt.ion() 284 | fig = plt.figure() 285 | ax1 = fig.add_subplot(111, aspect='equal') 286 | 287 | if not os.path.exists('output'): 288 | os.makedirs('output') 289 | pattern = os.path.join(args.seq_path, phase, '*', 'det', 'det.txt') 290 | for seq_dets_fn in glob.glob(pattern): 291 | mot_tracker = Sort(max_age=args.max_age, 292 | min_hits=args.min_hits, 293 | iou_threshold=args.iou_threshold) #create instance of the SORT tracker 294 | seq_dets = np.loadtxt(seq_dets_fn, delimiter=',') 295 | seq = seq_dets_fn[pattern.find('*'):].split(os.path.sep)[0] 296 | 297 | with open(os.path.join('output', '%s.txt'%(seq)),'w') as out_file: 298 | print("Processing %s."%(seq)) 299 | for frame in range(int(seq_dets[:,0].max())): 300 | frame += 1 #detection and frame numbers begin at 1 301 | dets = seq_dets[seq_dets[:, 0]==frame, 2:7] 302 | dets[:, 2:4] += dets[:, 0:2] #convert to [x1,y1,w,h] to [x1,y1,x2,y2] 303 | total_frames += 1 304 | 305 | if(display): 306 | fn = os.path.join('mot_benchmark', phase, seq, 'img1', '%06d.jpg'%(frame)) 307 | im =io.imread(fn) 308 | ax1.imshow(im) 309 | plt.title(seq + ' Tracked Targets') 310 | 311 | start_time = time.time() 312 | trackers = mot_tracker.update(dets) 313 | cycle_time = time.time() - start_time 314 | total_time += cycle_time 315 | 316 | for d in trackers: 317 | print('%d,%d,%.2f,%.2f,%.2f,%.2f,1,-1,-1,-1'%(frame,d[4],d[0],d[1],d[2]-d[0],d[3]-d[1]),file=out_file) 318 | if(display): 319 | d = d.astype(np.int32) 320 | ax1.add_patch(patches.Rectangle((d[0],d[1]),d[2]-d[0],d[3]-d[1],fill=False,lw=3,ec=colours[d[4]%32,:])) 321 | 322 | if(display): 323 | fig.canvas.flush_events() 324 | plt.draw() 325 | ax1.cla() 326 | 327 | print("Total Tracking took: %.3f seconds for %d frames or %.1f FPS" % (total_time, total_frames, total_frames / total_time)) 328 | 329 | if(display): 330 | print("Note: to get real runtime results run without the option: --display") 331 | --------------------------------------------------------------------------------