├── yolo3 ├── __init__.py ├── utils.py ├── generate_detections.py └── model.py ├── deepsort ├── __init__.py ├── detection.py ├── preprocessing.py ├── iou_matching.py ├── tracker.py ├── track.py ├── nn_matching.py ├── linear_assignment.py └── kalman_filter.py ├── model_h5 └── h5 files can be located in here.txt ├── model_data ├── tiny_yolo_anchors.txt ├── yolo_anchors.txt ├── cifar_classes.txt ├── voc_classes.txt └── coco_classes.txt ├── font └── times.ttf ├── input └── Demo1.jpg ├── output └── Demo1.png ├── openh264-1.8.0-win64.dll ├── LICENSE ├── .gitignore ├── tracker_func.py ├── yolo_video.py ├── kmeans_anchors.py ├── README.md ├── read_data_cifar100.py ├── sort.py ├── train.py └── yolo.py /yolo3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepsort/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model_h5/h5 files can be located in here.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model_data/tiny_yolo_anchors.txt: -------------------------------------------------------------------------------- 1 | 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 2 | -------------------------------------------------------------------------------- /font/times.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImLaoBJie/yolo3_sort_deepsort/HEAD/font/times.ttf -------------------------------------------------------------------------------- /input/Demo1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImLaoBJie/yolo3_sort_deepsort/HEAD/input/Demo1.jpg -------------------------------------------------------------------------------- /output/Demo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImLaoBJie/yolo3_sort_deepsort/HEAD/output/Demo1.png -------------------------------------------------------------------------------- /model_data/yolo_anchors.txt: -------------------------------------------------------------------------------- 1 | 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 2 | -------------------------------------------------------------------------------- /openh264-1.8.0-win64.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImLaoBJie/yolo3_sort_deepsort/HEAD/openh264-1.8.0-win64.dll -------------------------------------------------------------------------------- /model_data/cifar_classes.txt: -------------------------------------------------------------------------------- 1 | boy 2 | girl 3 | man 4 | woman 5 | bicycle 6 | bus 7 | motorcycle 8 | pickuptruck 9 | streetcar 10 | tank 11 | -------------------------------------------------------------------------------- /model_data/voc_classes.txt: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | 
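The `*_anchors.txt` files above hold a single line of comma-separated `width,height` pairs, and the `*_classes.txt` files hold one class name per line. A minimal parsing sketch (the helper names are illustrative, not functions of this repo):

```
import numpy as np

def load_class_names(path):
    # one class name per line
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def load_anchors(path):
    # a single line of comma-separated floats -> (N, 2) array of (width, height)
    with open(path) as f:
        values = [float(v) for v in f.readline().split(',')]
    return np.array(values).reshape(-1, 2)
```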
-------------------------------------------------------------------------------- /model_data/coco_classes.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 ImLaoBJie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /deepsort/detection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Detection(object): 5 | """ 6 | This class represents a bounding box detection in a single image. 7 | 8 | Parameters 9 | ---------- 10 | tlwh : array_like 11 | Bounding box in format `(x, y, w, h)`. 12 | confidence : float 13 | Detector confidence score. 14 | feature : array_like 15 | A feature vector that describes the object contained in this image. 16 | 17 | Attributes 18 | ---------- 19 | tlwh : ndarray 20 | Bounding box in format `(top left x, top left y, width, height)`. 21 | confidence : ndarray 22 | Detector confidence score. 23 | class_name : ndarray 24 | Detector class. 25 | feature : ndarray | NoneType 26 | A feature vector that describes the object contained in this image. 
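    Examples
    --------
    A minimal usage sketch (the argument values are made up for illustration):

    >>> det = Detection([10., 20., 30., 40.], 0.9, 'person', np.zeros(128))
    >>> det.to_tlbr()
    array([10., 20., 40., 60.])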
27 | 28 | """ 29 | 30 | def __init__(self, tlwh, confidence, class_name, feature): 31 | self.tlwh = np.asarray(tlwh, dtype=np.float) 32 | self.confidence = float(confidence) 33 | self.class_name = class_name 34 | self.feature = np.asarray(feature, dtype=np.float32) 35 | 36 | def get_class(self): 37 | return self.class_name 38 | 39 | def to_tlbr(self): 40 | """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., 41 | `(top left, bottom right)`. 42 | """ 43 | ret = self.tlwh.copy() 44 | ret[2:] += ret[:2] 45 | return ret 46 | 47 | def to_xyah(self): 48 | """Convert bounding box to format `(center x, center y, aspect ratio, 49 | height)`, where the aspect ratio is `width / height`. 50 | """ 51 | ret = self.tlwh.copy() 52 | ret[:2] += ret[2:] / 2 53 | ret[2] /= ret[3] 54 | return ret 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 | dmypy.json 112 | 113 | # Pyre type checker 114 | .pyre/ 115 | 116 | # Demo 117 | *.mp4 118 | *.weights 119 | *.dat 120 | *.webm 121 | *.png 122 | 123 | # Weights 124 | *.h5 125 | *.pb 126 | 127 | __pycache__/ 128 | .idea/ 129 | cifar-100-python/ 130 | -------------------------------------------------------------------------------- /deepsort/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | 5 | def non_max_suppression(boxes, classes, max_bbox_overlap, scores=None): 6 | """Suppress overlapping detections. 7 | 8 | Original code from [1]_ has been adapted to include confidence score. 9 | 10 | .. 
[1] http://www.pyimagesearch.com/2015/02/16/
11 |         faster-non-maximum-suppression-python/
12 | 
13 |     Examples
14 |     --------
15 | 
16 |     >>> boxes = [d.tlwh for d in detections]
17 |     >>> classes = [d.class_name for d in detections]
18 |     >>> scores = [d.confidence for d in detections]
19 |     >>> indices = non_max_suppression(boxes, classes, max_bbox_overlap, scores)
20 |     >>> detections = [detections[i] for i in indices]
21 | 
22 |     Parameters
23 |     ----------
24 |     boxes : ndarray
25 |         Array of ROIs (x, y, width, height).
26 |     classes : ndarray
27 |         Detector class labels (carried along; not used by the suppression itself).
28 |     max_bbox_overlap : float
29 |         ROIs that overlap more than this value are suppressed.
30 |     scores : Optional[array_like]
31 |         Detector confidence scores.
32 | 
33 |     Returns
34 |     -------
35 |     List[int]
36 |         Returns indices of detections that have survived non-maxima suppression.
37 | 
38 |     """
39 |     if len(boxes) == 0:
40 |         return []
41 | 
42 |     boxes = boxes.astype(np.float)
43 |     pick = []
44 | 
45 |     x1 = boxes[:, 0]
46 |     y1 = boxes[:, 1]
47 |     x2 = boxes[:, 2] + boxes[:, 0]
48 |     y2 = boxes[:, 3] + boxes[:, 1]
49 | 
50 |     area = (x2 - x1 + 1) * (y2 - y1 + 1)
51 |     if scores is not None:
52 |         idxs = np.argsort(scores)
53 |     else:
54 |         idxs = np.argsort(y2)
55 | 
56 |     while len(idxs) > 0:
57 |         last = len(idxs) - 1
58 |         i = idxs[last]
59 |         pick.append(i)
60 | 
61 |         xx1 = np.maximum(x1[i], x1[idxs[:last]])
62 |         yy1 = np.maximum(y1[i], y1[idxs[:last]])
63 |         xx2 = np.minimum(x2[i], x2[idxs[:last]])
64 |         yy2 = np.minimum(y2[i], y2[idxs[:last]])
65 | 
66 |         w = np.maximum(0, xx2 - xx1 + 1)
67 |         h = np.maximum(0, yy2 - yy1 + 1)
68 | 
69 |         overlap = (w * h) / area[idxs[:last]]
70 | 
71 |         idxs = np.delete(
72 |             idxs, np.concatenate(
73 |                 ([last], np.where(overlap > max_bbox_overlap)[0])))
74 | 
75 |     return pick
76 | 
--------------------------------------------------------------------------------
/tracker_func.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | from sort import Sort
4 | from deepsort.tracker import Tracker
5 | from deepsort.detection import Detection
6 | from deepsort import preprocessing
7 | from yolo3.utils import convert_boxes
8 | 
9 | 
10 | def sort_image(sort_class: Sort, out_boxes, out_scores, out_classes):
11 |     dets = []
12 | 
13 |     for i in range(0, len(out_boxes)):
14 |         dets.append([out_boxes[i][1], out_boxes[i][0], out_boxes[i][3], out_boxes[i][2], out_scores[i], out_classes[i]])
15 | 
16 |     dets = np.array(dets)
17 |     # update
18 |     trackers = sort_class.update(dets)
19 | 
20 |     out_boxes = []
21 |     out_scores = []
22 |     out_classes = []
23 |     object_id = []
24 |     # d [x1,y1,x2,y2,object_id,score,type]
25 |     for d in trackers:
26 |         out_boxes.append(list([d[1], d[0], d[3], d[2]]))
27 |         object_id.append(int(d[4]))
28 |         out_scores.append(float(d[5]))
29 |         out_classes.append(int(d[6]))
30 | 
31 |     return np.array(out_boxes), np.array(out_scores), np.array(out_classes), np.array(object_id)
32 | 
33 | 
34 | def deepsort_image(deepsort_class: Tracker, encoder, frame, out_boxes, out_scores, out_classes,
35 |                    nms_max_overlap=1.0):
36 | 
37 |     converted_boxes = convert_boxes(out_boxes)
38 |     features = encoder(frame, converted_boxes)
39 |     detections = [Detection(bbox, score, class_name, feature) for bbox, score, class_name, feature in
40 |                   zip(converted_boxes, out_scores, out_classes, features)]
41 | 
42 |     # run non-maxima suppression
43 |     boxs = np.array([d.tlwh for d in detections])
44 |     scores = np.array([d.confidence for d in detections])
45 |     classes = np.array([d.class_name for d in detections])
46 |     indices = preprocessing.non_max_suppression(boxs, classes,
nms_max_overlap, scores)
47 |     detections = [detections[i] for i in indices]
48 | 
49 |     deepsort_class.predict()
50 |     deepsort_class.update(detections)
51 | 
52 |     num_trackers = len(deepsort_class.tracks)
53 |     out_boxes = []
54 |     out_classes = []
55 |     out_scores = []
56 |     object_id = []
57 |     # d [x1,y1,x2,y2,object_id,score,type]
58 |     for index, track in enumerate(deepsort_class.tracks):
59 |         if not track.is_confirmed() or track.time_since_update > 1:
60 |             continue
61 |         out_boxes.append(track.to_tlbr())
62 |         out_classes.append(int(track.get_class()))
63 |         out_scores.append(float(track.get_score()))
64 |         object_id.append(int(track.track_id))
65 | 
66 |     return np.array(out_boxes), np.array(out_scores), np.array(out_classes), np.array(object_id)
67 | 
--------------------------------------------------------------------------------
/yolo_video.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | 
4 | import numpy as np
5 | 
6 | from yolo import YOLO, detect_video
7 | from PIL import Image
8 | 
9 | 
10 | DEFAULTS = {
11 |     "model_path": './model_h5/yolo.h5',
12 |     "anchors_path": './model_data/yolo_anchors.txt',
13 |     "classes_path": './model_data/coco_classes.txt',
14 |     "deepsort_model": './model_data/mars-small128.pb',
15 |     "gpu_num": 1,
16 |     "image": False,  # if this is set to True, "tracker" is ignored
17 |     "tracker": 'deepsort',  # 'sort' or 'deepsort', as needed
18 |     "write_to_file": True,
19 |     "input": './input/your_video.format',
20 |     "output": './output/your_video.format',
21 |     "output_path": './output/',
22 |     "score": 0.4,  # threshold
23 |     "iou": 0.4,  # threshold
24 |     "repeat_iou": 0.95,  # threshold
25 | }
26 | 
27 | 
28 | def getvalue(FLAGS, defaults):
29 | 
30 |     args = vars(FLAGS)
31 | 
32 |     for key in defaults:
33 |         args[key] = defaults[key]
34 | 
35 |     return FLAGS
36 | 
37 | 
38 | def detect_img(yolo):
39 |     while True:
40 | 
41 |         img = input('Input image filename:')
42 |         try:
43 |             image = Image.open(img)
44 |             image = np.asarray(image)
45 |         except Exception:
46 |             print('Open Error! Try again!')
47 |             continue
48 |         else:
49 |             # Initialization
50 |             # mot_tracker = sort.Sort()
51 |             # yolo.mot_tracker = mot_tracker
52 |             yolo.frame = 1
53 | 
54 |             if yolo.write_to_file:
55 |                 emptyFile = open(yolo.output_path + 'result.dat', 'w')
56 |             else:
57 |                 emptyFile = None
58 |             r_image = yolo.detect_image(image, emptyFile)
59 |             if yolo.write_to_file:
60 |                 emptyFile.close()
61 |             r_image.save(yolo.output_path + 'output.png', 'png')
62 |     yolo.close_session()
63 | 
64 | 
65 | FLAGS = None
66 | 
67 | if __name__ == '__main__':
68 | 
69 |     FLAGS = argparse.Namespace()
70 |     FLAGS = getvalue(FLAGS, DEFAULTS)
71 | 
72 |     if FLAGS.image:
73 |         """
74 |         Image detection mode, disregard any remaining command line arguments
75 |         """
76 |         print("Image detection mode")
77 |         if "input" in FLAGS:
78 |             print(" Ignoring remaining command line arguments: " + FLAGS.input + "," + FLAGS.output)
79 |         detect_img(YOLO(**vars(FLAGS)))
80 |     elif "input" in FLAGS:
81 |         detect_video(YOLO(**vars(FLAGS)), FLAGS.input, FLAGS.output)
82 |     else:
83 |         print("Must specify at least video_input_path. See usage with --help.")
84 | 
--------------------------------------------------------------------------------
/deepsort/iou_matching.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import numpy as np
3 | from . import linear_assignment
4 | 
5 | 
6 | def iou(bbox, candidates):
7 |     """Compute intersection over union.
8 | 9 | Parameters 10 | ---------- 11 | bbox : ndarray 12 | A bounding box in format `(top left x, top left y, width, height)`. 13 | candidates : ndarray 14 | A matrix of candidate bounding boxes (one per row) in the same format 15 | as `bbox`. 16 | 17 | Returns 18 | ------- 19 | ndarray 20 | The intersection over union in [0, 1] between the `bbox` and each 21 | candidate. A higher score means a larger fraction of the `bbox` is 22 | occluded by the candidate. 23 | 24 | """ 25 | bbox_tl, bbox_br = bbox[:2], bbox[:2] + bbox[2:] 26 | candidates_tl = candidates[:, :2] 27 | candidates_br = candidates[:, :2] + candidates[:, 2:] 28 | 29 | tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis], 30 | np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]] 31 | br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis], 32 | np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]] 33 | wh = np.maximum(0., br - tl) 34 | 35 | area_intersection = wh.prod(axis=1) 36 | area_bbox = bbox[2:].prod() 37 | area_candidates = candidates[:, 2:].prod(axis=1) 38 | return area_intersection / (area_bbox + area_candidates - area_intersection) 39 | 40 | 41 | def iou_cost(tracks, detections, track_indices=None, 42 | detection_indices=None): 43 | """An intersection over union distance metric. 44 | 45 | Parameters 46 | ---------- 47 | tracks : List[deep_sort.track.Track] 48 | A list of tracks. 49 | detections : List[deep_sort.detection.Detection] 50 | A list of detections. 51 | track_indices : Optional[List[int]] 52 | A list of indices to tracks that should be matched. Defaults to 53 | all `tracks`. 54 | detection_indices : Optional[List[int]] 55 | A list of indices to detections that should be matched. Defaults 56 | to all `detections`. 57 | 58 | Returns 59 | ------- 60 | ndarray 61 | Returns a cost matrix of shape 62 | len(track_indices), len(detection_indices) where entry (i, j) is 63 | `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. 64 | 65 | """ 66 | if track_indices is None: 67 | track_indices = np.arange(len(tracks)) 68 | if detection_indices is None: 69 | detection_indices = np.arange(len(detections)) 70 | 71 | cost_matrix = np.zeros((len(track_indices), len(detection_indices))) 72 | for row, track_idx in enumerate(track_indices): 73 | if tracks[track_idx].time_since_update > 1: 74 | cost_matrix[row, :] = linear_assignment.INFTY_COST 75 | continue 76 | 77 | bbox = tracks[track_idx].to_tlwh() 78 | candidates = np.asarray([detections[i].tlwh for i in detection_indices]) 79 | cost_matrix[row, :] = 1. 
- iou(bbox, candidates) 80 | return cost_matrix 81 | -------------------------------------------------------------------------------- /kmeans_anchors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class YOLO_Kmeans: 5 | 6 | def __init__(self, cluster_number, filename): 7 | self.cluster_number = cluster_number 8 | self.filename = filename 9 | 10 | def iou(self, boxes, clusters): # 1 box -> k clusters 11 | n = boxes.shape[0] 12 | k = self.cluster_number 13 | 14 | box_area = boxes[:, 0] * boxes[:, 1] 15 | box_area = box_area.repeat(k) 16 | box_area = np.reshape(box_area, (n, k)) 17 | 18 | cluster_area = clusters[:, 0] * clusters[:, 1] 19 | cluster_area = np.tile(cluster_area, [1, n]) 20 | cluster_area = np.reshape(cluster_area, (n, k)) 21 | 22 | box_w_matrix = np.reshape(boxes[:, 0].repeat(k), (n, k)) 23 | cluster_w_matrix = np.reshape(np.tile(clusters[:, 0], (1, n)), (n, k)) 24 | min_w_matrix = np.minimum(cluster_w_matrix, box_w_matrix) 25 | 26 | box_h_matrix = np.reshape(boxes[:, 1].repeat(k), (n, k)) 27 | cluster_h_matrix = np.reshape(np.tile(clusters[:, 1], (1, n)), (n, k)) 28 | min_h_matrix = np.minimum(cluster_h_matrix, box_h_matrix) 29 | inter_area = np.multiply(min_w_matrix, min_h_matrix) 30 | 31 | result = inter_area / (box_area + cluster_area - inter_area) 32 | return result 33 | 34 | def avg_iou(self, boxes, clusters): 35 | accuracy = np.mean([np.max(self.iou(boxes, clusters), axis=1)]) 36 | return accuracy 37 | 38 | def kmeans(self, boxes, k, dist=np.median): 39 | box_number = boxes.shape[0] 40 | distances = np.empty((box_number, k)) 41 | last_nearest = np.zeros((box_number,)) 42 | np.random.seed() 43 | clusters = boxes[np.random.choice( 44 | box_number, k, replace=False)] # init k clusters 45 | while True: 46 | 47 | distances = 1 - self.iou(boxes, clusters) 48 | 49 | current_nearest = np.argmin(distances, axis=1) 50 | if (last_nearest == current_nearest).all(): 51 | break # clusters won't change 52 | for cluster in range(k): 53 | clusters[cluster] = dist( # update clusters 54 | boxes[current_nearest == cluster], axis=0) 55 | 56 | last_nearest = current_nearest 57 | 58 | return clusters 59 | 60 | def result2txt(self, data): 61 | f = open('output/yolo_anchors.txt', 'w') 62 | row = np.shape(data)[0] 63 | for i in range(row): 64 | if i == 0: 65 | x_y = "%d,%d" % (data[i][0], data[i][1]) 66 | else: 67 | x_y = ", %d,%d" % (data[i][0], data[i][1]) 68 | f.write(x_y) 69 | f.close() 70 | 71 | def txt2boxes(self): 72 | f = open(self.filename, 'r') 73 | dataSet = [] 74 | for line in f: 75 | infos = line.split(', ') 76 | length = len(infos) 77 | for i in range(1, length): 78 | width = int(abs(float(infos[4]) - float(infos[2]))) 79 | height = int(abs(float(infos[5]) - float(infos[3]))) 80 | dataSet.append([width, height]) 81 | result = np.array(dataSet) 82 | f.close() 83 | return result 84 | 85 | def txt2clusters(self): 86 | all_boxes = self.txt2boxes() 87 | result = self.kmeans(all_boxes, k=self.cluster_number) 88 | result = result[np.lexsort(result.T[0, None])] 89 | self.result2txt(result) 90 | print("K anchors:\n {}".format(result)) 91 | print("Accuracy: {:.2f}%".format( 92 | self.avg_iou(all_boxes, result) * 100)) 93 | 94 | 95 | if __name__ == "__main__": 96 | cluster_number = 9 97 | filename = 'output/result.dat' 98 | kmeans = YOLO_Kmeans(cluster_number, filename) 99 | kmeans.txt2clusters() 100 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # YOLOv3+SORT+DeepSort
2 | 
3 | * Update 2020.7.16: added DeepSort and made extensive adjustments
4 | 
5 | # Introduction
6 | 
7 | The implementation of YOLOv3 and its training is adapted from: [qqwweee/keras-yolo3](https://github.com/qqwweee/keras-yolo3)
8 | 
9 | The implementation of SORT is adapted from: [abewley/sort](https://github.com/abewley/sort)
10 | 
11 | The implementation of DeepSort is adapted from: [theAIGuysCode/yolov3_deepsort](https://github.com/theAIGuysCode/yolov3_deepsort)
12 | 
13 | References:
14 | 
15 | 1. [SIMPLE ONLINE AND REALTIME TRACKING](https://arxiv.org/pdf/1602.00763.pdf)
16 | 
17 | 2. [SIMPLE ONLINE AND REALTIME TRACKING WITH A DEEP ASSOCIATION METRIC](https://arxiv.org/pdf/1703.07402.pdf)
18 | 
19 | Demo videos: [SORT](https://www.bilibili.com/video/av56450343/)
20 | [DEEPSORT](https://www.bilibili.com/video/BV16A411e7ih/)
21 | 
22 | ---
23 | 
24 | # Quick Start
25 | 
26 | 1. Open `yolo_video.py`
27 | 
28 | 2. Modify `DEFAULTS` (personally I am not a big fan of `argparse`)
29 | 
30 | ```
31 | DEFAULTS = {
32 |     "model_path": './model_h5/yolo.h5',
33 |     "anchors_path": './model_data/yolo_anchors.txt',
34 |     "classes_path": './model_data/coco_classes.txt',
35 |     "deepsort_model": './model_data/mars-small128.pb',
36 |     "gpu_num": 1,
37 |     "image": False,  # if this is set to True, "tracker" is ignored
38 |     "tracker": 'deepsort',  # 'sort' or 'deepsort', as needed
39 |     "write_to_file": True,
40 |     "input": './input/your_video.format',
41 |     "output": './output/your_video.format',
42 |     "output_path": './output/',
43 |     "score": 0.4,  # threshold
44 |     "iou": 0.4,  # threshold
45 |     "repeat_iou": 0.95,  # threshold
46 | }
47 | ```
48 | 
49 | 3. Run `yolo_video.py`; the results can be found in the folder specified by `"output_path"`
50 | 
51 | ```
52 | python yolo_video.py
53 | ```
54 | 
55 | 4. To use the lightweight tiny-YOLOv3 model, just change `"model_path"` and `"anchors_path"`
56 | 
57 | *For more about YOLOv3, see the [YOLO WEBSITE](https://pjreddie.com/darknet/yolo/)
58 | 
59 | *tiny-YOLOv3 download: [tiny-YOLOv3](https://pjreddie.com/media/files/yolov3-tiny.weights)
60 | 
61 | *YOLOv3 download: [YOLOv3](https://pjreddie.com/media/files/yolov3.weights)
62 | 
63 | *Pretrained DeepSort network: Google Drive: [DeepSort](https://drive.google.com/open?id=18fKzfqnqhqW3s9zwsCbnVJ5XF2JFeqMp), BaiduDisk: [DeepSort](https://pan.baidu.com/s/1B4xKXYWckM4TLIg6WGW6uw) pw:9i6p
64 | 
65 | ---
66 | 
67 | # Parameters
68 | 
69 | ```
70 | model_path      # path of the .h5 model file
71 | anchors_path    # path of the anchors file
72 | classes_path    # path of the file listing the object classes to recognize
73 | deepsort_model  # path of the pretrained DeepSort weights
74 | gpu_num         # number of GPUs
75 | image           # process a video (False) or an image (True)
76 | tracker         # which tracker to use
77 | write_to_file   # whether to write the results to a file
78 | input           # path of the input video
79 | output          # path of the output video
80 | output_path     # path for the other output files
81 | score           # objects with a score below this threshold are ignored
82 | iou             # objects with an IoU below this threshold are ignored
83 | repeat_iou      # threshold for removing duplicate bounding boxes
84 | ```
85 | 
86 | *The format written to the file is:
87 | 
88 | ```
89 | , , , , , , , , , 
90 | ```
91 | 
92 | ---
93 | 
94 | # Training Your Own Model
95 | 
96 | The training images are taken from the CIFAR-100 dataset. Since the main research subject is traffic, the
97 | selected object classes center on vehicles and people; see `model_data/cifar_classes.txt` for the detailed classes.
98 | 
99 | The CIFAR datasets are described at: [The CIFAR-10 and CIFAR-100](http://www.cs.toronto.edu/~kriz/cifar.html)
100 | 
101 | CIFAR-100 dataset download: [CIFAR-100 python version](http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz)
102 | 
103 | 1. The object classes of the training set can be chosen by modifying `read_data_cifar100.py`
104 | 
105 | ```
106 | REMAIN = list(np.concatenate([[11, 35, 46, 98], [8, 13, 48, 58], [81, 85]]))
107 | ```
108 | 
109 | 2. Run train.py
110 | 
111 | ```
112 | python train.py
113 | ```
114 | 
115 | `epochs` and `batch_size` can be adjusted as needed
116 | 
117 | 3. You can first use the trained YOLOv3 model `yolo.h5` to collect bounding-box data, then use `kmeans_anchors.py`
118 | to compute the anchors (a usage sketch follows)
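A minimal sketch of this last step, using the class and the default paths from `kmeans_anchors.py` in this repo:

```
from kmeans_anchors import YOLO_Kmeans

# cluster the collected bounding boxes into 9 anchors;
# the result is written to output/yolo_anchors.txt
kmeans = YOLO_Kmeans(cluster_number=9, filename='output/result.dat')
kmeans.txt2clusters()
```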
119 | 
120 | ---
121 | 
122 | # TIPS
123 | 
124 | 1. Environment
125 | 
126 |    * Main dependencies
127 | 
128 |    * python 3.6
129 |    * Keras 2.3.1
130 |    * tensorflow-gpu 1.13.0
131 |    * numpy 1.17.0
132 | 
133 |    (lower versions also seem to work)
134 | 
135 | 2. A missing `openh264-1.8.0-win64.dll` may cause unknown errors, so place this file in the same
136 |    directory as `yolo_video.py` (although it also seems to work fine without it)
137 | 
138 | 3. DeepSort can handle short-term occlusion, but not objects that disappear or stay occluded for a long time
139 | 
140 | 4. **DEMO** uploaded to [BaiduDisk](https://pan.baidu.com/s/1VLKI8OGDbzsfqtzMe1amxg) PW: pb34
141 | 
142 | 5. **MOT_DEMO** [Multiple Object Tracking Benchmark](https://motchallenge.net/data/MOT16/)
143 | 
144 | 
145 | 
--------------------------------------------------------------------------------
/read_data_cifar100.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | import os
4 | import matplotlib.pyplot as plt
5 | 
6 | PATH = 'cifar-100-python/'
7 | REMOVE = list(range(0, 100))
8 | REMAIN = list(np.concatenate([[11, 35, 46, 98], [8, 13, 48, 58], [81, 85]]))
9 | for i in REMAIN:
10 |     REMOVE.remove(i)
11 | 
12 | 
13 | def filter(image, label):
14 |     # filter
15 |     remove_index = []
16 |     for index, element in enumerate(label):
17 |         if int(element) in REMOVE:
18 |             remove_index.append(index)
19 | 
20 |     label = np.delete(label, remove_index)
21 |     image = np.delete(image, remove_index, 0)
22 | 
23 |     if not REMAIN == []:
24 |         value = 0
25 |         for index in REMAIN:
26 |             label[label == np.int32(index)] = np.int32(value)
27 |             value = value + 1
28 | 
29 |     return image, label
30 | 
31 | 
32 | def load_CIFAR_batch(filename, N, data_filter: bool):
33 |     # a single batch
34 |     # load single batch of cifar
35 |     with open(filename, 'rb') as f:
36 |         datadict = pickle.load(f, encoding='latin1')  # dict
37 |         image = datadict['data']  # X, ndarray, pixel values
38 |         label = datadict['fine_labels']  # Y, list, fine-grained class labels
39 | 
40 |         # check the id of fine_labels relevant to the coarse_labels
41 |         # label = np.array(label)
42 |         # coarse = np.array(datadict['coarse_labels'])
43 |         # print(np.unique(label[np.array(np.where(coarse == 19))[0]]))
44 | 
45 |         # reshape: the flat array becomes an (N, 3, 32, 32) matrix; each entry is 32x32
46 |         # transpose: reorder the axes to (N, 32, 32, 3)
47 |         # astype: copy while casting the dtype
48 |         image = image.reshape(N, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
49 |         label = np.array(label)
50 | 
51 |         if data_filter:
52 |             image, label = filter(image, label)
53 | 
54 |         return image, label
55 | 
56 | 
57 | def load_CIFAR100(path, data_filter: bool):
58 |     # all batches
59 |     # load all of cifar
60 |     images = []  # list
61 |     labels = []
62 | 
63 |     # training set
64 |     f = os.path.join(path, 'train')
65 |     image, label = load_CIFAR_batch(f, 50000, data_filter)
66 |     images.append(image)
67 |     labels.append(label)
68 | 
69 |     images = np.concatenate(images)  # merge [ndarray, ndarray] into a single ndarray
70 |     labels = np.concatenate(labels)
71 | 
72 |     # test set
73 |     img_val, lab_val = load_CIFAR_batch(os.path.join(path, 'test'), 10000, data_filter)
74 |     return images, labels, img_val, lab_val
75 | 
76 | 
77 | # WARNING: Using this function may cause out-of-memory errors and OS breakdown;
78 | # reduce the amount of augmentation if necessary
79 | def creat_more_data(images):
80 |     # expand the dataset through rotation and mirroring
81 |     images_rot90 = []
82 |     images_rot180 = []
83 |     images_rot270 = []
84 |     img_lr = []
85 |     img_ud = []
86 | 
87 |     for index in range(0, images.shape[0]):
88 |         band_1 = images[index, :, :, 0]
89 |         band_2 = images[index, :, :, 1]
90 |         band_3 = images[index, :, :, 2]
91 | 
92 |         # rotate 90 degrees
93 |         band_1_rot90 = np.rot90(band_1)
94 |         band_2_rot90 = np.rot90(band_2)
95 |         band_3_rot90 = np.rot90(band_3)
96 |         
images_rot90.append(np.dstack((band_1_rot90, band_2_rot90, band_3_rot90))) 97 | 98 | # 180 99 | band_1_rot180 = np.rot90(band_1_rot90) 100 | band_2_rot180 = np.rot90(band_2_rot90) 101 | band_3_rot180 = np.rot90(band_3_rot90) 102 | images_rot180.append(np.dstack((band_1_rot180, band_2_rot180, band_3_rot180))) 103 | 104 | # 270 105 | band_1_rot270 = np.rot90(band_1_rot180) 106 | band_2_rot270 = np.rot90(band_2_rot180) 107 | band_3_rot270 = np.rot90(band_3_rot180) 108 | images_rot270.append(np.dstack((band_1_rot270, band_2_rot270, band_3_rot270))) 109 | 110 | # 左右翻转 flip horizontally 111 | lr1 = np.flip(band_1, 0) 112 | lr2 = np.flip(band_2, 0) 113 | lr3 = np.flip(band_3, 0) 114 | img_lr.append(np.dstack((lr1, lr2, lr3))) 115 | 116 | # 上下反转 flip vertical 117 | ud1 = np.flip(band_1, 1) 118 | ud2 = np.flip(band_2, 1) 119 | ud3 = np.flip(band_3, 1) 120 | img_ud.append(np.dstack((ud1, ud2, ud3))) 121 | 122 | rot90 = np.array(images_rot90) 123 | rot180 = np.array(images_rot180) 124 | rot270 = np.array(images_rot270) 125 | lr = np.array(img_lr) 126 | ud = np.array(img_ud) 127 | 128 | images = np.concatenate((rot90, rot180, rot270, lr, ud)) 129 | 130 | return images 131 | 132 | 133 | def shuffle(images, labels): 134 | permutation = np.random.permutation(images.shape[0]) 135 | shuffled_dataset = images[permutation, :, :, :] 136 | shuffled_labels = labels[permutation] 137 | return shuffled_dataset, shuffled_labels 138 | 139 | 140 | def data(path, more_data: bool, shuffle_data: bool, data_filter: bool): 141 | images, labels, img_val, lab_val = load_CIFAR100(path, data_filter) 142 | 143 | if more_data: 144 | # 扩充数据 expand dataset 145 | images = creat_more_data(np.array(images)) 146 | # 扩充标签 expend labels 147 | labels = np.concatenate((labels, labels, labels, labels, labels, labels)) 148 | 149 | if shuffle_data: 150 | images, labels = shuffle(images, labels) 151 | img_val, lab_val = shuffle(img_val, lab_val) 152 | 153 | return images, labels, img_val, lab_val 154 | 155 | 156 | def main(): 157 | images, labels, img_val, lab_val = data(PATH, False, True, True) 158 | # test 159 | print(len(images)) 160 | print(len(labels)) 161 | plt.imshow(images[0] / 255) 162 | print(images[0]) 163 | print(labels[0]) 164 | plt.show() 165 | 166 | 167 | if __name__ == '__main__': 168 | main() 169 | -------------------------------------------------------------------------------- /deepsort/tracker.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | from . import kalman_filter 4 | from . import linear_assignment 5 | from . import iou_matching 6 | from .track import Track 7 | 8 | 9 | class Tracker: 10 | """ 11 | This is the multi-target tracker. 12 | 13 | Parameters 14 | ---------- 15 | metric : nn_matching.NearestNeighborDistanceMetric 16 | A distance metric for measurement-to-track association. 17 | max_age : int 18 | Maximum number of missed misses before a track is deleted. 19 | n_init : int 20 | Number of consecutive detections before the track is confirmed. The 21 | track state is set to `Deleted` if a miss occurs within the first 22 | `n_init` frames. 23 | 24 | Attributes 25 | ---------- 26 | metric : nn_matching.NearestNeighborDistanceMetric 27 | The distance metric used for measurement to track association. 28 | max_age : int 29 | Maximum number of missed misses before a track is deleted. 30 | n_init : int 31 | Number of frames that a track remains in initialization phase. 
32 | kf : kalman_filter.KalmanFilter 33 | A Kalman filter to filter target trajectories in image space. 34 | tracks : List[Track] 35 | The list of active tracks at the current time step. 36 | 37 | """ 38 | 39 | def __init__(self, metric, max_iou_distance=0.7, max_age=30, n_init=3): 40 | self.metric = metric 41 | self.max_iou_distance = max_iou_distance 42 | self.max_age = max_age 43 | self.n_init = n_init 44 | 45 | self.kf = kalman_filter.KalmanFilter() 46 | self.tracks = [] 47 | self._next_id = 1 48 | 49 | def predict(self): 50 | """Propagate track state distributions one time step forward. 51 | 52 | This function should be called once every time step, before `update`. 53 | """ 54 | for track in self.tracks: 55 | track.predict(self.kf) 56 | 57 | def update(self, detections): 58 | """Perform measurement update and track management. 59 | 60 | Parameters 61 | ---------- 62 | detections : List[deep_sort.detection.Detection] 63 | A list of detections at the current time step. 64 | 65 | """ 66 | # Run matching cascade. 67 | matches, unmatched_tracks, unmatched_detections = \ 68 | self._match(detections) 69 | 70 | # Update track set. 71 | for track_idx, detection_idx in matches: 72 | self.tracks[track_idx].update( 73 | self.kf, detections[detection_idx]) 74 | for track_idx in unmatched_tracks: 75 | self.tracks[track_idx].mark_missed() 76 | for detection_idx in unmatched_detections: 77 | self._initiate_track(detections[detection_idx]) 78 | self.tracks = [t for t in self.tracks if not t.is_deleted()] 79 | 80 | # Update distance metric. 81 | active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] 82 | features, targets = [], [] 83 | for track in self.tracks: 84 | if not track.is_confirmed(): 85 | continue 86 | features += track.features 87 | targets += [track.track_id for _ in track.features] 88 | track.features = [] 89 | self.metric.partial_fit( 90 | np.asarray(features), np.asarray(targets), active_targets) 91 | 92 | def _match(self, detections): 93 | 94 | def gated_metric(tracks, dets, track_indices, detection_indices): 95 | features = np.array([dets[i].feature for i in detection_indices]) 96 | targets = np.array([tracks[i].track_id for i in track_indices]) 97 | cost_matrix = self.metric.distance(features, targets) 98 | cost_matrix = linear_assignment.gate_cost_matrix( 99 | self.kf, cost_matrix, tracks, dets, track_indices, 100 | detection_indices) 101 | 102 | return cost_matrix 103 | 104 | # Split track set into confirmed and unconfirmed tracks. 105 | confirmed_tracks = [ 106 | i for i, t in enumerate(self.tracks) if t.is_confirmed()] 107 | unconfirmed_tracks = [ 108 | i for i, t in enumerate(self.tracks) if not t.is_confirmed()] 109 | 110 | # Associate confirmed tracks using appearance features. 111 | matches_a, unmatched_tracks_a, unmatched_detections = \ 112 | linear_assignment.matching_cascade( 113 | gated_metric, self.metric.matching_threshold, self.max_age, 114 | self.tracks, detections, confirmed_tracks) 115 | 116 | # Associate remaining tracks together with unconfirmed tracks using IOU. 
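        # Only tracks that were matched in the previous frame
        # (time_since_update == 1) take part in the IoU match, together with
        # the still-tentative tracks; tracks missed for longer keep whatever
        # the appearance cascade above decided.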
117 | iou_track_candidates = unconfirmed_tracks + [ 118 | k for k in unmatched_tracks_a if 119 | self.tracks[k].time_since_update == 1] 120 | unmatched_tracks_a = [ 121 | k for k in unmatched_tracks_a if 122 | self.tracks[k].time_since_update != 1] 123 | matches_b, unmatched_tracks_b, unmatched_detections = \ 124 | linear_assignment.min_cost_matching( 125 | iou_matching.iou_cost, self.max_iou_distance, self.tracks, 126 | detections, iou_track_candidates, unmatched_detections) 127 | 128 | matches = matches_a + matches_b 129 | unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b)) 130 | return matches, unmatched_tracks, unmatched_detections 131 | 132 | def _initiate_track(self, detection): 133 | mean, covariance = self.kf.initiate(detection.to_xyah()) 134 | class_name = detection.get_class() 135 | self.tracks.append(Track( 136 | mean, covariance, self._next_id, self.n_init, self.max_age, 137 | detection.feature, class_name, detection.confidence)) 138 | self._next_id += 1 139 | -------------------------------------------------------------------------------- /deepsort/track.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class TrackState: 4 | """ 5 | Enumeration type for the single target track state. Newly created tracks are 6 | classified as `tentative` until enough evidence has been collected. Then, 7 | the track state is changed to `confirmed`. Tracks that are no longer alive 8 | are classified as `deleted` to mark them for removal from the set of active 9 | tracks. 10 | 11 | """ 12 | 13 | Tentative = 1 14 | Confirmed = 2 15 | Deleted = 3 16 | 17 | 18 | class Track: 19 | """ 20 | A single target track with state space `(x, y, a, h)` and associated 21 | velocities, where `(x, y)` is the center of the bounding box, `a` is the 22 | aspect ratio and `h` is the height. 23 | 24 | Parameters 25 | ---------- 26 | mean : ndarray 27 | Mean vector of the initial state distribution. 28 | covariance : ndarray 29 | Covariance matrix of the initial state distribution. 30 | track_id : int 31 | A unique track identifier. 32 | n_init : int 33 | Number of consecutive detections before the track is confirmed. The 34 | track state is set to `Deleted` if a miss occurs within the first 35 | `n_init` frames. 36 | max_age : int 37 | The maximum number of consecutive misses before the track state is 38 | set to `Deleted`. 39 | feature : Optional[ndarray] 40 | Feature vector of the detection this track originates from. If not None, 41 | this feature is added to the `features` cache. 42 | 43 | Attributes 44 | ---------- 45 | mean : ndarray 46 | Mean vector of the initial state distribution. 47 | covariance : ndarray 48 | Covariance matrix of the initial state distribution. 49 | track_id : int 50 | A unique track identifier. 51 | hits : int 52 | Total number of measurement updates. 53 | age : int 54 | Total number of frames since first occurance. 55 | time_since_update : int 56 | Total number of frames since last measurement update. 57 | state : TrackState 58 | The current track state. 59 | features : List[ndarray] 60 | A cache of features. On each measurement update, the associated feature 61 | vector is added to this list. 
62 | 63 | """ 64 | 65 | def __init__(self, mean, covariance, track_id, n_init, max_age, 66 | feature=None, class_name=None, confidence=None): 67 | self.mean = mean 68 | self.covariance = covariance 69 | self.track_id = track_id 70 | self.hits = 1 71 | self.age = 1 72 | self.time_since_update = 0 73 | 74 | self.state = TrackState.Tentative 75 | self.features = [] 76 | if feature is not None: 77 | self.features.append(feature) 78 | 79 | self._n_init = n_init 80 | self._max_age = max_age 81 | self.class_name = class_name 82 | self.confidence = confidence 83 | 84 | def to_tlwh(self): 85 | """Get current position in bounding box format `(top left x, top left y, 86 | width, height)`. 87 | 88 | Returns 89 | ------- 90 | ndarray 91 | The bounding box. 92 | 93 | """ 94 | ret = self.mean[:4].copy() 95 | ret[2] *= ret[3] 96 | ret[:2] -= ret[2:] / 2 97 | return ret 98 | 99 | def to_tlbr(self): 100 | """Get current position in bounding box format `(min x, miny, max x, 101 | max y)`. 102 | 103 | Returns 104 | ------- 105 | ndarray 106 | The bounding box. 107 | 108 | """ 109 | ret = self.to_tlwh() 110 | ret[2:] = ret[:2] + ret[2:] 111 | return ret 112 | 113 | def get_class(self): 114 | return self.class_name 115 | 116 | def get_score(self): 117 | return self.confidence 118 | 119 | def predict(self, kf): 120 | """Propagate the state distribution to the current time step using a 121 | Kalman filter prediction step. 122 | 123 | Parameters 124 | ---------- 125 | kf : kalman_filter.KalmanFilter 126 | The Kalman filter. 127 | 128 | """ 129 | self.mean, self.covariance = kf.predict(self.mean, self.covariance) 130 | self.age += 1 131 | self.time_since_update += 1 132 | 133 | def update(self, kf, detection): 134 | """Perform Kalman filter measurement update step and update the feature 135 | cache. 136 | 137 | Parameters 138 | ---------- 139 | kf : kalman_filter.KalmanFilter 140 | The Kalman filter. 141 | detection : Detection 142 | The associated detection. 143 | 144 | """ 145 | self.mean, self.covariance = kf.update( 146 | self.mean, self.covariance, detection.to_xyah()) 147 | self.features.append(detection.feature) 148 | 149 | self.hits += 1 150 | self.time_since_update = 0 151 | if self.state == TrackState.Tentative and self.hits >= self._n_init: 152 | self.state = TrackState.Confirmed 153 | 154 | def mark_missed(self): 155 | """Mark this track as missed (no association at the current time step). 156 | """ 157 | if self.state == TrackState.Tentative: 158 | self.state = TrackState.Deleted 159 | elif self.time_since_update > self._max_age: 160 | self.state = TrackState.Deleted 161 | 162 | def is_tentative(self): 163 | """Returns True if this track is tentative (unconfirmed). 164 | """ 165 | return self.state == TrackState.Tentative 166 | 167 | def is_confirmed(self): 168 | """Returns True if this track is confirmed.""" 169 | return self.state == TrackState.Confirmed 170 | 171 | def is_deleted(self): 172 | """Returns True if this track is dead and should be deleted.""" 173 | return self.state == TrackState.Deleted 174 | -------------------------------------------------------------------------------- /yolo3/utils.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous utility functions.""" 2 | 3 | from functools import reduce 4 | 5 | from PIL import Image 6 | import numpy as np 7 | from matplotlib.colors import rgb_to_hsv, hsv_to_rgb 8 | 9 | from sort import iou 10 | 11 | 12 | def compose(*funcs): 13 | """Compose arbitrarily many functions, evaluated left to right. 
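    For example, `compose(f, g)(x)` evaluates `g(f(x))`.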
14 | 15 | Reference: https://mathieularose.com/function-composition-in-python/ 16 | """ 17 | # return lambda x: reduce(lambda v, f: f(v), funcs, x) 18 | if funcs: 19 | return reduce(lambda f, g: lambda *a, **kw: g(f(*a, **kw)), funcs) 20 | else: 21 | raise ValueError('Composition of empty sequence not supported.') 22 | 23 | 24 | def letterbox_image(image, size): 25 | '''resize image with unchanged aspect ratio using padding''' 26 | iw, ih = image.size 27 | w, h = size 28 | scale = min(w / iw, h / ih) 29 | nw = int(iw * scale) 30 | nh = int(ih * scale) 31 | 32 | image = image.resize((nw, nh), Image.BICUBIC) 33 | new_image = Image.new('RGB', size, (128, 128, 128)) 34 | new_image.paste(image, ((w - nw) // 2, (h - nh) // 2)) 35 | return new_image 36 | 37 | 38 | def rand(a=0, b=1): 39 | return np.random.rand() * (b - a) + a 40 | 41 | 42 | def get_random_data(annotation_line, input_shape, random=True, max_boxes=20, jitter=.3, hue=.1, sat=1.5, val=1.5, 43 | proc_img=True): 44 | '''random preprocessing for real-time data augmentation''' 45 | line = annotation_line.split() 46 | image = Image.open(line[0]) 47 | iw, ih = image.size 48 | h, w = input_shape 49 | box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]]) 50 | 51 | if not random: 52 | # resize image 53 | scale = min(w / iw, h / ih) 54 | nw = int(iw * scale) 55 | nh = int(ih * scale) 56 | dx = (w - nw) // 2 57 | dy = (h - nh) // 2 58 | image_data = 0 59 | if proc_img: 60 | image = image.resize((nw, nh), Image.BICUBIC) 61 | new_image = Image.new('RGB', (w, h), (128, 128, 128)) 62 | new_image.paste(image, (dx, dy)) 63 | image_data = np.array(new_image) / 255. 64 | 65 | # correct boxes 66 | box_data = np.zeros((max_boxes, 5)) 67 | if len(box) > 0: 68 | np.random.shuffle(box) 69 | if len(box) > max_boxes: box = box[:max_boxes] 70 | box[:, [0, 2]] = box[:, [0, 2]] * scale + dx 71 | box[:, [1, 3]] = box[:, [1, 3]] * scale + dy 72 | box_data[:len(box)] = box 73 | 74 | return image_data, box_data 75 | 76 | # resize image 77 | new_ar = w / h * rand(1 - jitter, 1 + jitter) / rand(1 - jitter, 1 + jitter) 78 | scale = rand(.25, 2) 79 | if new_ar < 1: 80 | nh = int(scale * h) 81 | nw = int(nh * new_ar) 82 | else: 83 | nw = int(scale * w) 84 | nh = int(nw / new_ar) 85 | image = image.resize((nw, nh), Image.BICUBIC) 86 | 87 | # place image 88 | dx = int(rand(0, w - nw)) 89 | dy = int(rand(0, h - nh)) 90 | new_image = Image.new('RGB', (w, h), (128, 128, 128)) 91 | new_image.paste(image, (dx, dy)) 92 | image = new_image 93 | 94 | # flip image or not 95 | flip = rand() < .5 96 | if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT) 97 | 98 | # distort image 99 | hue = rand(-hue, hue) 100 | sat = rand(1, sat) if rand() < .5 else 1 / rand(1, sat) 101 | val = rand(1, val) if rand() < .5 else 1 / rand(1, val) 102 | x = rgb_to_hsv(np.array(image) / 255.) 
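    # the hue shift is additive and wrapped back into [0, 1]; saturation and
    # value are scaled multiplicatively, and the result is clipped to [0, 1]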
103 | x[..., 0] += hue 104 | x[..., 0][x[..., 0] > 1] -= 1 105 | x[..., 0][x[..., 0] < 0] += 1 106 | x[..., 1] *= sat 107 | x[..., 2] *= val 108 | x[x > 1] = 1 109 | x[x < 0] = 0 110 | image_data = hsv_to_rgb(x) # numpy array, 0 to 1 111 | 112 | # correct boxes 113 | box_data = np.zeros((max_boxes, 5)) 114 | if len(box) > 0: 115 | np.random.shuffle(box) 116 | box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx 117 | box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy 118 | if flip: box[:, [0, 2]] = w - box[:, [2, 0]] 119 | box[:, 0:2][box[:, 0:2] < 0] = 0 120 | box[:, 2][box[:, 2] > w] = w 121 | box[:, 3][box[:, 3] > h] = h 122 | box_w = box[:, 2] - box[:, 0] 123 | box_h = box[:, 3] - box[:, 1] 124 | box = box[np.logical_and(box_w > 1, box_h > 1)] # discard invalid box 125 | if len(box) > max_boxes: box = box[:max_boxes] 126 | box_data[:len(box)] = box 127 | 128 | return image_data, box_data 129 | 130 | 131 | def delete_repeat_bbox(out_boxes, out_scores, out_classes, iou_threshold): 132 | '''Delete the same bboxes marked as different classes''' 133 | to_del = [] 134 | for i in range(0, len(out_classes) - 1): 135 | for j in range(i + 1, len(out_classes)): 136 | if (i not in to_del) and (j not in to_del): 137 | # bounding box 1 138 | y1_1, x1_1, y2_1, x2_1 = out_boxes[i] 139 | # bounding box 2 140 | y1_2, x1_2, y2_2, x2_2 = out_boxes[j] 141 | if iou([x1_1, y1_1, x2_1, y2_1], [x1_2, y1_2, x2_2, y2_2]) >= iou_threshold: 142 | if out_scores[i] >= out_scores[j]: 143 | to_del.append(j) 144 | else: 145 | to_del.append(i) 146 | 147 | to_del = sorted(to_del) 148 | 149 | for t in reversed(to_del): 150 | out_boxes.pop(t) 151 | out_scores.pop(t) 152 | out_classes.pop(t) 153 | 154 | return np.array(out_boxes), np.array(out_scores), np.array(out_classes) 155 | 156 | 157 | # boxes: np.array 158 | def convert_boxes(boxes): 159 | # [x1, y1, x2, y2] -> 160 | returned_boxes = [] 161 | for box in boxes: 162 | box = box.astype(int) 163 | box[2] = int(box[2]-box[0]) # width 164 | box[3] = int(box[3]-box[1]) # height 165 | box = box.astype(int) 166 | box = box.tolist() 167 | if box != [0, 0, 0, 0]: 168 | returned_boxes.append(box) 169 | return returned_boxes -------------------------------------------------------------------------------- /deepsort/nn_matching.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def _pdist(a, b): 5 | """Compute pair-wise squared distance between points in `a` and `b`. 6 | 7 | Parameters 8 | ---------- 9 | a : array_like 10 | An NxM matrix of N samples of dimensionality M. 11 | b : array_like 12 | An LxM matrix of L samples of dimensionality M. 13 | 14 | Returns 15 | ------- 16 | ndarray 17 | Returns a matrix of size len(a), len(b) such that eleement (i, j) 18 | contains the squared distance between `a[i]` and `b[j]`. 19 | 20 | """ 21 | a, b = np.asarray(a), np.asarray(b) 22 | if len(a) == 0 or len(b) == 0: 23 | return np.zeros((len(a), len(b))) 24 | a2, b2 = np.square(a).sum(axis=1), np.square(b).sum(axis=1) 25 | r2 = -2. * np.dot(a, b.T) + a2[:, None] + b2[None, :] 26 | r2 = np.clip(r2, 0., float(np.inf)) 27 | return r2 28 | 29 | 30 | def _cosine_distance(a, b, data_is_normalized=False): 31 | """Compute pair-wise cosine distance between points in `a` and `b`. 32 | 33 | Parameters 34 | ---------- 35 | a : array_like 36 | An NxM matrix of N samples of dimensionality M. 37 | b : array_like 38 | An LxM matrix of L samples of dimensionality M. 
39 | data_is_normalized : Optional[bool] 40 | If True, assumes rows in a and b are unit length vectors. 41 | Otherwise, a and b are explicitly normalized to lenght 1. 42 | 43 | Returns 44 | ------- 45 | ndarray 46 | Returns a matrix of size len(a), len(b) such that eleement (i, j) 47 | contains the squared distance between `a[i]` and `b[j]`. 48 | 49 | """ 50 | if not data_is_normalized: 51 | a = np.asarray(a) / np.linalg.norm(a, axis=1, keepdims=True) 52 | b = np.asarray(b) / np.linalg.norm(b, axis=1, keepdims=True) 53 | return 1. - np.dot(a, b.T) 54 | 55 | 56 | def _nn_euclidean_distance(x, y): 57 | """ Helper function for nearest neighbor distance metric (Euclidean). 58 | 59 | Parameters 60 | ---------- 61 | x : ndarray 62 | A matrix of N row-vectors (sample points). 63 | y : ndarray 64 | A matrix of M row-vectors (query points). 65 | 66 | Returns 67 | ------- 68 | ndarray 69 | A vector of length M that contains for each entry in `y` the 70 | smallest Euclidean distance to a sample in `x`. 71 | 72 | """ 73 | distances = _pdist(x, y) 74 | return np.maximum(0.0, distances.min(axis=0)) 75 | 76 | 77 | def _nn_cosine_distance(x, y): 78 | """ Helper function for nearest neighbor distance metric (cosine). 79 | 80 | Parameters 81 | ---------- 82 | x : ndarray 83 | A matrix of N row-vectors (sample points). 84 | y : ndarray 85 | A matrix of M row-vectors (query points). 86 | 87 | Returns 88 | ------- 89 | ndarray 90 | A vector of length M that contains for each entry in `y` the 91 | smallest cosine distance to a sample in `x`. 92 | 93 | """ 94 | distances = _cosine_distance(x, y) 95 | return distances.min(axis=0) 96 | 97 | 98 | class NearestNeighborDistanceMetric(object): 99 | """ 100 | A nearest neighbor distance metric that, for each target, returns 101 | the closest distance to any sample that has been observed so far. 102 | 103 | Parameters 104 | ---------- 105 | metric : str 106 | Either "euclidean" or "cosine". 107 | matching_threshold: float 108 | The matching threshold. Samples with larger distance are considered an 109 | invalid match. 110 | budget : Optional[int] 111 | If not None, fix samples per class to at most this number. Removes 112 | the oldest samples when the budget is reached. 113 | 114 | Attributes 115 | ---------- 116 | samples : Dict[int -> List[ndarray]] 117 | A dictionary that maps from target identities to the list of samples 118 | that have been observed so far. 119 | 120 | """ 121 | 122 | def __init__(self, metric, matching_threshold, budget=None): 123 | 124 | 125 | if metric == "euclidean": 126 | self._metric = _nn_euclidean_distance 127 | elif metric == "cosine": 128 | self._metric = _nn_cosine_distance 129 | else: 130 | raise ValueError( 131 | "Invalid metric; must be either 'euclidean' or 'cosine'") 132 | self.matching_threshold = matching_threshold 133 | self.budget = budget 134 | self.samples = {} 135 | 136 | def partial_fit(self, features, targets, active_targets): 137 | """Update the distance metric with new data. 138 | 139 | Parameters 140 | ---------- 141 | features : ndarray 142 | An NxM matrix of N features of dimensionality M. 143 | targets : ndarray 144 | An integer array of associated target identities. 145 | active_targets : List[int] 146 | A list of targets that are currently present in the scene. 
147 | 148 | """ 149 | for feature, target in zip(features, targets): 150 | self.samples.setdefault(target, []).append(feature) 151 | if self.budget is not None: 152 | self.samples[target] = self.samples[target][-self.budget:] 153 | self.samples = {k: self.samples[k] for k in active_targets} 154 | 155 | def distance(self, features, targets): 156 | """Compute distance between features and targets. 157 | 158 | Parameters 159 | ---------- 160 | features : ndarray 161 | An NxM matrix of N features of dimensionality M. 162 | targets : List[int] 163 | A list of targets to match the given `features` against. 164 | 165 | Returns 166 | ------- 167 | ndarray 168 | Returns a cost matrix of shape len(targets), len(features), where 169 | element (i, j) contains the closest squared distance between 170 | `targets[i]` and `features[j]`. 171 | 172 | """ 173 | cost_matrix = np.zeros((len(targets), len(features))) 174 | for i, target in enumerate(targets): 175 | cost_matrix[i, :] = self._metric(self.samples[target], features) 176 | return cost_matrix 177 | -------------------------------------------------------------------------------- /yolo3/generate_detections.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | import argparse 4 | import numpy as np 5 | import cv2 6 | import tensorflow.compat.v1 as tf 7 | 8 | 9 | def _run_in_batches(f, data_dict, out, batch_size): 10 | data_len = len(out) 11 | num_batches = int(data_len / batch_size) 12 | 13 | s, e = 0, 0 14 | for i in range(num_batches): 15 | s, e = i * batch_size, (i + 1) * batch_size 16 | batch_data_dict = {k: v[s:e] for k, v in data_dict.items()} 17 | out[s:e] = f(batch_data_dict) 18 | if e < len(out): 19 | batch_data_dict = {k: v[e:] for k, v in data_dict.items()} 20 | out[e:] = f(batch_data_dict) 21 | 22 | 23 | def extract_image_patch(image, bbox, patch_shape): 24 | """Extract image patch from bounding box. 25 | Parameters 26 | ---------- 27 | image : ndarray 28 | The full image. 29 | bbox : array_like 30 | The bounding box in format (x, y, width, height). 31 | patch_shape : Optional[array_like] 32 | This parameter can be used to enforce a desired patch shape 33 | (height, width). First, the `bbox` is adapted to the aspect ratio 34 | of the patch shape, then it is clipped at the image boundaries. 35 | If None, the shape is computed from :arg:`bbox`. 36 | Returns 37 | ------- 38 | ndarray | NoneType 39 | An image patch showing the :arg:`bbox`, optionally reshaped to 40 | :arg:`patch_shape`. 41 | Returns None if the bounding box is empty or fully outside of the image 42 | boundaries. 
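    For example, with `patch_shape=(128, 64)` the box is first adjusted to the
    aspect ratio 64/128 = 0.5 (width / height), clipped to the image
    boundaries, and the patch is then resized to 128x64 (height x width).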
43 | """ 44 | bbox = np.array(bbox) 45 | if patch_shape is not None: 46 | # correct aspect ratio to patch shape 47 | target_aspect = float(patch_shape[1]) / patch_shape[0] 48 | new_width = target_aspect * bbox[3] 49 | bbox[0] -= (new_width - bbox[2]) / 2 50 | bbox[2] = new_width 51 | 52 | # convert to top left, bottom right 53 | bbox[2:] += bbox[:2] 54 | bbox = bbox.astype(np.int) 55 | 56 | # clip at image boundaries 57 | bbox[:2] = np.maximum(0, bbox[:2]) 58 | bbox[2:] = np.minimum(np.asarray(image.shape[:2][::-1]) - 1, bbox[2:]) 59 | if np.any(bbox[:2] >= bbox[2:]): 60 | return None 61 | sx, sy, ex, ey = bbox 62 | image = image[sy:ey, sx:ex] 63 | image = cv2.resize(image, tuple(patch_shape[::-1])) 64 | return image 65 | 66 | 67 | class ImageEncoder(object): 68 | 69 | def __init__(self, checkpoint_filename, input_name="images", 70 | output_name="features"): 71 | self.session = tf.Session() 72 | with tf.gfile.GFile(checkpoint_filename, "rb") as file_handle: 73 | graph_def = tf.GraphDef() 74 | graph_def.ParseFromString(file_handle.read()) 75 | tf.import_graph_def(graph_def, name="net") 76 | self.input_var = tf.get_default_graph().get_tensor_by_name( 77 | "net/%s:0" % input_name) 78 | self.output_var = tf.get_default_graph().get_tensor_by_name( 79 | "net/%s:0" % output_name) 80 | 81 | assert len(self.output_var.get_shape()) == 2 82 | assert len(self.input_var.get_shape()) == 4 83 | self.feature_dim = self.output_var.get_shape().as_list()[-1] 84 | self.image_shape = self.input_var.get_shape().as_list()[1:] 85 | 86 | def __call__(self, data_x, batch_size=32): 87 | out = np.zeros((len(data_x), self.feature_dim), np.float32) 88 | _run_in_batches( 89 | lambda x: self.session.run(self.output_var, feed_dict=x), 90 | {self.input_var: data_x}, out, batch_size) 91 | return out 92 | 93 | 94 | def create_box_encoder(model_filename, input_name="images", 95 | output_name="features", batch_size=32): 96 | image_encoder = ImageEncoder(model_filename, input_name, output_name) 97 | image_shape = image_encoder.image_shape 98 | 99 | def encoder(image, boxes): 100 | image_patches = [] 101 | for box in boxes: 102 | patch = extract_image_patch(image, box, image_shape[:2]) 103 | if patch is None: 104 | print("WARNING: Failed to extract image patch: %s." % str(box)) 105 | patch = np.random.uniform( 106 | 0., 255., image_shape).astype(np.uint8) 107 | image_patches.append(patch) 108 | image_patches = np.asarray(image_patches) 109 | return image_encoder(image_patches, batch_size) 110 | 111 | return encoder 112 | 113 | 114 | def generate_detections(encoder, mot_dir, output_dir, detection_dir=None): 115 | """Generate detections with features. 116 | Parameters 117 | ---------- 118 | encoder : Callable[image, ndarray] -> ndarray 119 | The encoder function takes as input a BGR color image and a matrix of 120 | bounding boxes in format `(x, y, w, h)` and returns a matrix of 121 | corresponding feature vectors. 122 | mot_dir : str 123 | Path to the MOTChallenge directory (can be either train or test). 124 | output_dir 125 | Path to the output directory. Will be created if it does not exist. 126 | detection_dir 127 | Path to custom detections. The directory structure should be the default 128 | MOTChallenge structure: `[sequence]/det/det.txt`. If None, uses the 129 | standard MOTChallenge detections. 
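    Example (the paths here are illustrative):

    >>> encoder = create_box_encoder('model_data/mars-small128.pb')
    >>> generate_detections(encoder, 'MOT16/train', './detections')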
130 | """ 131 | if detection_dir is None: 132 | detection_dir = mot_dir 133 | try: 134 | os.makedirs(output_dir) 135 | except OSError as exception: 136 | if exception.errno == errno.EEXIST and os.path.isdir(output_dir): 137 | pass 138 | else: 139 | raise ValueError( 140 | "Failed to created output directory '%s'" % output_dir) 141 | 142 | for sequence in os.listdir(mot_dir): 143 | print("Processing %s" % sequence) 144 | sequence_dir = os.path.join(mot_dir, sequence) 145 | 146 | image_dir = os.path.join(sequence_dir, "img1") 147 | image_filenames = { 148 | int(os.path.splitext(f)[0]): os.path.join(image_dir, f) 149 | for f in os.listdir(image_dir)} 150 | 151 | detection_file = os.path.join( 152 | detection_dir, sequence, "det/det.txt") 153 | detections_in = np.loadtxt(detection_file, delimiter=',') 154 | detections_out = [] 155 | 156 | frame_indices = detections_in[:, 0].astype(np.int) 157 | min_frame_idx = frame_indices.astype(np.int).min() 158 | max_frame_idx = frame_indices.astype(np.int).max() 159 | for frame_idx in range(min_frame_idx, max_frame_idx + 1): 160 | print("Frame %05d/%05d" % (frame_idx, max_frame_idx)) 161 | mask = frame_indices == frame_idx 162 | rows = detections_in[mask] 163 | 164 | if frame_idx not in image_filenames: 165 | print("WARNING could not find image for frame %d" % frame_idx) 166 | continue 167 | bgr_image = cv2.imread( 168 | image_filenames[frame_idx], cv2.IMREAD_COLOR) 169 | features = encoder(bgr_image, rows[:, 2:6].copy()) 170 | detections_out += [np.r_[(row, feature)] for row, feature 171 | in zip(rows, features)] 172 | 173 | output_filename = os.path.join(output_dir, "%s.npy" % sequence) 174 | np.save( 175 | output_filename, np.asarray(detections_out), allow_pickle=False) -------------------------------------------------------------------------------- /deepsort/linear_assignment.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | from scipy.optimize import linear_sum_assignment 4 | from . import kalman_filter 5 | 6 | 7 | INFTY_COST = 1e+5 8 | 9 | 10 | def min_cost_matching( 11 | distance_metric, max_distance, tracks, detections, track_indices=None, 12 | detection_indices=None): 13 | """Solve linear assignment problem. 14 | 15 | Parameters 16 | ---------- 17 | distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray 18 | The distance metric is given a list of tracks and detections as well as 19 | a list of N track indices and M detection indices. The metric should 20 | return the NxM dimensional cost matrix, where element (i, j) is the 21 | association cost between the i-th track in the given track indices and 22 | the j-th detection in the given detection_indices. 23 | max_distance : float 24 | Gating threshold. Associations with cost larger than this value are 25 | disregarded. 26 | tracks : List[track.Track] 27 | A list of predicted tracks at the current time step. 28 | detections : List[detection.Detection] 29 | A list of detections at the current time step. 30 | track_indices : List[int] 31 | List of track indices that maps rows in `cost_matrix` to tracks in 32 | `tracks` (see description above). 33 | detection_indices : List[int] 34 | List of detection indices that maps columns in `cost_matrix` to 35 | detections in `detections` (see description above). 
36 | 
37 |     Returns
38 |     -------
39 |     (List[(int, int)], List[int], List[int])
40 |         Returns a tuple with the following three entries:
41 |         * A list of matched track and detection indices.
42 |         * A list of unmatched track indices.
43 |         * A list of unmatched detection indices.
44 | 
45 |     """
46 |     if track_indices is None:
47 |         track_indices = np.arange(len(tracks))
48 |     if detection_indices is None:
49 |         detection_indices = np.arange(len(detections))
50 | 
51 |     if len(detection_indices) == 0 or len(track_indices) == 0:
52 |         return [], track_indices, detection_indices  # Nothing to match.
53 | 
54 |     cost_matrix = distance_metric(
55 |         tracks, detections, track_indices, detection_indices)
56 |     cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5
57 |     indices = linear_sum_assignment(cost_matrix)
58 |     indices = np.asarray(indices)
59 |     indices = np.transpose(indices)
60 |     matches, unmatched_tracks, unmatched_detections = [], [], []
61 |     for col, detection_idx in enumerate(detection_indices):
62 |         if col not in indices[:, 1]:
63 |             unmatched_detections.append(detection_idx)
64 |     for row, track_idx in enumerate(track_indices):
65 |         if row not in indices[:, 0]:
66 |             unmatched_tracks.append(track_idx)
67 |     for row, col in indices:
68 |         track_idx = track_indices[row]
69 |         detection_idx = detection_indices[col]
70 |         if cost_matrix[row, col] > max_distance:
71 |             unmatched_tracks.append(track_idx)
72 |             unmatched_detections.append(detection_idx)
73 |         else:
74 |             matches.append((track_idx, detection_idx))
75 |     return matches, unmatched_tracks, unmatched_detections
76 | 
77 | 
78 | def matching_cascade(
79 |         distance_metric, max_distance, cascade_depth, tracks, detections,
80 |         track_indices=None, detection_indices=None):
81 |     """Run matching cascade.
82 | 
83 |     Parameters
84 |     ----------
85 |     distance_metric : Callable[List[Track], List[Detection], List[int], List[int]] -> ndarray
86 |         The distance metric is given a list of tracks and detections as well as
87 |         a list of N track indices and M detection indices. The metric should
88 |         return the NxM dimensional cost matrix, where element (i, j) is the
89 |         association cost between the i-th track in the given track indices and
90 |         the j-th detection in the given detection indices.
91 |     max_distance : float
92 |         Gating threshold. Associations with cost larger than this value are
93 |         disregarded.
94 |     cascade_depth: int
95 |         The cascade depth, should be set to the maximum track age.
96 |     tracks : List[track.Track]
97 |         A list of predicted tracks at the current time step.
98 |     detections : List[detection.Detection]
99 |         A list of detections at the current time step.
100 |     track_indices : Optional[List[int]]
101 |         List of track indices that maps rows in `cost_matrix` to tracks in
102 |         `tracks` (see description above). Defaults to all tracks.
103 |     detection_indices : Optional[List[int]]
104 |         List of detection indices that maps columns in `cost_matrix` to
105 |         detections in `detections` (see description above). Defaults to all
106 |         detections.
107 | 
108 |     Returns
109 |     -------
110 |     (List[(int, int)], List[int], List[int])
111 |         Returns a tuple with the following three entries:
112 |         * A list of matched track and detection indices.
113 |         * A list of unmatched track indices.
114 |         * A list of unmatched detection indices.
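    Notes
    -----
    Level `level` of the cascade only considers tracks with
    `time_since_update == 1 + level`, so tracks that were updated most
    recently get first claim on the detections; older tracks compete only
    for whatever is left unmatched.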
115 | 
116 |     """
117 |     if track_indices is None:
118 |         track_indices = list(range(len(tracks)))
119 |     if detection_indices is None:
120 |         detection_indices = list(range(len(detections)))
121 | 
122 |     unmatched_detections = detection_indices
123 |     matches = []
124 |     for level in range(cascade_depth):
125 |         if len(unmatched_detections) == 0:  # No detections left
126 |             break
127 | 
128 |         track_indices_l = [
129 |             k for k in track_indices
130 |             if tracks[k].time_since_update == 1 + level
131 |         ]
132 |         if len(track_indices_l) == 0:  # Nothing to match at this level
133 |             continue
134 | 
135 |         matches_l, _, unmatched_detections = \
136 |             min_cost_matching(
137 |                 distance_metric, max_distance, tracks, detections,
138 |                 track_indices_l, unmatched_detections)
139 |         matches += matches_l
140 |     unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches))
141 |     return matches, unmatched_tracks, unmatched_detections
142 | 
143 | 
144 | def gate_cost_matrix(
145 |         kf, cost_matrix, tracks, detections, track_indices, detection_indices,
146 |         gated_cost=INFTY_COST, only_position=False):
147 |     """Invalidate infeasible entries in cost matrix based on the state
148 |     distributions obtained by Kalman filtering.
149 | 
150 |     Parameters
151 |     ----------
152 |     kf : The Kalman filter.
153 |     cost_matrix : ndarray
154 |         The NxM dimensional cost matrix, where N is the number of track indices
155 |         and M is the number of detection indices, such that entry (i, j) is the
156 |         association cost between `tracks[track_indices[i]]` and
157 |         `detections[detection_indices[j]]`.
158 |     tracks : List[track.Track]
159 |         A list of predicted tracks at the current time step.
160 |     detections : List[detection.Detection]
161 |         A list of detections at the current time step.
162 |     track_indices : List[int]
163 |         List of track indices that maps rows in `cost_matrix` to tracks in
164 |         `tracks` (see description above).
165 |     detection_indices : List[int]
166 |         List of detection indices that maps columns in `cost_matrix` to
167 |         detections in `detections` (see description above).
168 |     gated_cost : Optional[float]
169 |         Entries in the cost matrix corresponding to infeasible associations are
170 |         set to this value. Defaults to a very large value.
171 |     only_position : Optional[bool]
172 |         If True, only the x, y position of the state distribution is considered
173 |         during gating. Defaults to False.
174 | 
175 |     Returns
176 |     -------
177 |     ndarray
178 |         Returns the modified cost matrix.
179 | 
180 |     """
181 |     gating_dim = 2 if only_position else 4
182 |     gating_threshold = kalman_filter.chi2inv95[gating_dim]
183 |     measurements = np.asarray(
184 |         [detections[i].to_xyah() for i in detection_indices])
185 |     for row, track_idx in enumerate(track_indices):
186 |         track = tracks[track_idx]
187 |         gating_distance = kf.gating_distance(
188 |             track.mean, track.covariance, measurements, only_position)
189 |         cost_matrix[row, gating_distance > gating_threshold] = gated_cost
190 |     return cost_matrix
191 | 
--------------------------------------------------------------------------------
/deepsort/kalman_filter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.linalg
3 | 
4 | 
5 | """
6 | Table for the 0.95 quantile of the chi-square distribution with N degrees of
7 | freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv
8 | function and used as Mahalanobis gating threshold.
9 | """ 10 | chi2inv95 = { 11 | 1: 3.8415, 12 | 2: 5.9915, 13 | 3: 7.8147, 14 | 4: 9.4877, 15 | 5: 11.070, 16 | 6: 12.592, 17 | 7: 14.067, 18 | 8: 15.507, 19 | 9: 16.919} 20 | 21 | 22 | class KalmanFilter(object): 23 | """ 24 | A simple Kalman filter for tracking bounding boxes in image space. 25 | 26 | The 8-dimensional state space 27 | 28 | x, y, a, h, vx, vy, va, vh 29 | 30 | contains the bounding box center position (x, y), aspect ratio a, height h, 31 | and their respective velocities. 32 | 33 | Object motion follows a constant velocity model. The bounding box location 34 | (x, y, a, h) is taken as direct observation of the state space (linear 35 | observation model). 36 | 37 | """ 38 | 39 | def __init__(self): 40 | ndim, dt = 4, 1. 41 | 42 | # Create Kalman filter model matrices. 43 | self._motion_mat = np.eye(2 * ndim, 2 * ndim) 44 | for i in range(ndim): 45 | self._motion_mat[i, ndim + i] = dt 46 | self._update_mat = np.eye(ndim, 2 * ndim) 47 | 48 | # Motion and observation uncertainty are chosen relative to the current 49 | # state estimate. These weights control the amount of uncertainty in 50 | # the model. This is a bit hacky. 51 | self._std_weight_position = 1. / 20 52 | self._std_weight_velocity = 1. / 160 53 | 54 | def initiate(self, measurement): 55 | """Create track from unassociated measurement. 56 | 57 | Parameters 58 | ---------- 59 | measurement : ndarray 60 | Bounding box coordinates (x, y, a, h) with center position (x, y), 61 | aspect ratio a, and height h. 62 | 63 | Returns 64 | ------- 65 | (ndarray, ndarray) 66 | Returns the mean vector (8 dimensional) and covariance matrix (8x8 67 | dimensional) of the new track. Unobserved velocities are initialized 68 | to 0 mean. 69 | 70 | """ 71 | mean_pos = measurement 72 | mean_vel = np.zeros_like(mean_pos) 73 | mean = np.r_[mean_pos, mean_vel] 74 | 75 | std = [ 76 | 2 * self._std_weight_position * measurement[3], 77 | 2 * self._std_weight_position * measurement[3], 78 | 1e-2, 79 | 2 * self._std_weight_position * measurement[3], 80 | 10 * self._std_weight_velocity * measurement[3], 81 | 10 * self._std_weight_velocity * measurement[3], 82 | 1e-5, 83 | 10 * self._std_weight_velocity * measurement[3]] 84 | covariance = np.diag(np.square(std)) 85 | return mean, covariance 86 | 87 | def predict(self, mean, covariance): 88 | """Run Kalman filter prediction step. 89 | 90 | Parameters 91 | ---------- 92 | mean : ndarray 93 | The 8 dimensional mean vector of the object state at the previous 94 | time step. 95 | covariance : ndarray 96 | The 8x8 dimensional covariance matrix of the object state at the 97 | previous time step. 98 | 99 | Returns 100 | ------- 101 | (ndarray, ndarray) 102 | Returns the mean vector and covariance matrix of the predicted 103 | state. Unobserved velocities are initialized to 0 mean. 104 | 105 | """ 106 | std_pos = [ 107 | self._std_weight_position * mean[3], 108 | self._std_weight_position * mean[3], 109 | 1e-2, 110 | self._std_weight_position * mean[3]] 111 | std_vel = [ 112 | self._std_weight_velocity * mean[3], 113 | self._std_weight_velocity * mean[3], 114 | 1e-5, 115 | self._std_weight_velocity * mean[3]] 116 | motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) 117 | 118 | mean = np.dot(self._motion_mat, mean) 119 | covariance = np.linalg.multi_dot(( 120 | self._motion_mat, covariance, self._motion_mat.T)) + motion_cov 121 | 122 | return mean, covariance 123 | 124 | def project(self, mean, covariance): 125 | """Project state distribution to measurement space. 
126 | 127 | Parameters 128 | ---------- 129 | mean : ndarray 130 | The state's mean vector (8 dimensional array). 131 | covariance : ndarray 132 | The state's covariance matrix (8x8 dimensional). 133 | 134 | Returns 135 | ------- 136 | (ndarray, ndarray) 137 | Returns the projected mean and covariance matrix of the given state 138 | estimate. 139 | 140 | """ 141 | std = [ 142 | self._std_weight_position * mean[3], 143 | self._std_weight_position * mean[3], 144 | 1e-1, 145 | self._std_weight_position * mean[3]] 146 | innovation_cov = np.diag(np.square(std)) 147 | 148 | mean = np.dot(self._update_mat, mean) 149 | covariance = np.linalg.multi_dot(( 150 | self._update_mat, covariance, self._update_mat.T)) 151 | return mean, covariance + innovation_cov 152 | 153 | def update(self, mean, covariance, measurement): 154 | """Run Kalman filter correction step. 155 | 156 | Parameters 157 | ---------- 158 | mean : ndarray 159 | The predicted state's mean vector (8 dimensional). 160 | covariance : ndarray 161 | The state's covariance matrix (8x8 dimensional). 162 | measurement : ndarray 163 | The 4 dimensional measurement vector (x, y, a, h), where (x, y) 164 | is the center position, a the aspect ratio, and h the height of the 165 | bounding box. 166 | 167 | Returns 168 | ------- 169 | (ndarray, ndarray) 170 | Returns the measurement-corrected state distribution. 171 | 172 | """ 173 | projected_mean, projected_cov = self.project(mean, covariance) 174 | 175 | chol_factor, lower = scipy.linalg.cho_factor( 176 | projected_cov, lower=True, check_finite=False) 177 | kalman_gain = scipy.linalg.cho_solve( 178 | (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, 179 | check_finite=False).T 180 | innovation = measurement - projected_mean 181 | 182 | new_mean = mean + np.dot(innovation, kalman_gain.T) 183 | new_covariance = covariance - np.linalg.multi_dot(( 184 | kalman_gain, projected_cov, kalman_gain.T)) 185 | return new_mean, new_covariance 186 | 187 | def gating_distance(self, mean, covariance, measurements, 188 | only_position=False): 189 | """Compute gating distance between state distribution and measurements. 190 | 191 | A suitable distance threshold can be obtained from `chi2inv95`. If 192 | `only_position` is False, the chi-square distribution has 4 degrees of 193 | freedom, otherwise 2. 194 | 195 | Parameters 196 | ---------- 197 | mean : ndarray 198 | Mean vector over the state distribution (8 dimensional). 199 | covariance : ndarray 200 | Covariance of the state distribution (8x8 dimensional). 201 | measurements : ndarray 202 | An Nx4 dimensional matrix of N measurements, each in 203 | format (x, y, a, h) where (x, y) is the bounding box center 204 | position, a the aspect ratio, and h the height. 205 | only_position : Optional[bool] 206 | If True, distance computation is done with respect to the bounding 207 | box center position only. 208 | 209 | Returns 210 | ------- 211 | ndarray 212 | Returns an array of length N, where the i-th element contains the 213 | squared Mahalanobis distance between (mean, covariance) and 214 | `measurements[i]`. 
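        Example
        -------
        With `only_position=False` the distance is chi-square distributed
        with 4 degrees of freedom, so `gating_distance(...) > chi2inv95[4]`
        (9.4877) marks a measurement as an infeasible association; this is
        how `linear_assignment.gate_cost_matrix` uses this method.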
215 | 
216 |         """
217 |         mean, covariance = self.project(mean, covariance)
218 |         if only_position:
219 |             mean, covariance = mean[:2], covariance[:2, :2]
220 |             measurements = measurements[:, :2]
221 | 
222 |         cholesky_factor = np.linalg.cholesky(covariance)
223 |         d = measurements - mean
224 |         z = scipy.linalg.solve_triangular(
225 |             cholesky_factor, d.T, lower=True, check_finite=False,
226 |             overwrite_b=True)
227 |         squared_maha = np.sum(z * z, axis=0)
228 |         return squared_maha
229 | 
--------------------------------------------------------------------------------
/sort.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | 
3 | import numpy as np
4 | from scipy.optimize import linear_sum_assignment
5 | from filterpy.kalman import KalmanFilter
6 | 
7 | 
8 | def iou(bb_test, bb_gt):
9 |     """
10 |     Computes IOU between two bboxes in the form [x1,y1,x2,y2]
11 |     """
12 |     xx1 = np.maximum(bb_test[0], bb_gt[0])
13 |     yy1 = np.maximum(bb_test[1], bb_gt[1])
14 |     xx2 = np.minimum(bb_test[2], bb_gt[2])
15 |     yy2 = np.minimum(bb_test[3], bb_gt[3])
16 |     w = np.maximum(0., xx2 - xx1)
17 |     h = np.maximum(0., yy2 - yy1)
18 |     wh = w * h
19 |     o = wh / ((bb_test[2] - bb_test[0]) * (bb_test[3] - bb_test[1])
20 |               + (bb_gt[2] - bb_gt[0]) * (bb_gt[3] - bb_gt[1]) - wh)
21 |     return o
22 | 
23 | 
24 | def convert_bbox_to_z(bbox):
25 |     """
26 |     Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form
27 |     [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is
28 |     the aspect ratio
29 |     """
30 |     w = bbox[2] - bbox[0]
31 |     h = bbox[3] - bbox[1]
32 |     x = bbox[0] + w / 2.
33 |     y = bbox[1] + h / 2.
34 |     s = w * h  # scale is just area
35 |     r = w / float(h)
36 |     return np.array([x, y, s, r]).reshape((4, 1))
37 | 
38 | 
39 | def convert_x_to_bbox(x, score=None):
40 |     """
41 |     Takes a bounding box in the centre form [x,y,s,r] and returns it in the form
42 |     [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right
43 |     """
44 |     w = np.sqrt(x[2] * x[3])
45 |     h = x[2] / w
46 |     if score is None:
47 |         return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2.]).reshape((1, 4))
48 |     else:
49 |         return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score]).reshape((1, 5))
50 | 
51 | 
52 | class KalmanBoxTracker(object):
53 |     """
54 |     This class represents the internal state of individual tracked objects observed as bbox.
55 |     """
56 |     count = 0
57 | 
58 |     def __init__(self, bbox):
59 |         """
60 |         Initialises a tracker using initial bounding box.
61 |         """
62 |         # define constant velocity model
63 |         self.kf = KalmanFilter(dim_x=7, dim_z=4)
64 |         self.kf.F = np.array(
65 |             [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0],
66 |              [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1]])
67 |         self.kf.H = np.array(
68 |             [[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0]])
69 | 
70 |         self.kf.R[2:, 2:] *= 10.
71 |         self.kf.P[4:, 4:] *= 1000.  # give high uncertainty to the unobservable initial velocities
72 |         self.kf.P *= 10.
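        # State layout is [x, y, s, r, vx, vy, vs]: box centre, scale (area)
        # and aspect ratio plus their velocities; r is assumed constant, so F
        # gives it no velocity term. R inflates measurement noise on s and r,
        # and the P scalings above start the filter with high uncertainty,
        # especially on the unobserved velocities.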
73 |         self.kf.Q[-1, -1] *= 0.01
74 |         self.kf.Q[4:, 4:] *= 0.01
75 | 
76 |         self.kf.x[:4] = convert_bbox_to_z(bbox)
77 |         self.time_since_update = 0
78 |         self.id = KalmanBoxTracker.count
79 |         KalmanBoxTracker.count += 1
80 |         self.history = []
81 |         self.hits = 0
82 |         self.hit_streak = 0
83 |         self.age = 0
84 | 
85 |     def update(self, bbox):
86 |         """
87 |         Updates the state vector with observed bbox.
88 |         """
89 |         self.time_since_update = 0
90 |         self.history = []
91 |         self.hits += 1
92 |         self.hit_streak += 1
93 |         self.kf.update(convert_bbox_to_z(bbox))
94 | 
95 |     def predict(self):
96 |         """
97 |         Advances the state vector and returns the predicted bounding box estimate.
98 |         """
99 |         if (self.kf.x[6] + self.kf.x[2]) <= 0:
100 |             self.kf.x[6] *= 0.0
101 |         self.kf.predict()
102 |         self.age += 1
103 |         if self.time_since_update > 0:
104 |             self.hit_streak = 0
105 |         self.time_since_update += 1
106 |         self.history.append(convert_x_to_bbox(self.kf.x))
107 |         return self.history[-1]
108 | 
109 |     def get_state(self):
110 |         """
111 |         Returns the current bounding box estimate.
112 |         """
113 |         return convert_x_to_bbox(self.kf.x)
114 | 
115 | 
116 | def associate_detections_to_trackers(detections, trackers, iou_threshold=0.3):
117 |     """
118 |     Assigns detections to tracked objects (both represented as bounding boxes)
119 |     Returns 3 lists of matches, unmatched_detections and unmatched_trackers
120 |     """
121 |     if len(trackers) == 0:
122 |         return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 1), dtype=int)
123 |     iou_matrix = np.zeros((len(detections), len(trackers)), dtype=np.float32)
124 | 
125 |     for d, det in enumerate(detections):
126 |         for t, trk in enumerate(trackers):
127 |             iou_matrix[d, t] = iou(det, trk)
128 |     row_ind, col_ind = linear_sum_assignment(-iou_matrix)
129 |     matched_indices = np.zeros(shape=(row_ind.shape[0], 2), dtype=np.int64)
130 |     matched_indices[:, 0] = row_ind
131 |     matched_indices[:, 1] = col_ind
132 | 
133 |     unmatched_detections = []
134 |     for d, det in enumerate(detections):
135 |         if d not in matched_indices[:, 0]:
136 |             unmatched_detections.append(d)
137 |     unmatched_trackers = []
138 |     for t, trk in enumerate(trackers):
139 |         if t not in matched_indices[:, 1]:
140 |             unmatched_trackers.append(t)
141 | 
142 |     # reject matches that have both a low IOU and a class mismatch
143 |     matches = []
144 |     for m in matched_indices:
145 |         if (iou_matrix[m[0], m[1]] < iou_threshold) and (int(detections[m[0]][5]) != int(trackers[m[1]][5])):
146 |             unmatched_detections.append(m[0])
147 |             unmatched_trackers.append(m[1])
148 |         else:
149 |             matches.append(m.reshape(1, 2))
150 |     if len(matches) == 0:
151 |         matches = np.empty((0, 2), dtype=int)
152 |     else:
153 |         matches = np.concatenate(matches, axis=0)
154 | 
155 |     return matches, np.array(unmatched_detections), np.array(unmatched_trackers)
156 | 
157 | 
158 | class Sort(object):
159 |     def __init__(self, max_age=2, min_hits=3):
160 |         """
161 |         Sets key parameters for SORT
162 |         """
163 |         self.max_age = max_age
164 |         self.min_hits = min_hits
165 |         self.trackers = []
166 |         self.scores = []
167 |         self.types = []
168 |         self.frame_count = 0
169 | 
170 |     def update(self, dets):
171 |         """
172 |         Params:
173 |         dets - a numpy array of detections in the format [[x1,y1,x2,y2,score,type],[x1,y1,x2,y2,score,type],...]
174 |         Requires: this method must be called once for each frame even with empty detections.
175 |         Returns a numpy array in the format [x1,y1,x2,y2,object_id,score,type]
176 |         NOTE: The number of objects returned may differ from the number of detections provided.
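        A minimal usage sketch (the detection numbers here are made up):

            mot_tracker = Sort()
            dets = np.array([[10., 20., 50., 80., 0.9, 2.]])  # x1,y1,x2,y2,score,type
            tracks = mot_tracker.update(dets)     # rows: x1,y1,x2,y2,id,score,type
            mot_tracker.update(np.empty((0, 6)))  # call even on empty frames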
177 | """ 178 | self.frame_count += 1 179 | # get predicted locations from existing trackers. 180 | trks = np.zeros((len(self.trackers), 6)) 181 | to_del = [] 182 | ret = [] 183 | for t, trk in enumerate(trks): 184 | pos = self.trackers[t].predict()[0] 185 | trk[:] = [pos[0], pos[1], pos[2], pos[3], self.scores[t], self.types[t]] 186 | if np.any(np.isnan(pos)): 187 | to_del.append(t) 188 | trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) 189 | for t in reversed(to_del): 190 | self.trackers.pop(t) 191 | self.scores.pop(t) 192 | self.types.pop(t) 193 | matched, unmatched_dets, unmatched_trks = associate_detections_to_trackers(dets, trks) 194 | 195 | # update matched trackers with assigned detections 196 | for t, trk in enumerate(self.trackers): 197 | if t not in unmatched_trks: 198 | d = matched[np.where(matched[:, 1] == t)[0], 0] 199 | trk.update(dets[d, :][0]) 200 | self.scores[t] = dets[d, :][0][4] 201 | self.types[t] = dets[d, :][0][5] 202 | 203 | # create and initialise new trackers for unmatched detections 204 | for i in unmatched_dets: 205 | trk = KalmanBoxTracker(dets[i, 0:5]) 206 | self.trackers.append(trk) 207 | self.scores.append(dets[i, :][4]) 208 | self.types.append(dets[i, :][5]) 209 | i = len(self.trackers) 210 | for trk in reversed(self.trackers): 211 | pos = trk.get_state()[0] 212 | i -= 1 213 | if (trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits): 214 | ret.append(np.concatenate((pos, [trk.id + 1], [self.scores[i]], [self.types[i]])).reshape(1, -1)) # +1 as MOT benchmark requires positive 215 | # remove dead tracklet 216 | if trk.time_since_update > self.max_age: 217 | self.trackers.pop(i) 218 | self.scores.pop(i) 219 | self.types.pop(i) 220 | 221 | if len(ret) > 0: 222 | return np.concatenate(ret) 223 | else: 224 | return np.empty((0, 5)) 225 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Retrain the YOLO model for your own dataset. 
3 | """ 4 | 5 | import numpy as np 6 | import keras.backend as K 7 | from keras.layers import Input, Lambda 8 | from keras.models import Model 9 | from keras.optimizers import Adam 10 | from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping 11 | 12 | from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss 13 | 14 | import read_data_cifar100 15 | from PIL import Image 16 | 17 | 18 | def _main(): 19 | # constant 20 | path = 'cifar-100-python/' 21 | log_dir = 'output/' 22 | classes_path = 'model_data/cifar_classes.txt' 23 | anchors_path = 'model_data/yolo_anchors.txt' 24 | # anchors_path = 'model_data/tiny_yolo_anchors.txt' 25 | pretrained_weight = 'model_h5/yolo.h5' 26 | pretrained_weight_tiny = 'model_h5/yolo-tiny.h5' 27 | pretrained = True 28 | 29 | # epoch 30 | epoch_first = 10 31 | epoch_second = 20 32 | 33 | class_names = get_classes(classes_path) 34 | num_classes = len(class_names) 35 | anchors = get_anchors(anchors_path) 36 | 37 | input_shape = (416, 416) # multiple of 32, hw; (32, 32) in cifar-100 38 | 39 | is_tiny_version = len(anchors) == 6 # default setting 40 | if is_tiny_version: 41 | model = create_tiny_model(input_shape, anchors, num_classes, load_pretrained=pretrained, freeze_body=2, 42 | weights_path=pretrained_weight_tiny) 43 | else: 44 | model = create_model(input_shape, anchors, num_classes, load_pretrained=pretrained, freeze_body=2, 45 | weights_path=pretrained_weight) # make sure you know what you freeze 46 | 47 | logging = TensorBoard(log_dir=log_dir) 48 | checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5', 49 | monitor='val_loss', save_weights_only=True, save_best_only=True, period=3) 50 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1) 51 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1) 52 | 53 | # load cifar dataset 54 | images, labels, img_val, lab_val = read_data_cifar100.data(path, False, True, True) 55 | 56 | num_val = len(img_val) 57 | num_train = len(images) 58 | 59 | # Train with frozen layers first, to get a stable loss. 60 | # Adjust num epochs to your dataset. This step is enough to obtain a not bad model. 61 | if True: 62 | model.compile(optimizer=Adam(lr=1e-3), loss={ 63 | # use custom yolo_loss Lambda layer. 64 | 'yolo_loss': lambda y_true, y_pred: y_pred}, metrics=['accuracy']) 65 | 66 | batch_size = 32 67 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 68 | model.fit_generator(data_generator_wrapper(images, labels, batch_size, input_shape, anchors, num_classes), 69 | steps_per_epoch=max(1, num_train // batch_size), 70 | validation_data=data_generator_wrapper(img_val, lab_val, batch_size, input_shape, anchors, 71 | num_classes), 72 | validation_steps=max(1, num_val // batch_size), 73 | epochs=epoch_first, 74 | initial_epoch=0, 75 | callbacks=[logging, checkpoint]) 76 | model.save_weights(log_dir + 'trained_weights_stage_1.h5') 77 | 78 | # Unfreeze and continue training, to fine-tune. 79 | # Train longer if the result is not good. 
80 | if True: 81 | for i in range(len(model.layers)): 82 | model.layers[i].trainable = True 83 | model.compile(optimizer=Adam(lr=1e-4), 84 | loss={'yolo_loss': lambda y_true, y_pred: y_pred}, metrics=['accuracy']) 85 | # recompile to apply the change 86 | print('Unfreeze all of the layers.') 87 | 88 | batch_size = 32 # note that more GPU memory is required after unfreezing the body 89 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 90 | model.fit_generator(data_generator_wrapper(images, labels, batch_size, input_shape, anchors, num_classes), 91 | steps_per_epoch=max(1, num_train // batch_size), 92 | validation_data=data_generator_wrapper(img_val, lab_val, batch_size, input_shape, anchors, 93 | num_classes), 94 | validation_steps=max(1, num_val // batch_size), 95 | epochs=epoch_second, 96 | initial_epoch=epoch_first, 97 | callbacks=[logging, checkpoint, reduce_lr, early_stopping]) 98 | model.save_weights(log_dir + 'trained_weights_final.h5') 99 | 100 | # Further training if needed. 101 | 102 | 103 | def get_classes(classes_path): 104 | """loads the classes""" 105 | with open(classes_path) as f: 106 | class_names = f.readlines() 107 | class_names = [c.strip() for c in class_names] 108 | return class_names 109 | 110 | 111 | def get_anchors(anchors_path): 112 | """loads the anchors from a file""" 113 | with open(anchors_path) as f: 114 | anchors = f.readline() 115 | anchors = [float(x) for x in anchors.split(',')] 116 | return np.array(anchors).reshape(-1, 2) 117 | 118 | 119 | def create_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2, 120 | weights_path='model_data/yolo_weights.h5'): 121 | """create the training model""" 122 | K.clear_session() # get a new session 123 | image_input = Input(shape=(None, None, 3)) 124 | h, w = input_shape 125 | num_anchors = len(anchors) 126 | 127 | y_true = [ 128 | Input(shape=(h // {0: 32, 1: 16, 2: 8}[l], w // {0: 32, 1: 16, 2: 8}[l], num_anchors // 3, num_classes + 5)) for 129 | l in range(3)] 130 | 131 | model_body = yolo_body(image_input, num_anchors // 3, num_classes) 132 | print('Create YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes)) 133 | 134 | if load_pretrained: 135 | model_body.load_weights(weights_path, by_name=True, skip_mismatch=True) 136 | print('Load weights {}.'.format(weights_path)) 137 | if freeze_body in [1, 2]: 138 | # Freeze darknet53 body or freeze all but 3 output layers. 
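            # freeze_body == 1 keeps the first 185 layers (the Darknet-53
            # backbone) frozen; freeze_body == 2 freezes everything except
            # the three detection heads.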
139 |             num = (185, len(model_body.layers) - 3)[freeze_body - 1]
140 |             for i in range(num):
141 |                 model_body.layers[i].trainable = False
142 |             print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers)))
143 | 
144 |     model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
145 |                         arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})(
146 |         [*model_body.output, *y_true])
147 |     model = Model([model_body.input, *y_true], model_loss)
148 | 
149 |     return model
150 | 
151 | 
152 | def create_tiny_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2,
153 |                       weights_path='model_data/tiny_yolo_weights.h5'):
154 |     """create the training model, for Tiny YOLOv3"""
155 |     K.clear_session()  # get a new session
156 |     image_input = Input(shape=(None, None, 3))
157 |     h, w = input_shape
158 |     num_anchors = len(anchors)
159 | 
160 |     y_true = [Input(shape=(h // {0: 32, 1: 16}[l], w // {0: 32, 1: 16}[l], num_anchors // 2, num_classes + 5)) for l in
161 |               range(2)]
162 | 
163 |     model_body = tiny_yolo_body(image_input, num_anchors // 2, num_classes)
164 |     print('Create Tiny YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes))
165 | 
166 |     if load_pretrained:
167 |         model_body.load_weights(weights_path, by_name=True, skip_mismatch=True)
168 |         print('Load weights {}.'.format(weights_path))
169 |         if freeze_body in [1, 2]:
170 |             # Freeze the darknet body or freeze all but 2 output layers.
171 |             num = (20, len(model_body.layers) - 2)[freeze_body - 1]
172 |             for i in range(num):
173 |                 model_body.layers[i].trainable = False
174 |             print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers)))
175 | 
176 |     model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
177 |                         arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.7})(
178 |         [*model_body.output, *y_true])
179 |     model = Model([model_body.input, *y_true], model_loss)
180 | 
181 |     return model
182 | 
183 | 
184 | def get_data(single_image, single_label, input_shape, proc_img=True):
185 |     """
186 |     pre-processing for real-time data augmentation
187 |     """
188 |     image = Image.fromarray(np.uint8(single_image))
189 |     iw, ih = image.size
190 |     h, w = input_shape
191 |     # the CIFAR dataset is already cropped to the target area, so the box covers the whole image
192 |     box = np.array([[0, 0, 32, 32, int(single_label)]])
193 | 
194 |     # resize image
195 |     scale = min(w / iw, h / ih)
196 |     nw = int(iw * scale)
197 |     nh = int(ih * scale)
198 |     dx = (w - nw) // 2
199 |     dy = (h - nh) // 2
200 |     image_data = 0
201 |     if proc_img:
202 |         image = image.resize((nw, nh), Image.BICUBIC)
203 |         new_image = Image.new('RGB', (w, h), (128, 128, 128))
204 |         new_image.paste(image, (dx, dy))
205 |         image_data = np.array(new_image) / 255.
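    # Worked example for CIFAR input: iw = ih = 32 and w = h = 416, so
    # scale = min(416/32, 416/32) = 13, nw = nh = 416 and dx = dy = 0;
    # the box [0, 0, 32, 32] is then mapped to [0, 0, 416, 416] below.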
206 | 207 | # correct boxes 208 | box_data = np.zeros((1, 5)) 209 | box[:, [0, 2]] = box[:, [0, 2]] * scale + dx 210 | box[:, [1, 3]] = box[:, [1, 3]] * scale + dy 211 | box_data[:len(box)] = box 212 | 213 | return image_data, box_data 214 | 215 | 216 | def data_generator(images, labels, batch_size, input_shape, anchors, num_classes): 217 | """data generator for fit_generator""" 218 | n = len(images) 219 | i = 0 220 | while True: 221 | image_data = [] 222 | box_data = [] 223 | for b in range(batch_size): 224 | image, box = get_data(images[i], labels[i], input_shape, proc_img=True) 225 | image_data.append(image) 226 | box_data.append(box) 227 | # avoid IndexError 228 | i = (i + 1) % n 229 | image_data = np.array(image_data) 230 | box_data = np.array(box_data) 231 | y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes) 232 | yield [image_data, *y_true], np.zeros(batch_size) 233 | 234 | 235 | def data_generator_wrapper(images, labels, batch_size, input_shape, anchors, num_classes): 236 | n = len(images) 237 | if n == 0 or batch_size <= 0: 238 | return None 239 | return data_generator(images, labels, batch_size, input_shape, anchors, num_classes) 240 | 241 | 242 | if __name__ == '__main__': 243 | _main() 244 | -------------------------------------------------------------------------------- /yolo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Class definition of YOLO_v3 style detection model on image and video 4 | """ 5 | 6 | import colorsys 7 | from timeit import default_timer as timer 8 | 9 | import numpy as np 10 | from keras import backend as K 11 | from keras.models import load_model 12 | from keras.layers import Input 13 | from PIL import Image, ImageFont, ImageDraw 14 | 15 | from yolo3.model import yolo_eval, yolo_body, tiny_yolo_body 16 | from yolo3.utils import letterbox_image 17 | from yolo3.utils import delete_repeat_bbox 18 | import sort 19 | import os 20 | from keras.utils import multi_gpu_model 21 | 22 | from tracker_func import sort_image 23 | from sort import KalmanBoxTracker 24 | 25 | from tracker_func import deepsort_image 26 | from deepsort.tracker import Tracker as deepsort_Tracker 27 | from yolo3 import generate_detections as gdet 28 | from deepsort import nn_matching 29 | 30 | 31 | class YOLO(object): 32 | _defaults = { 33 | "model_image_size": (416, 416), 34 | } 35 | 36 | @classmethod 37 | def get_defaults(cls, n): 38 | if n in cls._defaults: 39 | return cls._defaults[n] 40 | else: 41 | return "Unrecognized attribute name '" + n + "'" 42 | 43 | def __init__(self, **kwargs): 44 | self.__dict__.update(self._defaults) # set up default values 45 | self.__dict__.update(kwargs) # and update with user overrides 46 | self.class_names = self._get_class() 47 | self.anchors = self._get_anchors() 48 | self.sess = K.get_session() 49 | self.boxes, self.scores, self.classes = self.generate() 50 | self.frame = 1 51 | self.mot_tracker, self.encoder = self._initialize_tracker() 52 | 53 | def _initialize_tracker(self): 54 | if not self.image: 55 | if self.tracker == 'sort': 56 | tracker = sort.Sort() 57 | return tracker, None 58 | elif self.tracker == 'deepsort': 59 | # initialize deep sort 60 | model_filename = self.deepsort_model 61 | encoder = gdet.create_box_encoder(model_filename, batch_size=1) 62 | metric = nn_matching.NearestNeighborDistanceMetric("cosine", matching_threshold=0.5, budget=None) 63 | tracker = deepsort_Tracker(metric) 64 | return tracker, encoder 65 | else: 66 | raise 
ValueError('The variable \"tracker\" must be \"sort\" or \"deepsort\".') 67 | else: 68 | return None, None 69 | 70 | def _get_class(self): 71 | classes_path = os.path.expanduser(self.classes_path) 72 | with open(classes_path) as f: 73 | class_names = f.readlines() 74 | class_names = [c.strip() for c in class_names] 75 | return class_names 76 | 77 | def _get_anchors(self): 78 | anchors_path = os.path.expanduser(self.anchors_path) 79 | with open(anchors_path) as f: 80 | anchors = f.readline() 81 | anchors = [float(x) for x in anchors.split(',')] 82 | return np.array(anchors).reshape(-1, 2) 83 | 84 | def generate(self): 85 | model_path = os.path.expanduser(self.model_path) 86 | assert model_path.endswith('.h5'), 'Keras model or weights must be a .h5 file.' 87 | 88 | # Load model, or construct model and load weights. 89 | num_anchors = len(self.anchors) 90 | num_classes = len(self.class_names) 91 | is_tiny_version = num_anchors == 6 # default setting 92 | try: 93 | self.yolo_model = load_model(model_path, compile=False) 94 | except: 95 | self.yolo_model = tiny_yolo_body(Input(shape=(None, None, 3)), num_anchors // 2, num_classes) \ 96 | if is_tiny_version else yolo_body(Input(shape=(None, None, 3)), num_anchors // 3, num_classes) 97 | self.yolo_model.load_weights(self.model_path) # make sure model, anchors and classes match 98 | else: 99 | try: 100 | assert self.yolo_model.layers[-1].output_shape[-1] == \ 101 | num_anchors / len(self.yolo_model.output) * (num_classes + 5), \ 102 | 'Mismatch between model and given anchor and class sizes' 103 | except TypeError: 104 | # the number of yolo_model.output(Tensor) may be just one 105 | assert self.yolo_model.layers[-1].output_shape[-1] == \ 106 | num_anchors / 1 * (num_classes + 5), 'Mismatch between model and given anchor and class sizes' 107 | 108 | print('{} model, anchors, and classes loaded.'.format(model_path)) 109 | 110 | # Generate colors for drawing bounding boxes. 111 | hsv_tuples = [(x / len(self.class_names), 1., 1.) 112 | for x in range(len(self.class_names))] 113 | self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples)) 114 | self.colors = list( 115 | map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), 116 | self.colors)) 117 | np.random.seed(10101) # Fixed seed for consistent colors across runs. 118 | np.random.shuffle(self.colors) # Shuffle colors to decorrelate adjacent classes. 119 | np.random.seed(None) # Reset seed to default. 120 | 121 | # Generate output tensor targets for filtered bounding boxes. 
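        # The placeholder carries the original (height, width) of each frame
        # so yolo_eval can undo the letterbox resize; score_threshold and
        # iou_threshold come from the user-supplied self.score and self.iou.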
122 | self.input_image_shape = K.placeholder(shape=(2,)) 123 | if self.gpu_num >= 2: 124 | self.yolo_model = multi_gpu_model(self.yolo_model, gpus=self.gpu_num) 125 | boxes, scores, classes = yolo_eval(self.yolo_model.output, self.anchors, 126 | len(self.class_names), self.input_image_shape, 127 | score_threshold=self.score, iou_threshold=self.iou) 128 | return boxes, scores, classes 129 | 130 | def detect_image(self, frame, fo=None): 131 | image = Image.fromarray(frame) 132 | 133 | start = timer() 134 | 135 | if self.model_image_size != (None, None): 136 | assert self.model_image_size[0] % 32 == 0, 'Multiples of 32 required' 137 | assert self.model_image_size[1] % 32 == 0, 'Multiples of 32 required' 138 | boxed_image = letterbox_image(image, tuple(reversed(self.model_image_size))) 139 | else: 140 | new_image_size = (image.width - (image.width % 32), 141 | image.height - (image.height % 32)) 142 | boxed_image = letterbox_image(image, new_image_size) 143 | image_data = np.array(boxed_image, dtype='float32') 144 | 145 | print(image_data.shape) 146 | image_data /= 255. 147 | image_data = np.expand_dims(image_data, 0) # Add batch dimension. 148 | 149 | out_boxes, out_scores, out_classes = self.sess.run( 150 | [self.boxes, self.scores, self.classes], 151 | feed_dict={ 152 | self.yolo_model.input: image_data, 153 | self.input_image_shape: [image.size[1], image.size[0]], 154 | K.learning_phase(): 0 155 | }) 156 | 157 | # print(type(out_boxes), type(out_scores), type(out_classes)) 158 | # print(out_boxes, out_scores, out_classes) 159 | 160 | # delete repeat bbox 161 | out_boxes, out_scores, out_classes = \ 162 | delete_repeat_bbox(list(out_boxes), list(out_scores), list(out_classes), self.repeat_iou) 163 | 164 | # open or close tracker 165 | if self.mot_tracker is not None: 166 | if self.tracker == 'sort': 167 | out_boxes, out_scores, out_classes, object_id = \ 168 | sort_image(self.mot_tracker, out_boxes, out_scores, out_classes) 169 | elif self.tracker == 'deepsort': 170 | out_boxes, out_scores, out_classes, object_id = \ 171 | deepsort_image(self.mot_tracker, self.encoder, frame, out_boxes, out_scores, out_classes, 172 | nms_max_overlap=1.0) 173 | else: 174 | raise ValueError('The variable \"tracker\" must be \"sort\" or \"deepsort\".') 175 | else: 176 | KalmanBoxTracker.count = 0 177 | object_id = np.concatenate(np.zeros((1, len(out_boxes)))) 178 | 179 | # write to file 180 | if self.write_to_file: 181 | for i in reversed(range(0, len(out_boxes))): 182 | result = [self.frame, object_id[i], out_boxes[i][0], out_boxes[i][1], 183 | abs(out_boxes[i][2] - out_boxes[i][0]), abs(out_boxes[i][3] - out_boxes[i][1]), out_scores[i], 184 | -1, -1, -1] 185 | fo.write(', '.join(map(str, result))) 186 | fo.write('\n') 187 | 188 | print('Found {} boxes for {}'.format(len(out_boxes), 'img')) 189 | 190 | font = ImageFont.truetype(font='font/times.ttf', 191 | size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32')) 192 | thickness = (image.size[0] + image.size[1]) // 300 193 | 194 | for i, c in reversed(list(enumerate(out_classes))): 195 | predicted_class = self.class_names[c] 196 | box = out_boxes[i] 197 | score = out_scores[i] 198 | id = int(object_id[i]) 199 | 200 | # bounding box 201 | top, left, bottom, right = box 202 | top = max(0, np.floor(top + 0.5).astype('int32')) 203 | left = max(0, np.floor(left + 0.5).astype('int32')) 204 | bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32')) 205 | right = min(image.size[0], np.floor(right + 0.5).astype('int32')) 206 | 207 | label = '{} {:.2f} 
id:{}'.format(predicted_class, score, id) 208 | draw = ImageDraw.Draw(image) 209 | label_size = draw.textsize(label, font) 210 | 211 | print(label, (left, top), (right, bottom)) 212 | 213 | if top - label_size[1] >= 0: 214 | text_origin = np.array([left, top - label_size[1]]) 215 | else: 216 | text_origin = np.array([left, top + 1]) 217 | 218 | # My kingdom for a good redistributable image drawing library. 219 | for i in range(thickness): 220 | draw.rectangle( 221 | [left + i, top + i, right - i, bottom - i], 222 | outline=self.colors[c]) 223 | draw.rectangle( 224 | [tuple(text_origin), tuple(text_origin + label_size)], 225 | fill=self.colors[c]) 226 | draw.text(text_origin, label, fill=(0, 0, 0), font=font) 227 | del draw 228 | 229 | end = timer() 230 | print('time:', end - start, 's') 231 | self.frame = self.frame + 1 232 | return image 233 | 234 | def close_session(self): 235 | self.sess.close() 236 | 237 | 238 | def detect_video(yolo, video_path, output_path=""): 239 | import cv2 240 | vid = cv2.VideoCapture(video_path) 241 | if not vid.isOpened(): 242 | raise IOError("Couldn't open webcam or video") 243 | video_FourCC = int(vid.get(cv2.CAP_PROP_FOURCC)) 244 | video_fps = vid.get(cv2.CAP_PROP_FPS) 245 | video_size = (int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)), 246 | int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))) 247 | isOutput = True if output_path != "" else False 248 | if isOutput: 249 | print("!!! TYPE:", type(output_path), type(video_FourCC), type(video_fps), type(video_size)) 250 | out = cv2.VideoWriter(output_path, video_FourCC, video_fps, video_size) 251 | accum_time = 0 252 | curr_fps = 0 253 | fps = "FPS: ??" 254 | prev_time = timer() 255 | 256 | if yolo.write_to_file: 257 | emptyFile = open(yolo.output_path + 'result.dat', 'w') 258 | else: 259 | emptyFile = None 260 | 261 | while True: 262 | return_value, frame = vid.read() 263 | try: 264 | image = Image.fromarray(frame) 265 | except AttributeError: 266 | break 267 | image = yolo.detect_image(frame, emptyFile) 268 | result = np.asarray(image) 269 | curr_time = timer() 270 | exec_time = curr_time - prev_time 271 | prev_time = curr_time 272 | accum_time = accum_time + exec_time 273 | curr_fps = curr_fps + 1 274 | if accum_time > 1: 275 | accum_time = accum_time - 1 276 | fps = "FPS: " + str(curr_fps) 277 | curr_fps = 0 278 | cv2.putText(result, text=fps, org=(3, 15), fontFace=cv2.FONT_HERSHEY_SIMPLEX, 279 | fontScale=0.50, color=(255, 0, 0), thickness=2) 280 | cv2.namedWindow("result", cv2.WINDOW_NORMAL) 281 | cv2.imshow("result", result) 282 | if isOutput: 283 | out.write(result) 284 | if cv2.waitKey(1) & 0xFF == ord('q'): 285 | break 286 | if yolo.write_to_file: 287 | emptyFile.close() 288 | yolo.close_session() 289 | -------------------------------------------------------------------------------- /yolo3/model.py: -------------------------------------------------------------------------------- 1 | """YOLO_v3 Model Defined in Keras.""" 2 | 3 | from functools import wraps 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from keras import backend as K 8 | from keras.layers import Conv2D, Add, ZeroPadding2D, UpSampling2D, Concatenate, MaxPooling2D 9 | from keras.layers.advanced_activations import LeakyReLU 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.models import Model 12 | from keras.regularizers import l2 13 | 14 | from yolo3.utils import compose 15 | 16 | 17 | @wraps(Conv2D) 18 | def DarknetConv2D(*args, **kwargs): 19 | """Wrapper to set Darknet parameters for Convolution2D.""" 20 | 
darknet_conv_kwargs = {'kernel_regularizer': l2(5e-4)}
21 |     darknet_conv_kwargs['padding'] = 'valid' if kwargs.get('strides') == (2, 2) else 'same'
22 |     darknet_conv_kwargs.update(kwargs)
23 |     return Conv2D(*args, **darknet_conv_kwargs)
24 | 
25 | 
26 | def DarknetConv2D_BN_Leaky(*args, **kwargs):
27 |     """Darknet Convolution2D followed by BatchNormalization and LeakyReLU."""
28 |     no_bias_kwargs = {'use_bias': False}
29 |     no_bias_kwargs.update(kwargs)
30 |     return compose(
31 |         DarknetConv2D(*args, **no_bias_kwargs),
32 |         BatchNormalization(),
33 |         LeakyReLU(alpha=0.1))
34 | 
35 | 
36 | def resblock_body(x, num_filters, num_blocks):
37 |     '''A series of resblocks starting with a downsampling Convolution2D'''
38 |     # Darknet uses left and top padding instead of 'same' mode
39 |     x = ZeroPadding2D(((1, 0), (1, 0)))(x)
40 |     x = DarknetConv2D_BN_Leaky(num_filters, (3, 3), strides=(2, 2))(x)
41 |     for i in range(num_blocks):
42 |         y = compose(
43 |             DarknetConv2D_BN_Leaky(num_filters // 2, (1, 1)),
44 |             DarknetConv2D_BN_Leaky(num_filters, (3, 3)))(x)
45 |         x = Add()([x, y])
46 |     return x
47 | 
48 | 
49 | def darknet_body(x):
50 |     '''Darknet body having 52 Convolution2D layers'''
51 |     x = DarknetConv2D_BN_Leaky(32, (3, 3))(x)
52 |     x = resblock_body(x, 64, 1)
53 |     x = resblock_body(x, 128, 2)
54 |     x = resblock_body(x, 256, 8)
55 |     x = resblock_body(x, 512, 8)
56 |     x = resblock_body(x, 1024, 4)
57 |     return x
58 | 
59 | 
60 | def make_last_layers(x, num_filters, out_filters):
61 |     '''6 Conv2D_BN_Leaky layers followed by a Conv2D_linear layer'''
62 |     x = compose(
63 |         DarknetConv2D_BN_Leaky(num_filters, (1, 1)),
64 |         DarknetConv2D_BN_Leaky(num_filters * 2, (3, 3)),
65 |         DarknetConv2D_BN_Leaky(num_filters, (1, 1)),
66 |         DarknetConv2D_BN_Leaky(num_filters * 2, (3, 3)),
67 |         DarknetConv2D_BN_Leaky(num_filters, (1, 1)))(x)
68 |     y = compose(
69 |         DarknetConv2D_BN_Leaky(num_filters * 2, (3, 3)),
70 |         DarknetConv2D(out_filters, (1, 1)))(x)
71 |     return x, y
72 | 
73 | 
74 | def yolo_body(inputs, num_anchors, num_classes):
75 |     """Create YOLO_V3 model CNN body in Keras."""
76 |     darknet = Model(inputs, darknet_body(inputs))
77 |     x, y1 = make_last_layers(darknet.output, 512, num_anchors * (num_classes + 5))
78 | 
79 |     x = compose(
80 |         DarknetConv2D_BN_Leaky(256, (1, 1)),
81 |         UpSampling2D(2))(x)
82 |     x = Concatenate()([x, darknet.layers[152].output])
83 |     x, y2 = make_last_layers(x, 256, num_anchors * (num_classes + 5))
84 | 
85 |     x = compose(
86 |         DarknetConv2D_BN_Leaky(128, (1, 1)),
87 |         UpSampling2D(2))(x)
88 |     x = Concatenate()([x, darknet.layers[92].output])
89 |     x, y3 = make_last_layers(x, 128, num_anchors * (num_classes + 5))
90 | 
91 |     return Model(inputs, [y1, y2, y3])
92 | 
93 | 
94 | def tiny_yolo_body(inputs, num_anchors, num_classes):
95 |     '''Create Tiny YOLO_v3 model CNN body in keras.'''
96 |     x1 = compose(
97 |         DarknetConv2D_BN_Leaky(16, (3, 3)),
98 |         MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'),
99 |         DarknetConv2D_BN_Leaky(32, (3, 3)),
100 |         MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'),
101 |         DarknetConv2D_BN_Leaky(64, (3, 3)),
102 |         MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'),
103 |         DarknetConv2D_BN_Leaky(128, (3, 3)),
104 |         MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'),
105 |         DarknetConv2D_BN_Leaky(256, (3, 3)))(inputs)
106 |     x2 = compose(
107 |         MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'),
108 |         DarknetConv2D_BN_Leaky(512, (3, 3)),
109 |         MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding='same'),
110 |         DarknetConv2D_BN_Leaky(1024, (3, 3)),
111 |         DarknetConv2D_BN_Leaky(256, (1, 1)))(x1)
112 |     y1 = compose(
113 |         DarknetConv2D_BN_Leaky(512, (3, 3)),
114 |         DarknetConv2D(num_anchors * (num_classes + 5), (1, 1)))(x2)
115 | 
116 |     x2 = compose(
117 |         DarknetConv2D_BN_Leaky(128, (1, 1)),
118 |         UpSampling2D(2))(x2)
119 |     y2 = compose(
120 |         Concatenate(),
121 |         DarknetConv2D_BN_Leaky(256, (3, 3)),
122 |         DarknetConv2D(num_anchors * (num_classes + 5), (1, 1)))([x2, x1])
123 | 
124 |     return Model(inputs, [y1, y2])
125 | 
126 | 
127 | def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
128 |     """Convert final layer features to bounding box parameters."""
129 |     num_anchors = len(anchors)
130 |     # Reshape to batch, height, width, num_anchors, box_params.
131 |     anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])
132 | 
133 |     grid_shape = K.shape(feats)[1:3]  # height, width
134 |     grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
135 |                     [1, grid_shape[1], 1, 1])
136 |     grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
137 |                     [grid_shape[0], 1, 1, 1])
138 |     grid = K.concatenate([grid_x, grid_y])
139 |     grid = K.cast(grid, K.dtype(feats))
140 | 
141 |     feats = K.reshape(
142 |         feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])
143 | 
144 |     # Adjust predictions to each spatial grid point and anchor size.
145 |     box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(grid_shape[::-1], K.dtype(feats))
146 |     box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats))
147 |     box_confidence = K.sigmoid(feats[..., 4:5])
148 |     box_class_probs = K.sigmoid(feats[..., 5:])
149 | 
150 |     if calc_loss:
151 |         return grid, feats, box_xy, box_wh
152 |     return box_xy, box_wh, box_confidence, box_class_probs
153 | 
154 | 
155 | def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape):
156 |     '''Get corrected boxes'''
157 |     box_yx = box_xy[..., ::-1]
158 |     box_hw = box_wh[..., ::-1]
159 |     input_shape = K.cast(input_shape, K.dtype(box_yx))
160 |     image_shape = K.cast(image_shape, K.dtype(box_yx))
161 |     new_shape = K.round(image_shape * K.min(input_shape / image_shape))
162 |     offset = (input_shape - new_shape) / 2. / input_shape
163 |     scale = input_shape / new_shape
164 |     box_yx = (box_yx - offset) * scale
165 |     box_hw *= scale
166 | 
167 |     box_mins = box_yx - (box_hw / 2.)
168 |     box_maxes = box_yx + (box_hw / 2.)
169 |     boxes = K.concatenate([
170 |         box_mins[..., 0:1],  # y_min
171 |         box_mins[..., 1:2],  # x_min
172 |         box_maxes[..., 0:1],  # y_max
173 |         box_maxes[..., 1:2]  # x_max
174 |     ])
175 | 
176 |     # Scale boxes back to original image shape.
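    # boxes holds (y_min, x_min, y_max, x_max) normalized by the original
    # image, so multiplying by the concatenated (h, w, h, w) yields pixels.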
177 | boxes *= K.concatenate([image_shape, image_shape]) 178 | return boxes 179 | 180 | 181 | def yolo_boxes_and_scores(feats, anchors, num_classes, input_shape, image_shape): 182 | '''Process Conv layer output''' 183 | box_xy, box_wh, box_confidence, box_class_probs = yolo_head(feats, 184 | anchors, num_classes, input_shape) 185 | boxes = yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape) 186 | boxes = K.reshape(boxes, [-1, 4]) 187 | box_scores = box_confidence * box_class_probs 188 | box_scores = K.reshape(box_scores, [-1, num_classes]) 189 | return boxes, box_scores 190 | 191 | 192 | def yolo_eval(yolo_outputs, 193 | anchors, 194 | num_classes, 195 | image_shape, 196 | max_boxes=20, 197 | score_threshold=.6, 198 | iou_threshold=.5): 199 | """Evaluate YOLO model on given input and return filtered boxes.""" 200 | num_layers = len(yolo_outputs) 201 | anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]] # default setting 202 | input_shape = K.shape(yolo_outputs[0])[1:3] * 32 203 | boxes = [] 204 | box_scores = [] 205 | for l in range(num_layers): 206 | _boxes, _box_scores = yolo_boxes_and_scores(yolo_outputs[l], 207 | anchors[anchor_mask[l]], num_classes, input_shape, image_shape) 208 | boxes.append(_boxes) 209 | box_scores.append(_box_scores) 210 | boxes = K.concatenate(boxes, axis=0) 211 | box_scores = K.concatenate(box_scores, axis=0) 212 | 213 | mask = box_scores >= score_threshold 214 | max_boxes_tensor = K.constant(max_boxes, dtype='int32') 215 | boxes_ = [] 216 | scores_ = [] 217 | classes_ = [] 218 | for c in range(num_classes): 219 | # TODO: use keras backend instead of tf. 220 | class_boxes = tf.boolean_mask(boxes, mask[:, c]) 221 | class_box_scores = tf.boolean_mask(box_scores[:, c], mask[:, c]) 222 | nms_index = tf.image.non_max_suppression( 223 | class_boxes, class_box_scores, max_boxes_tensor, iou_threshold=iou_threshold) 224 | class_boxes = K.gather(class_boxes, nms_index) 225 | class_box_scores = K.gather(class_box_scores, nms_index) 226 | classes = K.ones_like(class_box_scores, 'int32') * c 227 | boxes_.append(class_boxes) 228 | scores_.append(class_box_scores) 229 | classes_.append(classes) 230 | boxes_ = K.concatenate(boxes_, axis=0) 231 | scores_ = K.concatenate(scores_, axis=0) 232 | classes_ = K.concatenate(classes_, axis=0) 233 | 234 | return boxes_, scores_, classes_ 235 | 236 | 237 | def preprocess_true_boxes(true_boxes, input_shape, anchors, num_classes): 238 | '''Preprocess true boxes to training input format 239 | 240 | Parameters 241 | ---------- 242 | true_boxes: array, shape=(m, T, 5) 243 | Absolute x_min, y_min, x_max, y_max, class_id relative to input_shape. 
244 |     input_shape: array-like, hw, multiples of 32
245 |     anchors: array, shape=(N, 2), wh
246 |     num_classes: integer
247 | 
248 |     Returns
249 |     -------
250 |     y_true: list of array, shape like yolo_outputs, xywh are relative values
251 | 
252 |     '''
253 |     assert (true_boxes[..., 4] < num_classes).all(), 'class id must be less than num_classes'
254 |     num_layers = len(anchors) // 3  # default setting
255 |     anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]
256 | 
257 |     true_boxes = np.array(true_boxes, dtype='float32')
258 |     input_shape = np.array(input_shape, dtype='int32')
259 |     boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2
260 |     boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]
261 |     true_boxes[..., 0:2] = boxes_xy / input_shape[::-1]
262 |     true_boxes[..., 2:4] = boxes_wh / input_shape[::-1]
263 | 
264 |     m = true_boxes.shape[0]
265 |     grid_shapes = [input_shape // {0: 32, 1: 16, 2: 8}[l] for l in range(num_layers)]
266 |     y_true = [np.zeros((m, grid_shapes[l][0], grid_shapes[l][1], len(anchor_mask[l]), 5 + num_classes),
267 |                        dtype='float32') for l in range(num_layers)]
268 | 
269 |     # Expand dim to apply broadcasting.
270 |     anchors = np.expand_dims(anchors, 0)
271 |     anchor_maxes = anchors / 2.
272 |     anchor_mins = -anchor_maxes
273 |     valid_mask = boxes_wh[..., 0] > 0
274 | 
275 |     for b in range(m):
276 |         # Discard zero rows.
277 |         wh = boxes_wh[b, valid_mask[b]]
278 |         if len(wh) == 0: continue
279 |         # Expand dim to apply broadcasting.
280 |         wh = np.expand_dims(wh, -2)
281 |         box_maxes = wh / 2.
282 |         box_mins = -box_maxes
283 | 
284 |         intersect_mins = np.maximum(box_mins, anchor_mins)
285 |         intersect_maxes = np.minimum(box_maxes, anchor_maxes)
286 |         intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
287 |         intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
288 |         box_area = wh[..., 0] * wh[..., 1]
289 |         anchor_area = anchors[..., 0] * anchors[..., 1]
290 |         iou = intersect_area / (box_area + anchor_area - intersect_area)
291 | 
292 |         # Find best anchor for each true box
293 |         best_anchor = np.argmax(iou, axis=-1)
294 | 
295 |         for t, n in enumerate(best_anchor):
296 |             for l in range(num_layers):
297 |                 if n in anchor_mask[l]:
298 |                     i = np.floor(true_boxes[b, t, 0] * grid_shapes[l][1]).astype('int32')
299 |                     j = np.floor(true_boxes[b, t, 1] * grid_shapes[l][0]).astype('int32')
300 |                     k = anchor_mask[l].index(n)
301 |                     c = true_boxes[b, t, 4].astype('int32')
302 |                     y_true[l][b, j, i, k, 0:4] = true_boxes[b, t, 0:4]
303 |                     y_true[l][b, j, i, k, 4] = 1
304 |                     y_true[l][b, j, i, k, 5 + c] = 1
305 | 
306 |     return y_true
307 | 
308 | 
309 | def box_iou(b1, b2):
310 |     '''Return iou tensor
311 | 
312 |     Parameters
313 |     ----------
314 |     b1: tensor, shape=(i1,...,iN, 4), xywh
315 |     b2: tensor, shape=(j, 4), xywh
316 | 
317 |     Returns
318 |     -------
319 |     iou: tensor, shape=(i1,...,iN, j)
320 | 
321 |     '''
322 | 
323 |     # Expand dim to apply broadcasting.
324 |     b1 = K.expand_dims(b1, -2)
325 |     b1_xy = b1[..., :2]
326 |     b1_wh = b1[..., 2:4]
327 |     b1_wh_half = b1_wh / 2.
328 |     b1_mins = b1_xy - b1_wh_half
329 |     b1_maxes = b1_xy + b1_wh_half
330 | 
331 |     # Expand dim to apply broadcasting.
332 |     b2 = K.expand_dims(b2, 0)
333 |     b2_xy = b2[..., :2]
334 |     b2_wh = b2[..., 2:4]
335 |     b2_wh_half = b2_wh / 2.
336 |     b2_mins = b2_xy - b2_wh_half
337 |     b2_maxes = b2_xy + b2_wh_half
338 | 
339 |     intersect_mins = K.maximum(b1_mins, b2_mins)
340 |     intersect_maxes = K.minimum(b1_maxes, b2_maxes)
341 |     intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
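    # b1 now has shape (i1,...,iN, 1, 4) and b2 has shape (1, j, 4), so the
    # intersection terms computed here broadcast to a pairwise (i1,...,iN, j)
    # grid of IOU values.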
309 | def box_iou(b1, b2):
310 |     '''Return a pairwise IoU tensor.
311 | 
312 |     Parameters
313 |     ----------
314 |     b1: tensor, shape=(i1,...,iN, 4), xywh
315 |     b2: tensor, shape=(j, 4), xywh
316 | 
317 |     Returns
318 |     -------
319 |     iou: tensor, shape=(i1,...,iN, j)
320 | 
321 |     '''
322 | 
323 |     # Expand dim to apply broadcasting.
324 |     b1 = K.expand_dims(b1, -2)
325 |     b1_xy = b1[..., :2]
326 |     b1_wh = b1[..., 2:4]
327 |     b1_wh_half = b1_wh / 2.
328 |     b1_mins = b1_xy - b1_wh_half
329 |     b1_maxes = b1_xy + b1_wh_half
330 | 
331 |     # Expand dim to apply broadcasting.
332 |     b2 = K.expand_dims(b2, 0)
333 |     b2_xy = b2[..., :2]
334 |     b2_wh = b2[..., 2:4]
335 |     b2_wh_half = b2_wh / 2.
336 |     b2_mins = b2_xy - b2_wh_half
337 |     b2_maxes = b2_xy + b2_wh_half
338 | 
339 |     intersect_mins = K.maximum(b1_mins, b2_mins)
340 |     intersect_maxes = K.minimum(b1_maxes, b2_maxes)
341 |     intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
342 |     intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
343 |     b1_area = b1_wh[..., 0] * b1_wh[..., 1]
344 |     b2_area = b2_wh[..., 0] * b2_wh[..., 1]
345 |     iou = intersect_area / (b1_area + b2_area - intersect_area)
346 | 
347 |     return iou
348 | 
349 | 
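box_iou gets its pairwise (i1,...,iN, j) output purely from broadcasting: b1 gains a trailing axis, b2 a leading one, so every b1 box is compared against every b2 box in one shot. The same trick in NumPy, as a minimal sketch with two made-up center-format (x, y, w, h) boxes per side:

import numpy as np

def box_iou_np(b1, b2):
    # b1: (..., 4), b2: (j, 4), both xywh; returns (..., j) pairwise IoU.
    b1 = np.expand_dims(b1, -2)                   # (..., 1, 4)
    b1_mins = b1[..., :2] - b1[..., 2:4] / 2.
    b1_maxes = b1[..., :2] + b1[..., 2:4] / 2.
    b2 = np.expand_dims(b2, 0)                    # (1, j, 4)
    b2_mins = b2[..., :2] - b2[..., 2:4] / 2.
    b2_maxes = b2[..., :2] + b2[..., 2:4] / 2.

    intersect_mins = np.maximum(b1_mins, b2_mins)
    intersect_maxes = np.minimum(b1_maxes, b2_maxes)
    intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
    b1_area = b1[..., 2] * b1[..., 3]
    b2_area = b2[..., 2] * b2[..., 3]
    return intersect_area / (b1_area + b2_area - intersect_area)

b1 = np.array([[5., 5., 10., 10.], [50., 50., 10., 10.]])
b2 = np.array([[5., 5., 10., 10.], [8., 5., 10., 10.]])
print(box_iou_np(b1, b2).round(3))  # (2, 2): row i is b1[i] against every b2 box

The first row comes out [1.0, 0.538] (identical box, then a partial overlap); the second row is all zeros since that box is disjoint from both.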
350 | def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
351 |     '''Return the yolo_loss tensor.
352 | 
353 |     Parameters
354 |     ----------
355 |     yolo_outputs: list of tensors, the output of yolo_body or tiny_yolo_body
356 |     y_true: list of arrays, the output of preprocess_true_boxes
357 |     anchors: array, shape=(N, 2), wh
358 |     num_classes: integer
359 |     ignore_thresh: float, IoU threshold below which an unmatched prediction is ignored in the confidence loss
360 | 
361 |     Returns
362 |     -------
363 |     loss: tensor, shape=(1,)
364 | 
365 |     '''
366 |     num_layers = len(anchors) // 3  # default setting
367 |     yolo_outputs = args[:num_layers]
368 |     y_true = args[num_layers:]
369 |     anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]
370 |     input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
371 |     grid_shapes = [K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0])) for l in range(num_layers)]
372 |     loss = 0
373 |     m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
374 |     mf = K.cast(m, K.dtype(yolo_outputs[0]))
375 | 
376 |     for l in range(num_layers):
377 |         object_mask = y_true[l][..., 4:5]
378 |         true_class_probs = y_true[l][..., 5:]
379 | 
380 |         grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
381 |                                                      anchors[anchor_mask[l]], num_classes, input_shape, calc_loss=True)
382 |         pred_box = K.concatenate([pred_xy, pred_wh])
383 | 
384 |         # Darknet raw box to calculate loss.
385 |         raw_true_xy = y_true[l][..., :2] * grid_shapes[l][::-1] - grid
386 |         raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchor_mask[l]] * input_shape[::-1])
387 |         raw_true_wh = K.switch(object_mask, raw_true_wh, K.zeros_like(raw_true_wh))  # avoid log(0)=-inf
388 |         box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]
389 | 
390 |         # Find the ignore mask, iterating over each batch item.
391 |         ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
392 |         object_mask_bool = K.cast(object_mask, 'bool')
393 | 
394 |         def loop_body(b, ignore_mask):
395 |             true_box = tf.boolean_mask(y_true[l][b, ..., 0:4], object_mask_bool[b, ..., 0])
396 |             iou = box_iou(pred_box[b], true_box)
397 |             best_iou = K.max(iou, axis=-1)
398 |             ignore_mask = ignore_mask.write(b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
399 |             return b + 1, ignore_mask
400 | 
401 |         # tf.while_loop replaces K.control_flow_ops.while_loop, a private alias newer Keras versions no longer expose.
402 |         _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body, [0, ignore_mask])
403 |         ignore_mask = ignore_mask.stack()
404 |         ignore_mask = K.expand_dims(ignore_mask, -1)
405 | 
406 |         # K.binary_crossentropy with from_logits=True avoids exp overflow.
407 |         xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(raw_true_xy, raw_pred[..., 0:2],
408 |                                                                        from_logits=True)
409 |         wh_loss = object_mask * box_loss_scale * 0.5 * K.square(raw_true_wh - raw_pred[..., 2:4])
410 |         confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
411 |                           (1 - object_mask) * K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
412 |                                                                     from_logits=True) * ignore_mask
413 |         class_loss = object_mask * K.binary_crossentropy(true_class_probs, raw_pred[..., 5:], from_logits=True)
414 | 
415 |         xy_loss = K.sum(xy_loss) / mf
416 |         wh_loss = K.sum(wh_loss) / mf
417 |         confidence_loss = K.sum(confidence_loss) / mf
418 |         class_loss = K.sum(class_loss) / mf
419 |         loss += xy_loss + wh_loss + confidence_loss + class_loss
420 |         if print_loss:
421 |             loss = tf.Print(loss, [loss, xy_loss, wh_loss, confidence_loss, class_loss, K.sum(ignore_mask)],
422 |                             message='loss: ')
423 |     return loss
424 | 
--------------------------------------------------------------------------------
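Two details of yolo_loss are worth seeing in isolation: the confidence/class terms rely on from_logits=True cross-entropy being computed in a numerically stable form, and box_loss_scale = 2 - w*h up-weights small boxes so their localization errors are not drowned out by large ones. A standalone NumPy sketch of both, assuming made-up values for the box size and the target/logit pair; the stable formula is the one TensorFlow documents for sigmoid cross-entropy:

import numpy as np

def bce_with_logits(target, logit):
    # Numerically stable sigmoid cross-entropy on raw logits:
    # max(x, 0) - x*z + log(1 + exp(-|x|)), which never exponentiates
    # a large positive number.
    return np.maximum(logit, 0) - logit * target + np.log1p(np.exp(-np.abs(logit)))

# A made-up 0.2 x 0.3 box (w, h relative to the input image): the smaller
# the box, the closer its weight gets to the maximum of 2.
w, h = 0.2, 0.3
box_loss_scale = 2 - w * h           # -> 1.94

target, logit = 1.0, 2.0             # objectness target and raw prediction
print(box_loss_scale, bce_with_logits(target, logit))  # ~1.94, ~0.127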