├── .gitignore ├── Collect_training_data.py ├── IMAGES ├── city.jpg ├── city_pred.jpg ├── kite.jpg ├── kite_pred.jpg ├── mnist_test.jpg ├── street.jpg ├── street_pred.jpg ├── tensorboard.png ├── test.mp4 └── tracking_results.gif ├── LICENSE ├── README.md ├── YOLOv3_colab_training.ipynb ├── checkpoints └── checkpoint ├── deep_sort ├── detection.py ├── generate_detections.py ├── iou_matching.py ├── kalman_filter.py ├── linear_assignment.py ├── nn_matching.py ├── preprocessing.py ├── test_tracking.gif ├── track.py └── tracker.py ├── detect_mnist.py ├── detection_custom.py ├── detection_demo.py ├── evaluate_mAP.py ├── mnist ├── make_data.py ├── mnist.names ├── mnist │ ├── test.zip │ └── train.zip └── show_image.py ├── model_data ├── coco │ ├── coco.names │ ├── train2017.txt │ └── val2017.txt └── mars-small128.pb ├── object_tracker.py ├── requirements.txt ├── tools ├── Convert_to_TRT.py ├── Convert_to_pb.py ├── Detection_to_XML.py ├── XML_to_YOLOv3.py └── oid_to_pascal_voc_xml.py ├── train.py └── yolov3 ├── __ init __.py ├── __pycache__ ├── configs.cpython-36.pyc ├── dataset.cpython-36.pyc ├── utils.cpython-36.pyc └── yolov3.cpython-36.pyc ├── configs.py ├── dataset.py ├── utils.py ├── yolov3.py └── yolov4.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | model_data 3 | configs.py -------------------------------------------------------------------------------- /Collect_training_data.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : Collect_training_data.py 4 | # Author : PyLessons 5 | # Created date: 2020-09-27 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : YOLO detection to XML example script 9 | # 10 | #================================================================ 11 | import os 12 | import subprocess 13 | import time 14 | from datetime import datetime 15 | import cv2 16 | import mss 17 | import numpy as np 18 | import tensorflow as tf 19 | from yolov3.utils import * 20 | from yolov3.configs import * 21 | from yolov3.yolov4 import read_class_names 22 | from tools.Detection_to_XML import CreateXMLfile 23 | import random 24 | 25 | def draw_enemy(image, bboxes, CLASSES=YOLO_COCO_CLASSES, show_label=True, show_confidence = True, Text_colors=(255,255,0), rectangle_colors='', tracking=False): 26 | NUM_CLASS = read_class_names(CLASSES) 27 | num_classes = len(NUM_CLASS) 28 | image_h, image_w, _ = image.shape 29 | hsv_tuples = [(1.0 * x / num_classes, 1., 1.) 
for x in range(num_classes)] 30 | colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples)) 31 | colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), colors)) 32 | 33 | random.seed(0) 34 | random.shuffle(colors) 35 | random.seed(None) 36 | 37 | detection_list = [] 38 | 39 | for i, bbox in enumerate(bboxes): 40 | coor = np.array(bbox[:4], dtype=np.int32) 41 | score = bbox[4] 42 | class_ind = int(bbox[5]) 43 | bbox_color = rectangle_colors if rectangle_colors != '' else colors[class_ind] 44 | bbox_thick = int(0.6 * (image_h + image_w) / 1000) 45 | if bbox_thick < 1: bbox_thick = 1 46 | fontScale = 0.75 * bbox_thick 47 | (x1, y1), (x2, y2) = (coor[0], coor[1]), (coor[2], coor[3]) 48 | 49 | # put object rectangle 50 | cv2.rectangle(image, (x1, y1), (x2, y2), bbox_color, bbox_thick*2) 51 | 52 | x, y = int(x1+(x2-x1)/2), int(y1+(y2-y1)/2) 53 | 54 | if show_label: 55 | # get text label 56 | score_str = " {:.2f}".format(score) if show_confidence else "" 57 | 58 | if tracking: score_str = " "+str(score) 59 | 60 | label = "{}".format(NUM_CLASS[class_ind]) + score_str 61 | 62 | # get text size 63 | (text_width, text_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_COMPLEX_SMALL, 64 | fontScale, thickness=bbox_thick) 65 | # put filled text rectangle 66 | cv2.rectangle(image, (x1, y1), (x1 + text_width, y1 - text_height - baseline), bbox_color, thickness=cv2.FILLED) 67 | 68 | # put text above rectangle 69 | cv2.putText(image, label, (x1, y1-4), cv2.FONT_HERSHEY_COMPLEX_SMALL, fontScale, Text_colors, bbox_thick, lineType=cv2.LINE_AA) 70 | 71 | return image 72 | 73 | def detect_enemy(Yolo, original_image, input_size=416, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors=''): 74 | image_data = image_preprocess(original_image, [input_size, input_size]) 75 | image_data = image_data[np.newaxis, ...].astype(np.float32) 76 | 77 | if YOLO_FRAMEWORK == "tf": 78 | pred_bbox = Yolo.predict(image_data) 79 | 80 | elif YOLO_FRAMEWORK == "trt": 81 | batched_input = tf.constant(image_data) 82 | result = Yolo(batched_input) 83 | pred_bbox = [] 84 | for key, value in result.items(): 85 | value = value.numpy() 86 | pred_bbox.append(value) 87 | 88 | pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox] 89 | pred_bbox = tf.concat(pred_bbox, axis=0) 90 | 91 | bboxes = postprocess_boxes(pred_bbox, original_image, input_size, score_threshold) 92 | bboxes = nms(bboxes, iou_threshold, method='nms') 93 | 94 | image = draw_enemy(original_image, bboxes, CLASSES=CLASSES, rectangle_colors=rectangle_colors) 95 | 96 | return image, bboxes 97 | 98 | offset = 30 99 | times = [] 100 | sct = mss.mss() 101 | yolo = Load_Yolo_model() 102 | while True: 103 | t1 = time.time() 104 | img = np.array(sct.grab({"top": 87-offset, "left": 1920, "width": 1280, "height": 720, "mon": -1})) 105 | img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB) 106 | image, bboxes = detect_enemy(yolo, np.copy(img), input_size=YOLO_INPUT_SIZE, CLASSES=TRAIN_CLASSES, rectangle_colors=(255,0,0)) 107 | if len(bboxes) > 0: 108 | CreateXMLfile("XML_Detections", str(int(time.time())), img, bboxes, read_class_names(TRAIN_CLASSES)) 109 | print("got it") 110 | time.sleep(2) 111 | 112 | t2 = time.time() 113 | times.append(t2-t1) 114 | times = times[-20:] 115 | ms = sum(times)/len(times)*1000 116 | fps = 1000 / ms 117 | print("FPS", fps) 118 | 119 | #cv2.imshow("Detection image", img) 120 | #if cv2.waitKey(25) & 0xFF == ord("q"): 121 | #cv2.destroyAllWindows() 122 | #break 123 | 
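# To sanity-check what this script saves, here is a minimal sketch of reading one
# generated annotation back. It assumes CreateXMLfile (defined in
# tools/Detection_to_XML.py, whose source is not shown here) writes Pascal-VOC-style
# XML, as the companion tools/XML_to_YOLOv3.py and oid_to_pascal_voc_xml.py suggest;
# the tag names and the file name below are assumptions, not verified against that tool.
import xml.etree.ElementTree as ET

root = ET.parse("XML_Detections/1601222222.xml").getroot()  # placeholder file name
for obj in root.iter("object"):
    box = obj.find("bndbox")
    print(obj.find("name").text,
          int(box.find("xmin").text), int(box.find("ymin").text),
          int(box.find("xmax").text), int(box.find("ymax").text))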
-------------------------------------------------------------------------------- /IMAGES/city.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/city.jpg -------------------------------------------------------------------------------- /IMAGES/city_pred.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/city_pred.jpg -------------------------------------------------------------------------------- /IMAGES/kite.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/kite.jpg -------------------------------------------------------------------------------- /IMAGES/kite_pred.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/kite_pred.jpg -------------------------------------------------------------------------------- /IMAGES/mnist_test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/mnist_test.jpg -------------------------------------------------------------------------------- /IMAGES/street.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/street.jpg -------------------------------------------------------------------------------- /IMAGES/street_pred.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/street_pred.jpg -------------------------------------------------------------------------------- /IMAGES/tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/tensorboard.png -------------------------------------------------------------------------------- /IMAGES/test.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/test.mp4 -------------------------------------------------------------------------------- /IMAGES/tracking_results.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/IMAGES/tracking_results.gif -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 pythonlessons 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 
7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow-2.x-YOLOv3 and YOLOv4 tutorials 2 | 3 | YOLOv3 and YOLOv4 implementation in TensorFlow 2.x, with support for training, transfer training, object tracking mAP and so on... 4 | Code was tested with following specs: 5 | - i7-7700k CPU and Nvidia 1080TI GPU 6 | - OS Ubuntu 18.04 7 | - CUDA 10.1 8 | - cuDNN v7.6.5 9 | - TensorRT-6.0.1.5 10 | - Tensorflow-GPU 2.3.1 11 | - Code was tested on Ubuntu and Windows 10 (TensorRT not supported officially) 12 | 13 | ## Installation 14 | First, clone or download this GitHub repository. 15 | Install requirements and download pretrained weights: 16 | ``` 17 | pip install -r ./requirements.txt 18 | 19 | # yolov3 20 | wget -P model_data https://pjreddie.com/media/files/yolov3.weights 21 | 22 | # yolov3-tiny 23 | wget -P model_data https://pjreddie.com/media/files/yolov3-tiny.weights 24 | 25 | # yolov4 26 | wget -P model_data https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights 27 | 28 | # yolov4-tiny 29 | wget -P model_data https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights 30 | ``` 31 | 32 | ## Quick start 33 | Start with using pretrained weights to test predictions on both image and video: 34 | ``` 35 | python detection_demo.py 36 | ``` 37 | 38 |

39 | 40 |
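To script a single image prediction instead of the bundled demo, here is a minimal sketch, assuming the `detect_image` helper in `yolov3/utils.py` keeps the signature that `detection_demo.py` relies on (check your copy if it differs):
```
from yolov3.utils import Load_Yolo_model, detect_image
from yolov3.configs import *

yolo = Load_Yolo_model()  # picks weights and framework from yolov3/configs.py
# writes the annotated image to the output path and also returns it
detect_image(yolo, "IMAGES/kite.jpg", "IMAGES/kite_pred.jpg",
             input_size=YOLO_INPUT_SIZE, show=False, rectangle_colors=(255, 0, 0))
```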

41 | 42 | ## Quick training for the custom mnist dataset 43 | The mnist folder contains mnist images; create the training data with: 44 | ``` 45 | python mnist/make_data.py 46 | ``` 47 | The `./yolov3/configs.py` file is already configured for mnist training. 48 | 49 | Now you can train the model and then evaluate it: 50 | ``` 51 | python train.py 52 | tensorboard --logdir=log 53 | ``` 54 | Track training progress in Tensorboard at http://localhost:6006/: 55 |

56 | 57 |

58 | 59 | Test detection with the `detect_mnist.py` script: 60 | ``` 61 | python detect_mnist.py 62 | ``` 63 | Results: 64 |

65 | 66 |
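Before moving on to fully custom data, it helps to know what the annotation files look like. Judging by how boxes are handled elsewhere in this implementation, each annotation line should hold an image path followed by one `x_min,y_min,x_max,y_max,class_id` group per object; the exact file name and numbers below are illustrative, not taken from `make_data.py`:
```
# one image per line: <image_path> <x_min>,<y_min>,<x_max>,<y_max>,<class_id> ...
mnist/mnist_train/000001.jpg 112,5,162,55,3 339,368,389,418,7
```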

67 | 68 | ## Custom YOLOv3 & YOLOv4 object detection training 69 | Custom training requires preparing a dataset first; how to prepare a dataset and train a custom model is covered in the following link:
70 | https://pylessons.com/YOLOv3-TF2-custrom-train/
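In practice, pointing the trainer at your own dataset comes down to a few values in `yolov3/configs.py`. `TRAIN_CLASSES` and the `f"{YOLO_TYPE}_custom"` model name appear later in this README; the annotation-path option names are an assumption about this implementation's config, and all paths are placeholders:
```
# yolov3/configs.py - typical custom-training edits
TRAIN_CLASSES    = "model_data/custom.names"      # one class name per line
TRAIN_ANNOT_PATH = "model_data/custom_train.txt"  # lines in the format shown above
TEST_ANNOT_PATH  = "model_data/custom_test.txt"
```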
71 | You can read more about YOLOv4 training [on this link](https://pylessons.com/YOLOv4-TF2-training/). I didn't have time to implement all of the YOLOv4 Bag-of-Freebies to improve the training process… Maybe later I'll find time to do that, but for now I leave it as it is. I recommend using [Alex's Darknet](https://github.com/AlexeyAB/darknet) to train your custom model if you need maximum performance; otherwise, you can use my implementation. 72 | 73 | ## Google Colab Custom Yolo v3 training 74 | To learn more about Google Colab free GPU training, visit my [text version tutorial](https://pylessons.com/YOLOv3-TF2-GoogleColab/). 75 | 76 | ## Yolo v3 Tiny train and detection 77 | For detailed instructions on how to use YOLOv3-Tiny, follow my text version tutorial [YOLOv3-Tiny support](https://pylessons.com/YOLOv3-TF2-Tiny/). Short instructions: 78 | - Get YOLOv3-Tiny weights: ```wget -P model_data https://pjreddie.com/media/files/yolov3-tiny.weights``` 79 | - In `yolov3/configs.py`, change `TRAIN_YOLO_TINY` from `False` to `True` 80 | - Run the `detection_demo.py` script. 81 | 82 | ## Yolo v3 Object tracking 83 | To learn more about object tracking with Deep SORT, visit [the following link](https://pylessons.com/YOLOv3-TF2-DeepSort/). 84 | Quick test: 85 | - Clone this repository; 86 | - Make sure object detection works for you; 87 | - Run the `object_tracker.py` script (a stripped-down sketch of what it wires together follows the tracking results below) 88 |

89 | 90 |
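The tracking script is mostly glue around the modules under `deep_sort/` (included in full further down this file). Here is a stripped-down sketch of that wiring, with the YOLO side omitted; `frame` is a BGR image and `boxes`/`scores`/`names` are its post-NMS detections:
```
import numpy as np
from deep_sort import nn_matching, generate_detections as gdet
from deep_sort.detection import Detection
from deep_sort.tracker import Tracker

encoder = gdet.create_box_encoder("model_data/mars-small128.pb", batch_size=1)
metric = nn_matching.NearestNeighborDistanceMetric("cosine", matching_threshold=0.7, budget=100)
tracker = Tracker(metric)

def track_frame(frame, boxes, scores, names):
    # Detection expects boxes as (top-left x, top-left y, width, height)
    features = encoder(frame, np.asarray(boxes))
    detections = [Detection(b, s, n, f) for b, s, n, f in zip(boxes, scores, names, features)]
    tracker.predict()               # Kalman prediction step for every track
    tracker.update(detections)      # matching cascade + track management
    return [(t.track_id, t.get_class(), t.to_tlbr())
            for t in tracker.tracks if t.is_confirmed() and t.time_since_update <= 1]
```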

91 | 92 | ## YOLOv3 vs YOLOv4 comparison on 1080TI: 93 | 94 | YOLO FPS on COCO 2017 Dataset: 95 | | Detection | 320x320 | 416x416 | 512x512 | 96 | |--------------|---------|---------|---------| 97 | | YoloV3 FPS | 24.38 | 20.94 | 18.57 | 98 | | YoloV4 FPS | 22.15 | 18.69 | 16.50 | 99 | 100 | TensorRT FPS on COCO 2017 Dataset: 101 | | Detection | 320x320 | 416x416 | 512x512 | 608x608 | 102 | |-----------------|---------|---------|---------|---------| 103 | | YoloV4 FP32 FPS | 31.23 | 27.30 | 22.63 | 18.17 | 104 | | YoloV4 FP16 FPS | 30.33 | 25.44 | 21.94 | 17.99 | 105 | | YoloV4 INT8 FPS | 85.18 | 62.02 | 47.50 | 37.32 | 106 | | YoloV3 INT8 FPS | 84.65 | 52.72 | 38.22 | 28.75 | 107 | 108 | mAP on COCO 2017 Dataset: 109 | | Detection | 320x320 | 416x416 | 512x512 | 110 | |------------------|---------|---------|---------| 111 | | YoloV3 mAP50 | 49.85 | 55.31 | 57.48 | 112 | | YoloV4 mAP50 | 48.58 | 56.92 | 61.71 | 113 | 114 | TensorRT mAP on COCO 2017 Dataset: 115 | | Detection | 320x320 | 416x416 | 512x512 | 608x608 | 116 | |-------------------|---------|---------|---------|---------| 117 | | YoloV4 FP32 mAP50 | 48.58 | 56.92 | 61.71 | 63.92 | 118 | | YoloV4 FP16 mAP50 | 48.57 | 56.92 | 61.69 | 63.92 | 119 | | YoloV4 INT8 mAP50 | 40.61 | 48.36 | 52.84 | 54.53 | 120 | | YoloV3 INT8 mAP50 | 44.19 | 48.64 | 50.10 | 50.69 | 121 | 122 | ## Converting YOLO to TensorRT 123 | I will give two examples, both for the YOLOv4 model with quantize_mode=INT8 and a model input size of 608. A detailed tutorial is on this [link](https://pylessons.com/YOLOv4-TF2-TensorRT/); a sketch of the underlying TF-TRT conversion call follows the instructions below. 124 | ### Default weights from COCO dataset: 125 | - Download weights from the links above; 126 | - In the `configs.py` script, choose your `YOLO_TYPE`; 127 | - In the `configs.py` script, set `YOLO_INPUT_SIZE = 608`; 128 | - In the `configs.py` script, set `YOLO_FRAMEWORK = "trt"`; 129 | - From the main directory, in a terminal, type `python tools/Convert_to_pb.py`; 130 | - From the main directory, in a terminal, type `python tools/Convert_to_TRT.py`; 131 | - In the `configs.py` script, set `YOLO_CUSTOM_WEIGHTS = f'checkpoints/{YOLO_TYPE}-trt-{YOLO_TRT_QUANTIZE_MODE}-{YOLO_INPUT_SIZE}'`; 132 | - Now you can run `detection_demo.py`; it's best to test with the `detect_video` function. 133 | 134 | ### Custom trained YOLO weights: 135 | - Download weights from the links above; 136 | - In the `configs.py` script, choose your `YOLO_TYPE`; 137 | - In the `configs.py` script, set `YOLO_INPUT_SIZE = 608`; 138 | - Train a custom YOLO model with the instructions above; 139 | - In the `configs.py` script, set `YOLO_CUSTOM_WEIGHTS = f"{YOLO_TYPE}_custom"`; 140 | - In the `configs.py` script, make sure that `TRAIN_CLASSES` points to your custom classes text file; 141 | - From the main directory, in a terminal, type `python tools/Convert_to_pb.py`; 142 | - From the main directory, in a terminal, type `python tools/Convert_to_TRT.py`; 143 | - In the `configs.py` script, set `YOLO_FRAMEWORK = "trt"`; 144 | - In the `configs.py` script, set `YOLO_CUSTOM_WEIGHTS = f'checkpoints/{YOLO_TYPE}-trt-{YOLO_TRT_QUANTIZE_MODE}-{YOLO_INPUT_SIZE}'`; 145 | - Now you can run `detection_custom.py` to test your custom-trained and converted TensorRT model.
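For orientation, the heart of such a conversion is TensorFlow's TF-TRT converter. The sketch below shows the standard `TrtGraphConverterV2` flow for INT8 as of TF 2.3; it is not a copy of `tools/Convert_to_TRT.py`, and the SavedModel paths are placeholders:
```
import numpy as np
from tensorflow.python.compiler.tensorrt import trt_convert as trt

params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(precision_mode=trt.TrtPrecisionMode.INT8)
converter = trt.TrtGraphConverterV2(input_saved_model_dir="checkpoints/yolov4-608",  # .pb export from Convert_to_pb.py
                                    conversion_params=params)

def calibration_input_fn():
    # INT8 needs representative inputs; real images calibrate better than zeros
    yield (np.zeros((1, 608, 608, 3), dtype=np.float32),)

converter.convert(calibration_input_fn=calibration_input_fn)
converter.save("checkpoints/yolov4-trt-INT8-608")
```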
146 | 147 | What is done: 148 | -------------------- 149 | - [x] Detection with original weights [Tutorial link](https://pylessons.com/YOLOv3-TF2-introduction/) 150 | - [x] Mnist detection training [Tutorial link](https://pylessons.com/YOLOv3-TF2-mnist/) 151 | - [x] Custom detection training [Tutorial link1](https://pylessons.com/YOLOv3-TF2-custrom-train/), [link2](https://pylessons.com/YOLOv3-TF2-custrom-images/) 152 | - [x] Google Colab training [Tutorial link](https://pylessons.com/YOLOv3-TF2-GoogleColab/) 153 | - [x] YOLOv3-Tiny support [Tutorial link](https://pylessons.com/YOLOv3-TF2-Tiny/) 154 | - [x] Object tracking [Tutorial link](https://pylessons.com/YOLOv3-TF2-DeepSort/) 155 | - [x] Mean Average Precision (mAP) [Tutorial link](https://pylessons.com/YOLOv3-TF2-mAP/) 156 | - [x] Yolo v3 on Raspberry Pi [Tutorial link](https://pylessons.com/YOLOv3-TF2-RaspberryPi/) 157 | - [x] YOLOv4 and YOLOv4-tiny detection [Tutorial link](https://pylessons.com/YOLOv4-TF2-introduction/) 158 | - [x] YOLOv4 and YOLOv4-tiny detection training (not fully) [Tutorial link](https://pylessons.com/YOLOv4-TF2-training/) 159 | - [x] Convert to TensorRT model [Tutorial link](https://pylessons.com/YOLOv4-TF2-TensorRT/) 160 | - [x] Add multiprocessing after detection (drawing bbox) [Tutorial link](https://pylessons.com/YOLOv4-TF2-multiprocessing/) 161 | - [x] Generate YOLO Object Detection training data from its own results [Tutorial link](https://pylessons.com/YOLOv4-TF2-CreateXML/) 162 | - [x] Counter-Strike: Global Offensive real-time YOLOv4 Object Detection aimbot [Tutorial link](https://pylessons.com/YOLOv4-TF2-CSGO-aimbot/) 163 | 164 | To be continued... (not anytime soon) 165 | -------------------- 166 | - [ ] Converting to TensorFlow Lite 167 | - [ ] YOLO on Android (Leaving it for the future; will need to convert everything to Java... not ready for this) 168 | - [ ] Generating anchors 169 | - [ ] YOLACT: Real-time Instance Segmentation 170 | - [ ] Model pruning (Pruning is a technique in deep learning that aids in the development of smaller and more efficient neural networks. It's a model optimization technique that involves eliminating unnecessary values in the weight tensor.) 171 | -------------------------------------------------------------------------------- /checkpoints/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "yolov3_custom_2" 2 | all_model_checkpoint_paths: "yolov3_custom_2" 3 | -------------------------------------------------------------------------------- /deep_sort/detection.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | 4 | 5 | class Detection(object): 6 | """ 7 | This class represents a bounding box detection in a single image. 8 | 9 | Parameters 10 | ---------- 11 | tlwh : array_like 12 | Bounding box in format `(x, y, w, h)`. 13 | confidence : float 14 | Detector confidence score. 15 | feature : array_like 16 | A feature vector that describes the object contained in this image. 17 | 18 | Attributes 19 | ---------- 20 | tlwh : ndarray 21 | Bounding box in format `(top left x, top left y, width, height)`. 22 | confidence : float 23 | Detector confidence score. 24 | class_name : str 25 | Detector class. 26 | feature : ndarray | NoneType 27 | A feature vector that describes the object contained in this image.
28 | 29 | """ 30 | 31 | def __init__(self, tlwh, confidence, class_name, feature): 32 | self.tlwh = np.asarray(tlwh, dtype=np.float) 33 | self.confidence = float(confidence) 34 | self.class_name = class_name 35 | self.feature = np.asarray(feature, dtype=np.float32) 36 | 37 | def get_class(self): 38 | return self.class_name 39 | 40 | def to_tlbr(self): 41 | """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., 42 | `(top left, bottom right)`. 43 | """ 44 | ret = self.tlwh.copy() 45 | ret[2:] += ret[:2] 46 | return ret 47 | 48 | def to_xyah(self): 49 | """Convert bounding box to format `(center x, center y, aspect ratio, 50 | height)`, where the aspect ratio is `width / height`. 51 | """ 52 | ret = self.tlwh.copy() 53 | ret[:2] += ret[2:] / 2 54 | ret[2] /= ret[3] 55 | return ret 56 | -------------------------------------------------------------------------------- /deep_sort/generate_detections.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import os 3 | import errno 4 | import argparse 5 | import numpy as np 6 | import cv2 7 | import tensorflow.compat.v1 as tf 8 | 9 | physical_devices = tf.config.experimental.list_physical_devices('GPU') 10 | if len(physical_devices) > 0: 11 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 12 | 13 | def _run_in_batches(f, data_dict, out, batch_size): 14 | data_len = len(out) 15 | num_batches = int(data_len / batch_size) 16 | 17 | s, e = 0, 0 18 | for i in range(num_batches): 19 | s, e = i * batch_size, (i + 1) * batch_size 20 | batch_data_dict = {k: v[s:e] for k, v in data_dict.items()} 21 | out[s:e] = f(batch_data_dict) 22 | if e < len(out): 23 | batch_data_dict = {k: v[e:] for k, v in data_dict.items()} 24 | out[e:] = f(batch_data_dict) 25 | 26 | 27 | def extract_image_patch(image, bbox, patch_shape): 28 | """Extract image patch from bounding box. 29 | 30 | Parameters 31 | ---------- 32 | image : ndarray 33 | The full image. 34 | bbox : array_like 35 | The bounding box in format (x, y, width, height). 36 | patch_shape : Optional[array_like] 37 | This parameter can be used to enforce a desired patch shape 38 | (height, width). First, the `bbox` is adapted to the aspect ratio 39 | of the patch shape, then it is clipped at the image boundaries. 40 | If None, the shape is computed from :arg:`bbox`. 41 | 42 | Returns 43 | ------- 44 | ndarray | NoneType 45 | An image patch showing the :arg:`bbox`, optionally reshaped to 46 | :arg:`patch_shape`. 47 | Returns None if the bounding box is empty or fully outside of the image 48 | boundaries. 
49 | 50 | """ 51 | bbox = np.array(bbox) 52 | if patch_shape is not None: 53 | # correct aspect ratio to patch shape 54 | target_aspect = float(patch_shape[1]) / patch_shape[0] 55 | new_width = target_aspect * bbox[3] 56 | bbox[0] -= (new_width - bbox[2]) / 2 57 | bbox[2] = new_width 58 | 59 | # convert to top left, bottom right 60 | bbox[2:] += bbox[:2] 61 | bbox = bbox.astype(np.int) 62 | 63 | # clip at image boundaries 64 | bbox[:2] = np.maximum(0, bbox[:2]) 65 | bbox[2:] = np.minimum(np.asarray(image.shape[:2][::-1]) - 1, bbox[2:]) 66 | if np.any(bbox[:2] >= bbox[2:]): 67 | return None 68 | sx, sy, ex, ey = bbox 69 | image = image[sy:ey, sx:ex] 70 | image = cv2.resize(image, tuple(patch_shape[::-1])) 71 | return image 72 | 73 | 74 | class ImageEncoder(object): 75 | 76 | def __init__(self, checkpoint_filename, input_name="images", output_name="features"): 77 | self.session = tf.Session() 78 | with tf.gfile.GFile(checkpoint_filename, "rb") as file_handle: 79 | graph_def = tf.GraphDef() 80 | graph_def.ParseFromString(file_handle.read()) 81 | tf.import_graph_def(graph_def) 82 | try: 83 | self.input_var = tf.get_default_graph().get_tensor_by_name(input_name) 84 | self.output_var = tf.get_default_graph().get_tensor_by_name(output_name) 85 | except KeyError: 86 | layers = [i.name for i in tf.get_default_graph().get_operations()] 87 | self.input_var = tf.get_default_graph().get_tensor_by_name(layers[0]+':0') 88 | self.output_var = tf.get_default_graph().get_tensor_by_name(layers[-1]+':0') 89 | 90 | assert len(self.output_var.get_shape()) == 2 91 | assert len(self.input_var.get_shape()) == 4 92 | self.feature_dim = self.output_var.get_shape().as_list()[-1] 93 | self.image_shape = self.input_var.get_shape().as_list()[1:] 94 | 95 | def __call__(self, data_x, batch_size=32): 96 | out = np.zeros((len(data_x), self.feature_dim), np.float32) 97 | _run_in_batches( 98 | lambda x: self.session.run(self.output_var, feed_dict=x), 99 | {self.input_var: data_x}, out, batch_size) 100 | return out 101 | 102 | 103 | def create_box_encoder(model_filename, input_name="images:0", output_name="features:0", batch_size=32): 104 | image_encoder = ImageEncoder(model_filename, input_name, output_name) 105 | image_shape = image_encoder.image_shape 106 | 107 | def encoder(image, boxes): 108 | image_patches = [] 109 | for box in boxes: 110 | patch = extract_image_patch(image, box, image_shape[:2]) 111 | if patch is None: 112 | print("WARNING: Failed to extract image patch: %s." % str(box)) 113 | patch = np.random.uniform(0., 255., image_shape).astype(np.uint8) 114 | image_patches.append(patch) 115 | image_patches = np.asarray(image_patches) 116 | return image_encoder(image_patches, batch_size) 117 | 118 | return encoder 119 | 120 | 121 | def generate_detections(encoder, mot_dir, output_dir, detection_dir=None): 122 | """Generate detections with features. 123 | 124 | Parameters 125 | ---------- 126 | encoder : Callable[image, ndarray] -> ndarray 127 | The encoder function takes as input a BGR color image and a matrix of 128 | bounding boxes in format `(x, y, w, h)` and returns a matrix of 129 | corresponding feature vectors. 130 | mot_dir : str 131 | Path to the MOTChallenge directory (can be either train or test). 132 | output_dir 133 | Path to the output directory. Will be created if it does not exist. 134 | detection_dir 135 | Path to custom detections. The directory structure should be the default 136 | MOTChallenge structure: `[sequence]/det/det.txt`. If None, uses the 137 | standard MOTChallenge detections. 
138 | 139 | """ 140 | if detection_dir is None: 141 | detection_dir = mot_dir 142 | try: 143 | os.makedirs(output_dir) 144 | except OSError as exception: 145 | if exception.errno == errno.EEXIST and os.path.isdir(output_dir): 146 | pass 147 | else: 148 | raise ValueError( 149 | "Failed to created output directory '%s'" % output_dir) 150 | 151 | for sequence in os.listdir(mot_dir): 152 | print("Processing %s" % sequence) 153 | sequence_dir = os.path.join(mot_dir, sequence) 154 | 155 | image_dir = os.path.join(sequence_dir, "img1") 156 | image_filenames = { 157 | int(os.path.splitext(f)[0]): os.path.join(image_dir, f) 158 | for f in os.listdir(image_dir)} 159 | 160 | detection_file = os.path.join( 161 | detection_dir, sequence, "det/det.txt") 162 | detections_in = np.loadtxt(detection_file, delimiter=',') 163 | detections_out = [] 164 | 165 | frame_indices = detections_in[:, 0].astype(np.int) 166 | min_frame_idx = frame_indices.astype(np.int).min() 167 | max_frame_idx = frame_indices.astype(np.int).max() 168 | for frame_idx in range(min_frame_idx, max_frame_idx + 1): 169 | print("Frame %05d/%05d" % (frame_idx, max_frame_idx)) 170 | mask = frame_indices == frame_idx 171 | rows = detections_in[mask] 172 | 173 | if frame_idx not in image_filenames: 174 | print("WARNING could not find image for frame %d" % frame_idx) 175 | continue 176 | bgr_image = cv2.imread( 177 | image_filenames[frame_idx], cv2.IMREAD_COLOR) 178 | features = encoder(bgr_image, rows[:, 2:6].copy()) 179 | detections_out += [np.r_[(row, feature)] for row, feature 180 | in zip(rows, features)] 181 | 182 | output_filename = os.path.join(output_dir, "%s.npy" % sequence) 183 | np.save( 184 | output_filename, np.asarray(detections_out), allow_pickle=False) 185 | 186 | 187 | def parse_args(): 188 | """Parse command line arguments. 189 | """ 190 | parser = argparse.ArgumentParser(description="Re-ID feature extractor") 191 | parser.add_argument( 192 | "--model", 193 | default="resources/networks/mars-small128.pb", 194 | help="Path to freezed inference graph protobuf.") 195 | parser.add_argument( 196 | "--mot_dir", help="Path to MOTChallenge directory (train or test)", 197 | required=True) 198 | parser.add_argument( 199 | "--detection_dir", help="Path to custom detections. Defaults to " 200 | "standard MOT detections Directory structure should be the default " 201 | "MOTChallenge structure: [sequence]/det/det.txt", default=None) 202 | parser.add_argument( 203 | "--output_dir", help="Output directory. Will be created if it does not" 204 | " exist.", default="detections") 205 | return parser.parse_args() 206 | 207 | 208 | def main(): 209 | args = parse_args() 210 | encoder = create_box_encoder(args.model, batch_size=32) 211 | generate_detections(encoder, args.mot_dir, args.output_dir, 212 | args.detection_dir) 213 | 214 | 215 | if __name__ == "__main__": 216 | main() 217 | -------------------------------------------------------------------------------- /deep_sort/iou_matching.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | from __future__ import absolute_import 3 | import numpy as np 4 | from . import linear_assignment 5 | 6 | 7 | def iou(bbox, candidates): 8 | """Computer intersection over union. 9 | 10 | Parameters 11 | ---------- 12 | bbox : ndarray 13 | A bounding box in format `(top left x, top left y, width, height)`. 14 | candidates : ndarray 15 | A matrix of candidate bounding boxes (one per row) in the same format 16 | as `bbox`. 
17 | 18 | Returns 19 | ------- 20 | ndarray 21 | The intersection over union in [0, 1] between the `bbox` and each 22 | candidate. A higher score means a larger fraction of the `bbox` is 23 | occluded by the candidate. 24 | 25 | """ 26 | bbox_tl, bbox_br = bbox[:2], bbox[:2] + bbox[2:] 27 | candidates_tl = candidates[:, :2] 28 | candidates_br = candidates[:, :2] + candidates[:, 2:] 29 | 30 | tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis], 31 | np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]] 32 | br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis], 33 | np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]] 34 | wh = np.maximum(0., br - tl) 35 | 36 | area_intersection = wh.prod(axis=1) 37 | area_bbox = bbox[2:].prod() 38 | area_candidates = candidates[:, 2:].prod(axis=1) 39 | return area_intersection / (area_bbox + area_candidates - area_intersection) 40 | 41 | 42 | def iou_cost(tracks, detections, track_indices=None, 43 | detection_indices=None): 44 | """An intersection over union distance metric. 45 | 46 | Parameters 47 | ---------- 48 | tracks : List[deep_sort.track.Track] 49 | A list of tracks. 50 | detections : List[deep_sort.detection.Detection] 51 | A list of detections. 52 | track_indices : Optional[List[int]] 53 | A list of indices to tracks that should be matched. Defaults to 54 | all `tracks`. 55 | detection_indices : Optional[List[int]] 56 | A list of indices to detections that should be matched. Defaults 57 | to all `detections`. 58 | 59 | Returns 60 | ------- 61 | ndarray 62 | Returns a cost matrix of shape 63 | len(track_indices), len(detection_indices) where entry (i, j) is 64 | `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. 65 | 66 | """ 67 | if track_indices is None: 68 | track_indices = np.arange(len(tracks)) 69 | if detection_indices is None: 70 | detection_indices = np.arange(len(detections)) 71 | 72 | cost_matrix = np.zeros((len(track_indices), len(detection_indices))) 73 | for row, track_idx in enumerate(track_indices): 74 | if tracks[track_idx].time_since_update > 1: 75 | cost_matrix[row, :] = linear_assignment.INFTY_COST 76 | continue 77 | 78 | bbox = tracks[track_idx].to_tlwh() 79 | candidates = np.asarray([detections[i].tlwh for i in detection_indices]) 80 | cost_matrix[row, :] = 1. - iou(bbox, candidates) 81 | return cost_matrix 82 | -------------------------------------------------------------------------------- /deep_sort/kalman_filter.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | import scipy.linalg 4 | 5 | 6 | """ 7 | Table for the 0.95 quantile of the chi-square distribution with N degrees of 8 | freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv 9 | function and used as Mahalanobis gating threshold. 10 | """ 11 | chi2inv95 = { 12 | 1: 3.8415, 13 | 2: 5.9915, 14 | 3: 7.8147, 15 | 4: 9.4877, 16 | 5: 11.070, 17 | 6: 12.592, 18 | 7: 14.067, 19 | 8: 15.507, 20 | 9: 16.919} 21 | 22 | 23 | class KalmanFilter(object): 24 | """ 25 | A simple Kalman filter for tracking bounding boxes in image space. 26 | 27 | The 8-dimensional state space 28 | 29 | x, y, a, h, vx, vy, va, vh 30 | 31 | contains the bounding box center position (x, y), aspect ratio a, height h, 32 | and their respective velocities. 33 | 34 | Object motion follows a constant velocity model. 
The bounding box location 35 | (x, y, a, h) is taken as direct observation of the state space (linear 36 | observation model). 37 | 38 | """ 39 | 40 | def __init__(self): 41 | ndim, dt = 4, 1. 42 | 43 | # Create Kalman filter model matrices. 44 | self._motion_mat = np.eye(2 * ndim, 2 * ndim) 45 | for i in range(ndim): 46 | self._motion_mat[i, ndim + i] = dt 47 | self._update_mat = np.eye(ndim, 2 * ndim) 48 | 49 | # Motion and observation uncertainty are chosen relative to the current 50 | # state estimate. These weights control the amount of uncertainty in 51 | # the model. This is a bit hacky. 52 | self._std_weight_position = 1. / 20 53 | self._std_weight_velocity = 1. / 160 54 | 55 | def initiate(self, measurement): 56 | """Create track from unassociated measurement. 57 | 58 | Parameters 59 | ---------- 60 | measurement : ndarray 61 | Bounding box coordinates (x, y, a, h) with center position (x, y), 62 | aspect ratio a, and height h. 63 | 64 | Returns 65 | ------- 66 | (ndarray, ndarray) 67 | Returns the mean vector (8 dimensional) and covariance matrix (8x8 68 | dimensional) of the new track. Unobserved velocities are initialized 69 | to 0 mean. 70 | 71 | """ 72 | mean_pos = measurement 73 | mean_vel = np.zeros_like(mean_pos) 74 | mean = np.r_[mean_pos, mean_vel] 75 | 76 | std = [ 77 | 2 * self._std_weight_position * measurement[3], 78 | 2 * self._std_weight_position * measurement[3], 79 | 1e-2, 80 | 2 * self._std_weight_position * measurement[3], 81 | 10 * self._std_weight_velocity * measurement[3], 82 | 10 * self._std_weight_velocity * measurement[3], 83 | 1e-5, 84 | 10 * self._std_weight_velocity * measurement[3]] 85 | covariance = np.diag(np.square(std)) 86 | return mean, covariance 87 | 88 | def predict(self, mean, covariance): 89 | """Run Kalman filter prediction step. 90 | 91 | Parameters 92 | ---------- 93 | mean : ndarray 94 | The 8 dimensional mean vector of the object state at the previous 95 | time step. 96 | covariance : ndarray 97 | The 8x8 dimensional covariance matrix of the object state at the 98 | previous time step. 99 | 100 | Returns 101 | ------- 102 | (ndarray, ndarray) 103 | Returns the mean vector and covariance matrix of the predicted 104 | state. Unobserved velocities are initialized to 0 mean. 105 | 106 | """ 107 | std_pos = [ 108 | self._std_weight_position * mean[3], 109 | self._std_weight_position * mean[3], 110 | 1e-2, 111 | self._std_weight_position * mean[3]] 112 | std_vel = [ 113 | self._std_weight_velocity * mean[3], 114 | self._std_weight_velocity * mean[3], 115 | 1e-5, 116 | self._std_weight_velocity * mean[3]] 117 | motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) 118 | 119 | mean = np.dot(self._motion_mat, mean) 120 | covariance = np.linalg.multi_dot(( 121 | self._motion_mat, covariance, self._motion_mat.T)) + motion_cov 122 | 123 | return mean, covariance 124 | 125 | def project(self, mean, covariance): 126 | """Project state distribution to measurement space. 127 | 128 | Parameters 129 | ---------- 130 | mean : ndarray 131 | The state's mean vector (8 dimensional array). 132 | covariance : ndarray 133 | The state's covariance matrix (8x8 dimensional). 134 | 135 | Returns 136 | ------- 137 | (ndarray, ndarray) 138 | Returns the projected mean and covariance matrix of the given state 139 | estimate. 
140 | 141 | """ 142 | std = [ 143 | self._std_weight_position * mean[3], 144 | self._std_weight_position * mean[3], 145 | 1e-1, 146 | self._std_weight_position * mean[3]] 147 | innovation_cov = np.diag(np.square(std)) 148 | 149 | mean = np.dot(self._update_mat, mean) 150 | covariance = np.linalg.multi_dot(( 151 | self._update_mat, covariance, self._update_mat.T)) 152 | return mean, covariance + innovation_cov 153 | 154 | def update(self, mean, covariance, measurement): 155 | """Run Kalman filter correction step. 156 | 157 | Parameters 158 | ---------- 159 | mean : ndarray 160 | The predicted state's mean vector (8 dimensional). 161 | covariance : ndarray 162 | The state's covariance matrix (8x8 dimensional). 163 | measurement : ndarray 164 | The 4 dimensional measurement vector (x, y, a, h), where (x, y) 165 | is the center position, a the aspect ratio, and h the height of the 166 | bounding box. 167 | 168 | Returns 169 | ------- 170 | (ndarray, ndarray) 171 | Returns the measurement-corrected state distribution. 172 | 173 | """ 174 | projected_mean, projected_cov = self.project(mean, covariance) 175 | 176 | chol_factor, lower = scipy.linalg.cho_factor( 177 | projected_cov, lower=True, check_finite=False) 178 | kalman_gain = scipy.linalg.cho_solve( 179 | (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, 180 | check_finite=False).T 181 | innovation = measurement - projected_mean 182 | 183 | new_mean = mean + np.dot(innovation, kalman_gain.T) 184 | new_covariance = covariance - np.linalg.multi_dot(( 185 | kalman_gain, projected_cov, kalman_gain.T)) 186 | return new_mean, new_covariance 187 | 188 | def gating_distance(self, mean, covariance, measurements, 189 | only_position=False): 190 | """Compute gating distance between state distribution and measurements. 191 | 192 | A suitable distance threshold can be obtained from `chi2inv95`. If 193 | `only_position` is False, the chi-square distribution has 4 degrees of 194 | freedom, otherwise 2. 195 | 196 | Parameters 197 | ---------- 198 | mean : ndarray 199 | Mean vector over the state distribution (8 dimensional). 200 | covariance : ndarray 201 | Covariance of the state distribution (8x8 dimensional). 202 | measurements : ndarray 203 | An Nx4 dimensional matrix of N measurements, each in 204 | format (x, y, a, h) where (x, y) is the bounding box center 205 | position, a the aspect ratio, and h the height. 206 | only_position : Optional[bool] 207 | If True, distance computation is done with respect to the bounding 208 | box center position only. 209 | 210 | Returns 211 | ------- 212 | ndarray 213 | Returns an array of length N, where the i-th element contains the 214 | squared Mahalanobis distance between (mean, covariance) and 215 | `measurements[i]`. 
216 | 217 | """ 218 | mean, covariance = self.project(mean, covariance) 219 | if only_position: 220 | mean, covariance = mean[:2], covariance[:2, :2] 221 | measurements = measurements[:, :2] 222 | 223 | cholesky_factor = np.linalg.cholesky(covariance) 224 | d = measurements - mean 225 | z = scipy.linalg.solve_triangular( 226 | cholesky_factor, d.T, lower=True, check_finite=False, 227 | overwrite_b=True) 228 | squared_maha = np.sum(z * z, axis=0) 229 | return squared_maha 230 | -------------------------------------------------------------------------------- /deep_sort/linear_assignment.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | from __future__ import absolute_import 3 | import numpy as np 4 | from scipy.optimize import linear_sum_assignment 5 | from . import kalman_filter 6 | 7 | 8 | INFTY_COST = 1e+5 9 | 10 | 11 | def min_cost_matching( 12 | distance_metric, max_distance, tracks, detections, track_indices=None, 13 | detection_indices=None): 14 | """Solve linear assignment problem. 15 | 16 | Parameters 17 | ---------- 18 | distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray 19 | The distance metric is given a list of tracks and detections as well as 20 | a list of N track indices and M detection indices. The metric should 21 | return the NxM dimensional cost matrix, where element (i, j) is the 22 | association cost between the i-th track in the given track indices and 23 | the j-th detection in the given detection_indices. 24 | max_distance : float 25 | Gating threshold. Associations with cost larger than this value are 26 | disregarded. 27 | tracks : List[track.Track] 28 | A list of predicted tracks at the current time step. 29 | detections : List[detection.Detection] 30 | A list of detections at the current time step. 31 | track_indices : List[int] 32 | List of track indices that maps rows in `cost_matrix` to tracks in 33 | `tracks` (see description above). 34 | detection_indices : List[int] 35 | List of detection indices that maps columns in `cost_matrix` to 36 | detections in `detections` (see description above). 37 | 38 | Returns 39 | ------- 40 | (List[(int, int)], List[int], List[int]) 41 | Returns a tuple with the following three entries: 42 | * A list of matched track and detection indices. 43 | * A list of unmatched track indices. 44 | * A list of unmatched detection indices. 45 | 46 | """ 47 | if track_indices is None: 48 | track_indices = np.arange(len(tracks)) 49 | if detection_indices is None: 50 | detection_indices = np.arange(len(detections)) 51 | 52 | if len(detection_indices) == 0 or len(track_indices) == 0: 53 | return [], track_indices, detection_indices # Nothing to match. 
54 | 55 | cost_matrix = distance_metric( 56 | tracks, detections, track_indices, detection_indices) 57 | cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5 58 | indices = linear_sum_assignment(cost_matrix) 59 | indices = np.asarray(indices) 60 | indices = np.transpose(indices) 61 | matches, unmatched_tracks, unmatched_detections = [], [], [] 62 | for col, detection_idx in enumerate(detection_indices): 63 | if col not in indices[:, 1]: 64 | unmatched_detections.append(detection_idx) 65 | for row, track_idx in enumerate(track_indices): 66 | if row not in indices[:, 0]: 67 | unmatched_tracks.append(track_idx) 68 | for row, col in indices: 69 | track_idx = track_indices[row] 70 | detection_idx = detection_indices[col] 71 | if cost_matrix[row, col] > max_distance: 72 | unmatched_tracks.append(track_idx) 73 | unmatched_detections.append(detection_idx) 74 | else: 75 | matches.append((track_idx, detection_idx)) 76 | return matches, unmatched_tracks, unmatched_detections 77 | 78 | 79 | def matching_cascade( 80 | distance_metric, max_distance, cascade_depth, tracks, detections, 81 | track_indices=None, detection_indices=None): 82 | """Run matching cascade. 83 | 84 | Parameters 85 | ---------- 86 | distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray 87 | The distance metric is given a list of tracks and detections as well as 88 | a list of N track indices and M detection indices. The metric should 89 | return the NxM dimensional cost matrix, where element (i, j) is the 90 | association cost between the i-th track in the given track indices and 91 | the j-th detection in the given detection indices. 92 | max_distance : float 93 | Gating threshold. Associations with cost larger than this value are 94 | disregarded. 95 | cascade_depth: int 96 | The cascade depth, should be se to the maximum track age. 97 | tracks : List[track.Track] 98 | A list of predicted tracks at the current time step. 99 | detections : List[detection.Detection] 100 | A list of detections at the current time step. 101 | track_indices : Optional[List[int]] 102 | List of track indices that maps rows in `cost_matrix` to tracks in 103 | `tracks` (see description above). Defaults to all tracks. 104 | detection_indices : Optional[List[int]] 105 | List of detection indices that maps columns in `cost_matrix` to 106 | detections in `detections` (see description above). Defaults to all 107 | detections. 108 | 109 | Returns 110 | ------- 111 | (List[(int, int)], List[int], List[int]) 112 | Returns a tuple with the following three entries: 113 | * A list of matched track and detection indices. 114 | * A list of unmatched track indices. 115 | * A list of unmatched detection indices. 
116 | 117 | """ 118 | if track_indices is None: 119 | track_indices = list(range(len(tracks))) 120 | if detection_indices is None: 121 | detection_indices = list(range(len(detections))) 122 | 123 | unmatched_detections = detection_indices 124 | matches = [] 125 | for level in range(cascade_depth): 126 | if len(unmatched_detections) == 0: # No detections left 127 | break 128 | 129 | track_indices_l = [ 130 | k for k in track_indices 131 | if tracks[k].time_since_update == 1 + level 132 | ] 133 | if len(track_indices_l) == 0: # Nothing to match at this level 134 | continue 135 | 136 | matches_l, _, unmatched_detections = \ 137 | min_cost_matching( 138 | distance_metric, max_distance, tracks, detections, 139 | track_indices_l, unmatched_detections) 140 | matches += matches_l 141 | unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches)) 142 | return matches, unmatched_tracks, unmatched_detections 143 | 144 | 145 | def gate_cost_matrix( 146 | kf, cost_matrix, tracks, detections, track_indices, detection_indices, 147 | gated_cost=INFTY_COST, only_position=False): 148 | """Invalidate infeasible entries in cost matrix based on the state 149 | distributions obtained by Kalman filtering. 150 | 151 | Parameters 152 | ---------- 153 | kf : The Kalman filter. 154 | cost_matrix : ndarray 155 | The NxM dimensional cost matrix, where N is the number of track indices 156 | and M is the number of detection indices, such that entry (i, j) is the 157 | association cost between `tracks[track_indices[i]]` and 158 | `detections[detection_indices[j]]`. 159 | tracks : List[track.Track] 160 | A list of predicted tracks at the current time step. 161 | detections : List[detection.Detection] 162 | A list of detections at the current time step. 163 | track_indices : List[int] 164 | List of track indices that maps rows in `cost_matrix` to tracks in 165 | `tracks` (see description above). 166 | detection_indices : List[int] 167 | List of detection indices that maps columns in `cost_matrix` to 168 | detections in `detections` (see description above). 169 | gated_cost : Optional[float] 170 | Entries in the cost matrix corresponding to infeasible associations are 171 | set this value. Defaults to a very large value. 172 | only_position : Optional[bool] 173 | If True, only the x, y position of the state distribution is considered 174 | during gating. Defaults to False. 175 | 176 | Returns 177 | ------- 178 | ndarray 179 | Returns the modified cost matrix. 180 | 181 | """ 182 | gating_dim = 2 if only_position else 4 183 | gating_threshold = kalman_filter.chi2inv95[gating_dim] 184 | measurements = np.asarray( 185 | [detections[i].to_xyah() for i in detection_indices]) 186 | for row, track_idx in enumerate(track_indices): 187 | track = tracks[track_idx] 188 | gating_distance = kf.gating_distance( 189 | track.mean, track.covariance, measurements, only_position) 190 | cost_matrix[row, gating_distance > gating_threshold] = gated_cost 191 | return cost_matrix 192 | -------------------------------------------------------------------------------- /deep_sort/nn_matching.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | 4 | 5 | def _pdist(a, b): 6 | """Compute pair-wise squared distance between points in `a` and `b`. 7 | 8 | Parameters 9 | ---------- 10 | a : array_like 11 | An NxM matrix of N samples of dimensionality M. 12 | b : array_like 13 | An LxM matrix of L samples of dimensionality M. 
14 | 15 | Returns 16 | ------- 17 | ndarray 18 | Returns a matrix of size len(a), len(b) such that eleement (i, j) 19 | contains the squared distance between `a[i]` and `b[j]`. 20 | 21 | """ 22 | a, b = np.asarray(a), np.asarray(b) 23 | if len(a) == 0 or len(b) == 0: 24 | return np.zeros((len(a), len(b))) 25 | a2, b2 = np.square(a).sum(axis=1), np.square(b).sum(axis=1) 26 | r2 = -2. * np.dot(a, b.T) + a2[:, None] + b2[None, :] 27 | r2 = np.clip(r2, 0., float(np.inf)) 28 | return r2 29 | 30 | 31 | def _cosine_distance(a, b, data_is_normalized=False): 32 | """Compute pair-wise cosine distance between points in `a` and `b`. 33 | 34 | Parameters 35 | ---------- 36 | a : array_like 37 | An NxM matrix of N samples of dimensionality M. 38 | b : array_like 39 | An LxM matrix of L samples of dimensionality M. 40 | data_is_normalized : Optional[bool] 41 | If True, assumes rows in a and b are unit length vectors. 42 | Otherwise, a and b are explicitly normalized to lenght 1. 43 | 44 | Returns 45 | ------- 46 | ndarray 47 | Returns a matrix of size len(a), len(b) such that eleement (i, j) 48 | contains the squared distance between `a[i]` and `b[j]`. 49 | 50 | """ 51 | if not data_is_normalized: 52 | a = np.asarray(a) / np.linalg.norm(a, axis=1, keepdims=True) 53 | b = np.asarray(b) / np.linalg.norm(b, axis=1, keepdims=True) 54 | return 1. - np.dot(a, b.T) 55 | 56 | 57 | def _nn_euclidean_distance(x, y): 58 | """ Helper function for nearest neighbor distance metric (Euclidean). 59 | 60 | Parameters 61 | ---------- 62 | x : ndarray 63 | A matrix of N row-vectors (sample points). 64 | y : ndarray 65 | A matrix of M row-vectors (query points). 66 | 67 | Returns 68 | ------- 69 | ndarray 70 | A vector of length M that contains for each entry in `y` the 71 | smallest Euclidean distance to a sample in `x`. 72 | 73 | """ 74 | distances = _pdist(x, y) 75 | return np.maximum(0.0, distances.min(axis=0)) 76 | 77 | 78 | def _nn_cosine_distance(x, y): 79 | """ Helper function for nearest neighbor distance metric (cosine). 80 | 81 | Parameters 82 | ---------- 83 | x : ndarray 84 | A matrix of N row-vectors (sample points). 85 | y : ndarray 86 | A matrix of M row-vectors (query points). 87 | 88 | Returns 89 | ------- 90 | ndarray 91 | A vector of length M that contains for each entry in `y` the 92 | smallest cosine distance to a sample in `x`. 93 | 94 | """ 95 | distances = _cosine_distance(x, y) 96 | return distances.min(axis=0) 97 | 98 | 99 | class NearestNeighborDistanceMetric(object): 100 | """ 101 | A nearest neighbor distance metric that, for each target, returns 102 | the closest distance to any sample that has been observed so far. 103 | 104 | Parameters 105 | ---------- 106 | metric : str 107 | Either "euclidean" or "cosine". 108 | matching_threshold: float 109 | The matching threshold. Samples with larger distance are considered an 110 | invalid match. 111 | budget : Optional[int] 112 | If not None, fix samples per class to at most this number. Removes 113 | the oldest samples when the budget is reached. 114 | 115 | Attributes 116 | ---------- 117 | samples : Dict[int -> List[ndarray]] 118 | A dictionary that maps from target identities to the list of samples 119 | that have been observed so far. 
120 | 121 | """ 122 | 123 | def __init__(self, metric, matching_threshold, budget=None): 124 | 125 | 126 | if metric == "euclidean": 127 | self._metric = _nn_euclidean_distance 128 | elif metric == "cosine": 129 | self._metric = _nn_cosine_distance 130 | else: 131 | raise ValueError( 132 | "Invalid metric; must be either 'euclidean' or 'cosine'") 133 | self.matching_threshold = matching_threshold 134 | self.budget = budget 135 | self.samples = {} 136 | 137 | def partial_fit(self, features, targets, active_targets): 138 | """Update the distance metric with new data. 139 | 140 | Parameters 141 | ---------- 142 | features : ndarray 143 | An NxM matrix of N features of dimensionality M. 144 | targets : ndarray 145 | An integer array of associated target identities. 146 | active_targets : List[int] 147 | A list of targets that are currently present in the scene. 148 | 149 | """ 150 | for feature, target in zip(features, targets): 151 | self.samples.setdefault(target, []).append(feature) 152 | if self.budget is not None: 153 | self.samples[target] = self.samples[target][-self.budget:] 154 | self.samples = {k: self.samples[k] for k in active_targets} 155 | 156 | def distance(self, features, targets): 157 | """Compute distance between features and targets. 158 | 159 | Parameters 160 | ---------- 161 | features : ndarray 162 | An NxM matrix of N features of dimensionality M. 163 | targets : List[int] 164 | A list of targets to match the given `features` against. 165 | 166 | Returns 167 | ------- 168 | ndarray 169 | Returns a cost matrix of shape len(targets), len(features), where 170 | element (i, j) contains the closest squared distance between 171 | `targets[i]` and `features[j]`. 172 | 173 | """ 174 | cost_matrix = np.zeros((len(targets), len(features))) 175 | for i, target in enumerate(targets): 176 | cost_matrix[i, :] = self._metric(self.samples[target], features) 177 | return cost_matrix 178 | -------------------------------------------------------------------------------- /deep_sort/preprocessing.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | import cv2 4 | 5 | 6 | def non_max_suppression(boxes, classes, max_bbox_overlap, scores=None): 7 | """Suppress overlapping detections. 8 | 9 | Original code from [1]_ has been adapted to include confidence score. 10 | 11 | .. [1] http://www.pyimagesearch.com/2015/02/16/ 12 | faster-non-maximum-suppression-python/ 13 | 14 | Examples 15 | -------- 16 | 17 | >>> boxes = [d.roi for d in detections] 18 | >>> classes = [d.classes for d in detections] 19 | >>> scores = [d.confidence for d in detections] 20 | >>> indices = non_max_suppression(boxes, max_bbox_overlap, scores) 21 | >>> detections = [detections[i] for i in indices] 22 | 23 | Parameters 24 | ---------- 25 | boxes : ndarray 26 | Array of ROIs (x, y, width, height). 27 | max_bbox_overlap : float 28 | ROIs that overlap more than this values are suppressed. 29 | scores : Optional[array_like] 30 | Detector confidence score. 31 | 32 | Returns 33 | ------- 34 | List[int] 35 | Returns indices of detections that have survived non-maxima suppression. 
36 | 37 | """ 38 | if len(boxes) == 0: 39 | return [] 40 | 41 | boxes = boxes.astype(np.float) 42 | pick = [] 43 | 44 | x1 = boxes[:, 0] 45 | y1 = boxes[:, 1] 46 | x2 = boxes[:, 2] + boxes[:, 0] 47 | y2 = boxes[:, 3] + boxes[:, 1] 48 | 49 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 50 | if scores is not None: 51 | idxs = np.argsort(scores) 52 | else: 53 | idxs = np.argsort(y2) 54 | 55 | while len(idxs) > 0: 56 | last = len(idxs) - 1 57 | i = idxs[last] 58 | pick.append(i) 59 | 60 | xx1 = np.maximum(x1[i], x1[idxs[:last]]) 61 | yy1 = np.maximum(y1[i], y1[idxs[:last]]) 62 | xx2 = np.minimum(x2[i], x2[idxs[:last]]) 63 | yy2 = np.minimum(y2[i], y2[idxs[:last]]) 64 | 65 | w = np.maximum(0, xx2 - xx1 + 1) 66 | h = np.maximum(0, yy2 - yy1 + 1) 67 | 68 | overlap = (w * h) / area[idxs[:last]] 69 | 70 | idxs = np.delete( 71 | idxs, np.concatenate( 72 | ([last], np.where(overlap > max_bbox_overlap)[0]))) 73 | 74 | return pick 75 | -------------------------------------------------------------------------------- /deep_sort/test_tracking.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/deep_sort/test_tracking.gif -------------------------------------------------------------------------------- /deep_sort/track.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | 3 | 4 | class TrackState: 5 | """ 6 | Enumeration type for the single target track state. Newly created tracks are 7 | classified as `tentative` until enough evidence has been collected. Then, 8 | the track state is changed to `confirmed`. Tracks that are no longer alive 9 | are classified as `deleted` to mark them for removal from the set of active 10 | tracks. 11 | 12 | """ 13 | 14 | Tentative = 1 15 | Confirmed = 2 16 | Deleted = 3 17 | 18 | 19 | class Track: 20 | """ 21 | A single target track with state space `(x, y, a, h)` and associated 22 | velocities, where `(x, y)` is the center of the bounding box, `a` is the 23 | aspect ratio and `h` is the height. 24 | 25 | Parameters 26 | ---------- 27 | mean : ndarray 28 | Mean vector of the initial state distribution. 29 | covariance : ndarray 30 | Covariance matrix of the initial state distribution. 31 | track_id : int 32 | A unique track identifier. 33 | n_init : int 34 | Number of consecutive detections before the track is confirmed. The 35 | track state is set to `Deleted` if a miss occurs within the first 36 | `n_init` frames. 37 | max_age : int 38 | The maximum number of consecutive misses before the track state is 39 | set to `Deleted`. 40 | feature : Optional[ndarray] 41 | Feature vector of the detection this track originates from. If not None, 42 | this feature is added to the `features` cache. 43 | 44 | Attributes 45 | ---------- 46 | mean : ndarray 47 | Mean vector of the initial state distribution. 48 | covariance : ndarray 49 | Covariance matrix of the initial state distribution. 50 | track_id : int 51 | A unique track identifier. 52 | hits : int 53 | Total number of measurement updates. 54 | age : int 55 | Total number of frames since first occurance. 56 | time_since_update : int 57 | Total number of frames since last measurement update. 58 | state : TrackState 59 | The current track state. 60 | features : List[ndarray] 61 | A cache of features. On each measurement update, the associated feature 62 | vector is added to this list. 
63 | 64 | """ 65 | 66 | def __init__(self, mean, covariance, track_id, n_init, max_age, 67 | feature=None, class_name=None): 68 | self.mean = mean 69 | self.covariance = covariance 70 | self.track_id = track_id 71 | self.hits = 1 72 | self.age = 1 73 | self.time_since_update = 0 74 | 75 | self.state = TrackState.Tentative 76 | self.features = [] 77 | if feature is not None: 78 | self.features.append(feature) 79 | 80 | self._n_init = n_init 81 | self._max_age = max_age 82 | self.class_name = class_name 83 | 84 | def to_tlwh(self): 85 | """Get current position in bounding box format `(top left x, top left y, 86 | width, height)`. 87 | 88 | Returns 89 | ------- 90 | ndarray 91 | The bounding box. 92 | 93 | """ 94 | ret = self.mean[:4].copy() 95 | ret[2] *= ret[3] 96 | ret[:2] -= ret[2:] / 2 97 | return ret 98 | 99 | def to_tlbr(self): 100 | """Get current position in bounding box format `(min x, miny, max x, 101 | max y)`. 102 | 103 | Returns 104 | ------- 105 | ndarray 106 | The bounding box. 107 | 108 | """ 109 | ret = self.to_tlwh() 110 | ret[2:] = ret[:2] + ret[2:] 111 | return ret 112 | 113 | def get_class(self): 114 | return self.class_name 115 | 116 | def predict(self, kf): 117 | """Propagate the state distribution to the current time step using a 118 | Kalman filter prediction step. 119 | 120 | Parameters 121 | ---------- 122 | kf : kalman_filter.KalmanFilter 123 | The Kalman filter. 124 | 125 | """ 126 | self.mean, self.covariance = kf.predict(self.mean, self.covariance) 127 | self.age += 1 128 | self.time_since_update += 1 129 | 130 | def update(self, kf, detection): 131 | """Perform Kalman filter measurement update step and update the feature 132 | cache. 133 | 134 | Parameters 135 | ---------- 136 | kf : kalman_filter.KalmanFilter 137 | The Kalman filter. 138 | detection : Detection 139 | The associated detection. 140 | 141 | """ 142 | self.mean, self.covariance = kf.update( 143 | self.mean, self.covariance, detection.to_xyah()) 144 | self.features.append(detection.feature) 145 | 146 | self.hits += 1 147 | self.time_since_update = 0 148 | if self.state == TrackState.Tentative and self.hits >= self._n_init: 149 | self.state = TrackState.Confirmed 150 | 151 | def mark_missed(self): 152 | """Mark this track as missed (no association at the current time step). 153 | """ 154 | if self.state == TrackState.Tentative: 155 | self.state = TrackState.Deleted 156 | elif self.time_since_update > self._max_age: 157 | self.state = TrackState.Deleted 158 | 159 | def is_tentative(self): 160 | """Returns True if this track is tentative (unconfirmed). 161 | """ 162 | return self.state == TrackState.Tentative 163 | 164 | def is_confirmed(self): 165 | """Returns True if this track is confirmed.""" 166 | return self.state == TrackState.Confirmed 167 | 168 | def is_deleted(self): 169 | """Returns True if this track is dead and should be deleted.""" 170 | return self.state == TrackState.Deleted 171 | -------------------------------------------------------------------------------- /deep_sort/tracker.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | from __future__ import absolute_import 3 | import numpy as np 4 | from . import kalman_filter 5 | from . import linear_assignment 6 | from . import iou_matching 7 | from .track import Track 8 | 9 | 10 | class Tracker: 11 | """ 12 | This is the multi-target tracker. 
13 | 
14 |     Parameters
15 |     ----------
16 |     metric : nn_matching.NearestNeighborDistanceMetric
17 |         A distance metric for measurement-to-track association.
18 |     max_age : int
19 |         Maximum number of consecutive misses before a track is deleted.
20 |     n_init : int
21 |         Number of consecutive detections before the track is confirmed. The
22 |         track state is set to `Deleted` if a miss occurs within the first
23 |         `n_init` frames.
24 | 
25 |     Attributes
26 |     ----------
27 |     metric : nn_matching.NearestNeighborDistanceMetric
28 |         The distance metric used for measurement to track association.
29 |     max_age : int
30 |         Maximum number of consecutive misses before a track is deleted.
31 |     n_init : int
32 |         Number of frames that a track remains in initialization phase.
33 |     kf : kalman_filter.KalmanFilter
34 |         A Kalman filter to filter target trajectories in image space.
35 |     tracks : List[Track]
36 |         The list of active tracks at the current time step.
37 | 
38 |     """
39 | 
40 |     def __init__(self, metric, max_iou_distance=0.7, max_age=30, n_init=3):
41 |         self.metric = metric
42 |         self.max_iou_distance = max_iou_distance
43 |         self.max_age = max_age
44 |         self.n_init = n_init
45 | 
46 |         self.kf = kalman_filter.KalmanFilter()
47 |         self.tracks = []
48 |         self._next_id = 1
49 | 
50 |     def predict(self):
51 |         """Propagate track state distributions one time step forward.
52 | 
53 |         This function should be called once every time step, before `update`.
54 |         """
55 |         for track in self.tracks:
56 |             track.predict(self.kf)
57 | 
58 |     def update(self, detections):
59 |         """Perform measurement update and track management.
60 | 
61 |         Parameters
62 |         ----------
63 |         detections : List[deep_sort.detection.Detection]
64 |             A list of detections at the current time step.
65 | 
66 |         """
67 |         # Run matching cascade.
68 |         matches, unmatched_tracks, unmatched_detections = \
69 |             self._match(detections)
70 | 
71 |         # Update track set.
72 |         for track_idx, detection_idx in matches:
73 |             self.tracks[track_idx].update(
74 |                 self.kf, detections[detection_idx])
75 |         for track_idx in unmatched_tracks:
76 |             self.tracks[track_idx].mark_missed()
77 |         for detection_idx in unmatched_detections:
78 |             self._initiate_track(detections[detection_idx])
79 |         self.tracks = [t for t in self.tracks if not t.is_deleted()]
80 | 
81 |         # Update distance metric.
82 |         active_targets = [t.track_id for t in self.tracks if t.is_confirmed()]
83 |         features, targets = [], []
84 |         for track in self.tracks:
85 |             if not track.is_confirmed():
86 |                 continue
87 |             features += track.features
88 |             targets += [track.track_id for _ in track.features]
89 |             track.features = []
90 |         self.metric.partial_fit(
91 |             np.asarray(features), np.asarray(targets), active_targets)
92 | 
93 |     def _match(self, detections):
94 | 
95 |         def gated_metric(tracks, dets, track_indices, detection_indices):
96 |             features = np.array([dets[i].feature for i in detection_indices])
97 |             targets = np.array([tracks[i].track_id for i in track_indices])
98 |             cost_matrix = self.metric.distance(features, targets)
99 |             cost_matrix = linear_assignment.gate_cost_matrix(
100 |                 self.kf, cost_matrix, tracks, dets, track_indices,
101 |                 detection_indices)
102 | 
103 |             return cost_matrix
104 | 
105 |         # Split track set into confirmed and unconfirmed tracks.
106 |         confirmed_tracks = [
107 |             i for i, t in enumerate(self.tracks) if t.is_confirmed()]
108 |         unconfirmed_tracks = [
109 |             i for i, t in enumerate(self.tracks) if not t.is_confirmed()]
110 | 
111 |         # Associate confirmed tracks using appearance features.
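# The cascade tries tracks with the smallest `time_since_update` first, so recently updated tracks get priority over long-missed ones.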
112 |         matches_a, unmatched_tracks_a, unmatched_detections = \
113 |             linear_assignment.matching_cascade(
114 |                 gated_metric, self.metric.matching_threshold, self.max_age,
115 |                 self.tracks, detections, confirmed_tracks)
116 | 
117 |         # Associate remaining tracks together with unconfirmed tracks using IOU.
118 |         iou_track_candidates = unconfirmed_tracks + [
119 |             k for k in unmatched_tracks_a if
120 |             self.tracks[k].time_since_update == 1]
121 |         unmatched_tracks_a = [
122 |             k for k in unmatched_tracks_a if
123 |             self.tracks[k].time_since_update != 1]
124 |         matches_b, unmatched_tracks_b, unmatched_detections = \
125 |             linear_assignment.min_cost_matching(
126 |                 iou_matching.iou_cost, self.max_iou_distance, self.tracks,
127 |                 detections, iou_track_candidates, unmatched_detections)
128 | 
129 |         matches = matches_a + matches_b
130 |         unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b))
131 |         return matches, unmatched_tracks, unmatched_detections
132 | 
133 |     def _initiate_track(self, detection):
134 |         mean, covariance = self.kf.initiate(detection.to_xyah())
135 |         class_name = detection.get_class()
136 |         self.tracks.append(Track(
137 |             mean, covariance, self._next_id, self.n_init, self.max_age,
138 |             detection.feature, class_name))
139 |         self._next_id += 1
140 | 
--------------------------------------------------------------------------------
/detect_mnist.py:
--------------------------------------------------------------------------------
1 | #================================================================
2 | #
3 | # File name : detect_mnist.py
4 | # Author : PyLessons
5 | # Created date: 2020-08-12
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : mnist object detection example
9 | #
10 | #================================================================
11 | import os
12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
13 | import cv2
14 | import numpy as np
15 | import random
16 | import time
17 | import tensorflow as tf
18 | from yolov3.yolov4 import Create_Yolo
19 | from yolov3.utils import detect_image
20 | from yolov3.configs import *
21 | 
22 | yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=TRAIN_CLASSES) # create the model once, not on every loop iteration
23 | yolo.load_weights(f"./checkpoints/{TRAIN_MODEL_NAME}") # use keras weights
24 | 
25 | while True:
26 |     ID = random.randint(0, 200)
27 |     label_txt = "mnist/mnist_test.txt"
28 |     image_info = open(label_txt).readlines()[ID].split()
29 | 
30 |     image_path = image_info[0]
31 | 
32 |     detect_image(yolo, image_path, "mnist_test.jpg", input_size=YOLO_INPUT_SIZE, show=True, CLASSES=TRAIN_CLASSES, rectangle_colors=(255,0,0))
33 | 
--------------------------------------------------------------------------------
/detection_custom.py:
--------------------------------------------------------------------------------
1 | #================================================================
2 | #
3 | # File name : detection_custom.py
4 | # Author : PyLessons
5 | # Created date: 2020-09-17
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : object detection image and video example
9 | #
10 | #================================================================
11 | import os
12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
13 | import cv2
14 | import numpy as np
15 | import tensorflow as tf
16 | from yolov3.utils import detect_image, detect_realtime, detect_video, Load_Yolo_model, detect_video_realtime_mp
17 | from yolov3.configs import *
18 | 
image_path = "./IMAGES/plate_2.jpg" 20 | video_path = "./IMAGES/test.mp4" 21 | 22 | yolo = Load_Yolo_model() 23 | detect_image(yolo, image_path, "./IMAGES/plate_1_detect.jpg", input_size=YOLO_INPUT_SIZE, show=True, CLASSES=TRAIN_CLASSES, rectangle_colors=(255,0,0)) 24 | #detect_video(yolo, video_path, './IMAGES/detected.mp4', input_size=YOLO_INPUT_SIZE, show=False, CLASSES=TRAIN_CLASSES, rectangle_colors=(255,0,0)) 25 | #detect_realtime(yolo, '', input_size=YOLO_INPUT_SIZE, show=True, CLASSES=TRAIN_CLASSES, rectangle_colors=(255, 0, 0)) 26 | 27 | #detect_video_realtime_mp(video_path, "Output.mp4", input_size=YOLO_INPUT_SIZE, show=True, CLASSES=TRAIN_CLASSES, rectangle_colors=(255,0,0), realtime=False) 28 | -------------------------------------------------------------------------------- /detection_demo.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : detection_demo.py 4 | # Author : PyLessons 5 | # Created date: 2020-09-27 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : object detection image and video example 9 | # 10 | #================================================================ 11 | import os 12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 13 | import cv2 14 | import numpy as np 15 | import tensorflow as tf 16 | from yolov3.utils import detect_image, detect_realtime, detect_video, Load_Yolo_model, detect_video_realtime_mp 17 | from yolov3.configs import * 18 | 19 | image_path = "./IMAGES/kite.jpg" 20 | video_path = "./IMAGES/test.mp4" 21 | 22 | yolo = Load_Yolo_model() 23 | detect_image(yolo, image_path, "./IMAGES/kite_pred.jpg", input_size=YOLO_INPUT_SIZE, show=True, rectangle_colors=(255,0,0)) 24 | #detect_video(yolo, video_path, "", input_size=YOLO_INPUT_SIZE, show=False, rectangle_colors=(255,0,0)) 25 | #detect_realtime(yolo, '', input_size=YOLO_INPUT_SIZE, show=True, rectangle_colors=(255, 0, 0)) 26 | 27 | #detect_video_realtime_mp(video_path, "Output.mp4", input_size=YOLO_INPUT_SIZE, show=False, rectangle_colors=(255,0,0), realtime=False) 28 | -------------------------------------------------------------------------------- /evaluate_mAP.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : evaluate_mAP.py 4 | # Author : PyLessons 5 | # Created date: 2020-08-17 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : used to evaluate model mAP and FPS 9 | # 10 | #================================================================ 11 | import os 12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 13 | import cv2 14 | import numpy as np 15 | import tensorflow as tf 16 | from tensorflow.python.saved_model import tag_constants 17 | from yolov3.dataset import Dataset 18 | from yolov3.yolov4 import Create_Yolo 19 | from yolov3.utils import load_yolo_weights, detect_image, image_preprocess, postprocess_boxes, nms, read_class_names 20 | from yolov3.configs import * 21 | import shutil 22 | import json 23 | import time 24 | 25 | gpus = tf.config.experimental.list_physical_devices('GPU') 26 | if len(gpus) > 0: 27 | try: tf.config.experimental.set_memory_growth(gpus[0], True) 28 | except RuntimeError: print("RuntimeError in tf.config.experimental.list_physical_devices('GPU')") 29 | 30 | 31 | def 
32 |     """
33 |     --- Official matlab code VOC2012---
34 |     mrec=[0 ; rec ; 1];
35 |     mpre=[0 ; prec ; 0];
36 |     for i=numel(mpre)-1:-1:1
37 |         mpre(i)=max(mpre(i),mpre(i+1));
38 |     end
39 |     i=find(mrec(2:end)~=mrec(1:end-1))+1;
40 |     ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
41 |     """
42 |     rec.insert(0, 0.0) # insert 0.0 at beginning of list
43 |     rec.append(1.0) # insert 1.0 at end of list
44 |     mrec = rec[:]
45 |     prec.insert(0, 0.0) # insert 0.0 at beginning of list
46 |     prec.append(0.0) # insert 0.0 at end of list
47 |     mpre = prec[:]
48 |     """
49 |     This part makes the precision monotonically decreasing
50 |     (goes from the end to the beginning)
51 |     matlab: for i=numel(mpre)-1:-1:1
52 |                 mpre(i)=max(mpre(i),mpre(i+1));
53 |     """
54 |     # matlab indexes start in 1 but python in 0, so I have to do:
55 |     # range(start=(len(mpre) - 2), end=0, step=-1)
56 |     # also the python function range excludes the end, resulting in:
57 |     # range(start=(len(mpre) - 2), end=-1, step=-1)
58 |     for i in range(len(mpre)-2, -1, -1):
59 |         mpre[i] = max(mpre[i], mpre[i+1])
60 |     """
61 |     This part creates a list of indexes where the recall changes
62 |     matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1;
63 |     """
64 |     i_list = []
65 |     for i in range(1, len(mrec)):
66 |         if mrec[i] != mrec[i-1]:
67 |             i_list.append(i) # if it was matlab would be i + 1
68 |     """
69 |     The Average Precision (AP) is the area under the curve
70 |     (numerical integration)
71 |     matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
72 |     """
73 |     ap = 0.0
74 |     for i in i_list:
75 |         ap += ((mrec[i]-mrec[i-1])*mpre[i])
76 |     return ap, mrec, mpre
77 | 
78 | 
79 | def get_mAP(Yolo, dataset, score_threshold=0.25, iou_threshold=0.50, TEST_INPUT_SIZE=TEST_INPUT_SIZE):
80 |     MINOVERLAP = 0.5 # default value (defined in the PASCAL VOC2012 challenge)
81 |     NUM_CLASS = read_class_names(TRAIN_CLASSES)
82 | 
83 |     ground_truth_dir_path = 'mAP/ground-truth'
84 |     if os.path.exists(ground_truth_dir_path): shutil.rmtree(ground_truth_dir_path)
85 | 
86 |     if not os.path.exists('mAP'): os.mkdir('mAP')
87 |     os.mkdir(ground_truth_dir_path)
88 | 
89 |     print(f'\ncalculating mAP{int(iou_threshold*100)}...\n')
90 | 
91 |     gt_counter_per_class = {}
92 |     for index in range(dataset.num_samples):
93 |         ann_dataset = dataset.annotations[index]
94 | 
95 |         original_image, bbox_data_gt = dataset.parse_annotation(ann_dataset, True)
96 | 
97 |         if len(bbox_data_gt) == 0:
98 |             bboxes_gt = []
99 |             classes_gt = []
100 |         else:
101 |             bboxes_gt, classes_gt = bbox_data_gt[:, :4], bbox_data_gt[:, 4]
102 |         ground_truth_path = os.path.join(ground_truth_dir_path, str(index) + '.txt')
103 |         num_bbox_gt = len(bboxes_gt)
104 | 
105 |         bounding_boxes = []
106 |         for i in range(num_bbox_gt):
107 |             class_name = NUM_CLASS[classes_gt[i]]
108 |             xmin, ymin, xmax, ymax = list(map(str, bboxes_gt[i]))
109 |             bbox = xmin + " " + ymin + " " + xmax + " " + ymax
110 |             bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False})
111 | 
112 |             # count that object
113 |             if class_name in gt_counter_per_class:
114 |                 gt_counter_per_class[class_name] += 1
115 |             else:
116 |                 # if class didn't exist yet
117 |                 gt_counter_per_class[class_name] = 1
118 |             bbox_mess = ' '.join([class_name, xmin, ymin, xmax, ymax]) + '\n'
119 |         with open(f'{ground_truth_dir_path}/{str(index)}_ground_truth.json', 'w') as outfile:
120 |             json.dump(bounding_boxes, outfile)
121 | 
122 |     gt_classes = list(gt_counter_per_class.keys())
123 |     # sort the classes alphabetically
124 |     gt_classes = sorted(gt_classes)
125 |     n_classes = len(gt_classes)
126 | 
127 |     times = []
128
| json_pred = [[] for i in range(n_classes)] 129 | for index in range(dataset.num_samples): 130 | ann_dataset = dataset.annotations[index] 131 | 132 | image_name = ann_dataset[0].split('/')[-1] 133 | original_image, bbox_data_gt = dataset.parse_annotation(ann_dataset, True) 134 | 135 | image = image_preprocess(np.copy(original_image), [TEST_INPUT_SIZE, TEST_INPUT_SIZE]) 136 | image_data = image[np.newaxis, ...].astype(np.float32) 137 | 138 | t1 = time.time() 139 | if YOLO_FRAMEWORK == "tf": 140 | if tf.__version__ > '2.4.0': 141 | pred_bbox = Yolo(image_data) 142 | else: 143 | pred_bbox = Yolo.predict(image_data) 144 | elif YOLO_FRAMEWORK == "trt": 145 | batched_input = tf.constant(image_data) 146 | result = Yolo(batched_input) 147 | pred_bbox = [] 148 | for key, value in result.items(): 149 | value = value.numpy() 150 | pred_bbox.append(value) 151 | 152 | t2 = time.time() 153 | 154 | times.append(t2-t1) 155 | 156 | pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox] 157 | pred_bbox = tf.concat(pred_bbox, axis=0) 158 | 159 | bboxes = postprocess_boxes(pred_bbox, original_image, TEST_INPUT_SIZE, score_threshold) 160 | bboxes = nms(bboxes, iou_threshold, method='nms') 161 | 162 | for bbox in bboxes: 163 | coor = np.array(bbox[:4], dtype=np.int32) 164 | score = bbox[4] 165 | class_ind = int(bbox[5]) 166 | class_name = NUM_CLASS[class_ind] 167 | score = '%.4f' % score 168 | xmin, ymin, xmax, ymax = list(map(str, coor)) 169 | bbox = xmin + " " + ymin + " " + xmax + " " +ymax 170 | json_pred[gt_classes.index(class_name)].append({"confidence": str(score), "file_id": str(index), "bbox": str(bbox)}) 171 | 172 | ms = sum(times)/len(times)*1000 173 | fps = 1000 / ms 174 | 175 | for class_name in gt_classes: 176 | json_pred[gt_classes.index(class_name)].sort(key=lambda x:float(x['confidence']), reverse=True) 177 | with open(f'{ground_truth_dir_path}/{class_name}_predictions.json', 'w') as outfile: 178 | json.dump(json_pred[gt_classes.index(class_name)], outfile) 179 | 180 | # Calculate the AP for each class 181 | sum_AP = 0.0 182 | ap_dictionary = {} 183 | # open file to store the results 184 | with open("mAP/results.txt", 'w') as results_file: 185 | results_file.write("# AP and precision/recall per class\n") 186 | count_true_positives = {} 187 | for class_index, class_name in enumerate(gt_classes): 188 | count_true_positives[class_name] = 0 189 | # Load predictions of that class 190 | predictions_file = f'{ground_truth_dir_path}/{class_name}_predictions.json' 191 | predictions_data = json.load(open(predictions_file)) 192 | 193 | # Assign predictions to ground truth objects 194 | nd = len(predictions_data) 195 | tp = [0] * nd # creates an array of zeros of size nd 196 | fp = [0] * nd 197 | for idx, prediction in enumerate(predictions_data): 198 | file_id = prediction["file_id"] 199 | # assign prediction to ground truth object if any 200 | # open ground-truth with that file_id 201 | gt_file = f'{ground_truth_dir_path}/{str(file_id)}_ground_truth.json' 202 | ground_truth_data = json.load(open(gt_file)) 203 | ovmax = -1 204 | gt_match = -1 205 | # load prediction bounding-box 206 | bb = [ float(x) for x in prediction["bbox"].split() ] # bounding box of prediction 207 | for obj in ground_truth_data: 208 | # look for a class_name match 209 | if obj["class_name"] == class_name: 210 | bbgt = [ float(x) for x in obj["bbox"].split() ] # bounding box of ground truth 211 | bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])] 212 | iw = bi[2] - bi[0] + 1 213 | ih 
= bi[3] - bi[1] + 1 214 | if iw > 0 and ih > 0: 215 | # compute overlap (IoU) = area of intersection / area of union 216 | ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0] 217 | + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih 218 | ov = iw * ih / ua 219 | if ov > ovmax: 220 | ovmax = ov 221 | gt_match = obj 222 | 223 | # assign prediction as true positive/don't care/false positive 224 | if ovmax >= MINOVERLAP:# if ovmax > minimum overlap 225 | if not bool(gt_match["used"]): 226 | # true positive 227 | tp[idx] = 1 228 | gt_match["used"] = True 229 | count_true_positives[class_name] += 1 230 | # update the ".json" file 231 | with open(gt_file, 'w') as f: 232 | f.write(json.dumps(ground_truth_data)) 233 | else: 234 | # false positive (multiple detection) 235 | fp[idx] = 1 236 | else: 237 | # false positive 238 | fp[idx] = 1 239 | 240 | # compute precision/recall 241 | cumsum = 0 242 | for idx, val in enumerate(fp): 243 | fp[idx] += cumsum 244 | cumsum += val 245 | cumsum = 0 246 | for idx, val in enumerate(tp): 247 | tp[idx] += cumsum 248 | cumsum += val 249 | #print(tp) 250 | rec = tp[:] 251 | for idx, val in enumerate(tp): 252 | rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name] 253 | #print(rec) 254 | prec = tp[:] 255 | for idx, val in enumerate(tp): 256 | prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx]) 257 | #print(prec) 258 | 259 | ap, mrec, mprec = voc_ap(rec, prec) 260 | sum_AP += ap 261 | text = "{0:.3f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100) 262 | 263 | rounded_prec = [ '%.3f' % elem for elem in prec ] 264 | rounded_rec = [ '%.3f' % elem for elem in rec ] 265 | # Write to results.txt 266 | results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n") 267 | 268 | print(text) 269 | ap_dictionary[class_name] = ap 270 | 271 | results_file.write("\n# mAP of all classes\n") 272 | mAP = sum_AP / n_classes 273 | 274 | text = "mAP = {:.3f}%, {:.2f} FPS".format(mAP*100, fps) 275 | results_file.write(text + "\n") 276 | print(text) 277 | 278 | return mAP*100 279 | 280 | if __name__ == '__main__': 281 | if YOLO_FRAMEWORK == "tf": # TensorFlow detection 282 | if YOLO_TYPE == "yolov4": 283 | Darknet_weights = YOLO_V4_TINY_WEIGHTS if TRAIN_YOLO_TINY else YOLO_V4_WEIGHTS 284 | if YOLO_TYPE == "yolov3": 285 | Darknet_weights = YOLO_V3_TINY_WEIGHTS if TRAIN_YOLO_TINY else YOLO_V3_WEIGHTS 286 | 287 | if YOLO_CUSTOM_WEIGHTS == False: 288 | yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=YOLO_COCO_CLASSES) 289 | load_yolo_weights(yolo, Darknet_weights) # use Darknet weights 290 | else: 291 | yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=TRAIN_CLASSES) 292 | yolo.load_weights(f"./checkpoints/{TRAIN_MODEL_NAME}") # use custom weights 293 | 294 | elif YOLO_FRAMEWORK == "trt": # TensorRT detection 295 | saved_model_loaded = tf.saved_model.load(f"./checkpoints/{TRAIN_MODEL_NAME}", tags=[tag_constants.SERVING]) 296 | signature_keys = list(saved_model_loaded.signatures.keys()) 297 | yolo = saved_model_loaded.signatures['serving_default'] 298 | 299 | testset = Dataset('test', TEST_INPUT_SIZE=YOLO_INPUT_SIZE) 300 | get_mAP(yolo, testset, score_threshold=0.05, iou_threshold=0.50, TEST_INPUT_SIZE=YOLO_INPUT_SIZE) 301 | -------------------------------------------------------------------------------- /mnist/make_data.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # 
File name : make_data.py
4 | # Author : PyLessons
5 | # Created date: 2020-04-20
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : create mnist example dataset to train custom yolov3
9 | #
10 | #================================================================
11 | import os
12 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
13 | import cv2
14 | import numpy as np
15 | import shutil
16 | import random
17 | from zipfile import ZipFile
18 | 
19 | SIZE = 416
20 | images_num_train = 1000
21 | images_num_test = 200
22 | 
23 | image_sizes = [3, 6, 3] # small, medium, big
24 | 
25 | # this helps to run the script both from a terminal and from the Python IDLE shell
26 | add_path = "mnist"
27 | if os.getcwd().split(os.sep)[-1] != "mnist":
28 |     add_path = "mnist"
29 |     os.chdir(add_path)
30 | else:
31 |     add_path = ""
32 | 
33 | def compute_iou(box1, box2):
34 |     # xmin, ymin, xmax, ymax
35 |     A1 = (box1[2] - box1[0])*(box1[3] - box1[1])
36 |     A2 = (box2[2] - box2[0])*(box2[3] - box2[1])
37 | 
38 |     xmin = max(box1[0], box2[0])
39 |     ymin = max(box1[1], box2[1])
40 |     xmax = min(box1[2], box2[2])
41 |     ymax = min(box1[3], box2[3])
42 | 
43 |     if ymin >= ymax or xmin >= xmax: return 0
44 |     return ((xmax-xmin) * (ymax - ymin)) / (A1 + A2) # intersection over the sum of areas, not the true IoU (a real union would subtract the intersection); good enough as a placement-overlap test
45 | 
46 | 
47 | def make_image(data, image_path, ratio=1):
48 |     blank = data[0]
49 |     boxes = data[1]
50 |     label = data[2]
51 | 
52 |     ID = image_path.split("/")[-1][0]
53 |     image = cv2.imread(image_path)
54 |     image = cv2.resize(image, (int(28*ratio), int(28*ratio)))
55 |     h, w, c = image.shape
56 | 
57 |     while True:
58 |         xmin = np.random.randint(0, SIZE-w, 1)[0]
59 |         ymin = np.random.randint(0, SIZE-h, 1)[0]
60 |         xmax = xmin + w
61 |         ymax = ymin + h
62 |         box = [xmin, ymin, xmax, ymax]
63 | 
64 |         iou = [compute_iou(box, b) for b in boxes]
65 |         if max(iou) < 0.02:
66 |             boxes.append(box)
67 |             label.append(ID)
68 |             break
69 | 
70 |     for i in range(w):
71 |         for j in range(h):
72 |             x = xmin + i
73 |             y = ymin + j
74 |             blank[y][x] = image[j][i]
75 | 
76 |     # cv2.rectangle(blank, (xmin, ymin), (xmax, ymax), [0, 0, 255], 2)
77 |     return blank
78 | 
79 | 
80 | for file in ["train", "test"]:
81 |     if not os.path.exists(f"mnist/{file}"):
82 |         with ZipFile(f"mnist/{file}.zip", 'r') as zip_file: # renamed from 'zip' to avoid shadowing the builtin
83 |             # extracting all the files
84 |             print(f'Extracting all {file} files now...')
85 |             zip_file.extractall()
86 |         shutil.move(file, "mnist")
87 |         print('Done!')
88 | 
89 | for file in ['train','test']:
90 |     images_path = os.getcwd()+f"/mnist_{file}"
91 |     labels_txt = os.getcwd()+f"/mnist_{file}.txt"
92 | 
93 |     if file == 'train': images_num = images_num_train
94 |     if file == 'test': images_num = images_num_test
95 | 
96 |     if os.path.exists(images_path): shutil.rmtree(images_path)
97 |     os.mkdir(images_path)
98 | 
99 |     image_paths = [os.path.join(os.path.realpath("."), os.getcwd()+f"/mnist/{file}/" + image_name)
100 |                    for image_name in os.listdir(os.getcwd()+f"/mnist/{file}")]
101 | 
102 |     with open(labels_txt, "w") as wf:
103 |         image_num = 0
104 |         while image_num < images_num:
105 |             image_path = os.path.realpath(os.path.join(images_path, "%06d.jpg" %(image_num+1)))
106 |             #print(image_path)
107 |             annotation = image_path
108 |             blanks = np.ones(shape=[SIZE, SIZE, 3]) * 255
109 |             bboxes = [[0,0,1,1]] # sentinel box so compute_iou always has something to compare against; skipped when labels are written
110 |             labels = [0]
111 |             data = [blanks, bboxes, labels]
112 |             bboxes_num = 0
113 | 
114 |             # ratios small, medium, big objects
115 |             ratios = [[0.5, 0.8], [1., 1.5, 2.], [3., 4.]]
116 |             for i in range(len(ratios)):
117 |                 N = random.randint(0, image_sizes[i])
118 |                 if N != 0: bboxes_num += 1
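# count scale groups that place at least one digit; images where every group comes up empty are redrawn below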
119 | for _ in range(N): 120 | ratio = random.choice(ratios[i]) 121 | idx = random.randint(0, len(image_paths)-1) 122 | data[0] = make_image(data, image_paths[idx], ratio) 123 | 124 | if bboxes_num == 0: continue 125 | cv2.imwrite(image_path, data[0]) 126 | for i in range(len(labels)): 127 | if i == 0: continue 128 | xmin = str(bboxes[i][0]) 129 | ymin = str(bboxes[i][1]) 130 | xmax = str(bboxes[i][2]) 131 | ymax = str(bboxes[i][3]) 132 | class_ind = str(labels[i]) 133 | annotation += ' ' + ','.join([xmin, ymin, xmax, ymax, str(class_ind)]) 134 | image_num += 1 135 | print("=> %s" %annotation) 136 | wf.write(annotation + "\n") 137 | 138 | if add_path != "": os.chdir("..") 139 | -------------------------------------------------------------------------------- /mnist/mnist.names: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | -------------------------------------------------------------------------------- /mnist/mnist/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/mnist/mnist/test.zip -------------------------------------------------------------------------------- /mnist/mnist/train.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/mnist/mnist/train.zip -------------------------------------------------------------------------------- /mnist/show_image.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : show_image.py 4 | # Author : PyLessons 5 | # Created date: 2020-04-20 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : show random image from created dataset 9 | # 10 | #================================================================ 11 | import random 12 | import cv2 13 | import numpy as np 14 | from PIL import Image 15 | 16 | ID = random.randint(0, 200) 17 | label_txt = "./mnist_train.txt" 18 | image_info = open(label_txt).readlines()[ID].split() 19 | 20 | image_path = image_info[0] 21 | image = cv2.imread(image_path) 22 | for bbox in image_info[1:]: 23 | bbox = bbox.split(",") 24 | image = cv2.rectangle(image,(int(float(bbox[0])), 25 | int(float(bbox[1]))), 26 | (int(float(bbox[2])), 27 | int(float(bbox[3]))), (255,0,0), 2) 28 | 29 | image = Image.fromarray(np.uint8(image)) 30 | image.show() 31 | -------------------------------------------------------------------------------- /model_data/coco/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic-light 11 | fire-hydrant 12 | stop-sign 13 | parking-meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports-ball 34 | kite 35 | baseball-bat 36 | baseball-glove 37 | skateboard 38 | surfboard 39 | tennis-racket 40 | bottle 41 | wine-glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | 
orange
51 | broccoli
52 | carrot
53 | hot-dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell-phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy-bear
79 | hair-drier
80 | toothbrush
81 | 
--------------------------------------------------------------------------------
/model_data/mars-small128.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/model_data/mars-small128.pb
--------------------------------------------------------------------------------
/object_tracker.py:
--------------------------------------------------------------------------------
1 | #================================================================
2 | #
3 | # File name : object_tracker.py
4 | # Author : PyLessons
5 | # Created date: 2020-09-17
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : code to track detected objects from video or webcam
9 | #
10 | #================================================================
11 | import os
12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
13 | import cv2
14 | import numpy as np
15 | import tensorflow as tf
16 | from yolov3.utils import Load_Yolo_model, image_preprocess, postprocess_boxes, nms, draw_bbox, read_class_names
17 | from yolov3.configs import *
18 | import time
19 | 
20 | from deep_sort import nn_matching
21 | from deep_sort.detection import Detection
22 | from deep_sort.tracker import Tracker
23 | from deep_sort import generate_detections as gdet
24 | 
25 | video_path = "./IMAGES/test.mp4"
26 | 
27 | def Object_tracking(Yolo, video_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors='', Track_only = []):
28 |     # Definition of the parameters
29 |     max_cosine_distance = 0.7
30 |     nn_budget = None
31 | 
32 |     # initialize deep sort object
33 |     model_filename = 'model_data/mars-small128.pb'
34 |     encoder = gdet.create_box_encoder(model_filename, batch_size=1)
35 |     metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
36 |     tracker = Tracker(metric)
37 | 
38 |     times, times_2 = [], []
39 | 
40 |     if video_path:
41 |         vid = cv2.VideoCapture(video_path) # detect on video
42 |     else:
43 |         vid = cv2.VideoCapture(0) # detect from webcam
44 | 
45 |     # by default VideoCapture returns float instead of int
46 |     width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
47 |     height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
48 |     fps = int(vid.get(cv2.CAP_PROP_FPS))
49 |     codec = cv2.VideoWriter_fourcc(*'XVID')
50 |     out = cv2.VideoWriter(output_path, codec, fps, (width, height)) # output_path must be .mp4
51 | 
52 |     NUM_CLASS = read_class_names(CLASSES)
53 |     key_list = list(NUM_CLASS.keys())
54 |     val_list = list(NUM_CLASS.values())
55 |     while True:
56 |         _, frame = vid.read()
57 | 
58 |         try:
59 |             original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
60 |             original_frame = cv2.cvtColor(original_frame, cv2.COLOR_BGR2RGB) # swapping channels twice restores BGR order, so this pair effectively copies the frame; its real job is to raise inside the try block when the stream ends
61 |         except: # frame is None once the video ends, so cvtColor raises and we stop
62 |             break
63 | 
64 |         image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
65 |         #image_data = tf.expand_dims(image_data, 0)
66 |         image_data = image_data[np.newaxis, ...].astype(np.float32)
67 | 
68 |         t1 = time.time()
if YOLO_FRAMEWORK == "tf": 70 | pred_bbox = Yolo.predict(image_data) 71 | elif YOLO_FRAMEWORK == "trt": 72 | batched_input = tf.constant(image_data) 73 | result = Yolo(batched_input) 74 | pred_bbox = [] 75 | for key, value in result.items(): 76 | value = value.numpy() 77 | pred_bbox.append(value) 78 | 79 | #t1 = time.time() 80 | #pred_bbox = Yolo.predict(image_data) 81 | t2 = time.time() 82 | 83 | pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox] 84 | pred_bbox = tf.concat(pred_bbox, axis=0) 85 | 86 | bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold) 87 | bboxes = nms(bboxes, iou_threshold, method='nms') 88 | 89 | # extract bboxes to boxes (x, y, width, height), scores and names 90 | boxes, scores, names = [], [], [] 91 | for bbox in bboxes: 92 | if len(Track_only) !=0 and NUM_CLASS[int(bbox[5])] in Track_only or len(Track_only) == 0: 93 | boxes.append([bbox[0].astype(int), bbox[1].astype(int), bbox[2].astype(int)-bbox[0].astype(int), bbox[3].astype(int)-bbox[1].astype(int)]) 94 | scores.append(bbox[4]) 95 | names.append(NUM_CLASS[int(bbox[5])]) 96 | 97 | # Obtain all the detections for the given frame. 98 | boxes = np.array(boxes) 99 | names = np.array(names) 100 | scores = np.array(scores) 101 | features = np.array(encoder(original_frame, boxes)) 102 | detections = [Detection(bbox, score, class_name, feature) for bbox, score, class_name, feature in zip(boxes, scores, names, features)] 103 | 104 | # Pass detections to the deepsort object and obtain the track information. 105 | tracker.predict() 106 | tracker.update(detections) 107 | 108 | # Obtain info from the tracks 109 | tracked_bboxes = [] 110 | for track in tracker.tracks: 111 | if not track.is_confirmed() or track.time_since_update > 5: 112 | continue 113 | bbox = track.to_tlbr() # Get the corrected/predicted bounding box 114 | class_name = track.get_class() #Get the class name of particular object 115 | tracking_id = track.track_id # Get the ID for the particular track 116 | index = key_list[val_list.index(class_name)] # Get predicted object index by object name 117 | tracked_bboxes.append(bbox.tolist() + [tracking_id, index]) # Structure data, that we could use it with our draw_bbox function 118 | 119 | # draw detection on frame 120 | image = draw_bbox(original_frame, tracked_bboxes, CLASSES=CLASSES, tracking=True) 121 | 122 | t3 = time.time() 123 | times.append(t2-t1) 124 | times_2.append(t3-t1) 125 | 126 | times = times[-20:] 127 | times_2 = times_2[-20:] 128 | 129 | ms = sum(times)/len(times)*1000 130 | fps = 1000 / ms 131 | fps2 = 1000 / (sum(times_2)/len(times_2)*1000) 132 | 133 | image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2) 134 | 135 | # draw original yolo detection 136 | #image = draw_bbox(image, bboxes, CLASSES=CLASSES, show_label=False, rectangle_colors=rectangle_colors, tracking=True) 137 | 138 | print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2)) 139 | if output_path != '': out.write(image) 140 | if show: 141 | cv2.imshow('output', image) 142 | 143 | if cv2.waitKey(25) & 0xFF == ord("q"): 144 | cv2.destroyAllWindows() 145 | break 146 | 147 | cv2.destroyAllWindows() 148 | 149 | 150 | yolo = Load_Yolo_model() 151 | Object_tracking(yolo, video_path, "detection.mp4", input_size=YOLO_INPUT_SIZE, show=True, iou_threshold=0.1, rectangle_colors=(255,0,0), Track_only = ["person"]) 152 | -------------------------------------------------------------------------------- 
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.18.2
2 | scipy>=1.4.1
3 | wget>=3.2
4 | seaborn>=0.10.0
5 | tensorflow
6 | opencv-python==4.4.0.46
7 | tqdm==4.43.0
8 | pandas
9 | awscli
10 | urllib3
11 | mss
12 | 
--------------------------------------------------------------------------------
/tools/Convert_to_TRT.py:
--------------------------------------------------------------------------------
1 | #================================================================
2 | #
3 | # File name : Convert_to_TRT.py
4 | # Author : PyLessons
5 | # Created date: 2020-08-17
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : convert a TF SavedModel to a TensorRT model
9 | #
10 | #================================================================
11 | import os
12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
13 | import sys
14 | 
15 | foldername = os.path.basename(os.getcwd())
16 | if foldername == "tools":
17 |     os.chdir("..")
18 |     sys.path.insert(1, os.getcwd())
19 | 
20 | import tensorflow as tf
21 | import numpy as np
22 | physical_devices = tf.config.experimental.list_physical_devices('GPU')
23 | if len(physical_devices) > 0:
24 |     tf.config.experimental.set_memory_growth(physical_devices[0], True)
25 | from yolov3.configs import *
26 | from tensorflow.python.compiler.tensorrt import trt_convert as trt
27 | 
28 | def calibration_input():
29 |     for _ in range(100):
30 |         batched_input = np.random.random((1, YOLO_INPUT_SIZE, YOLO_INPUT_SIZE, 3)).astype(np.float32)
31 |         batched_input = tf.constant(batched_input)
32 |         yield (batched_input,)
33 | 
34 | conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS
35 | conversion_params = conversion_params._replace(max_workspace_size_bytes=4000000000)
36 | conversion_params = conversion_params._replace(precision_mode=YOLO_TRT_QUANTIZE_MODE)
37 | conversion_params = conversion_params._replace(max_batch_size=1)
38 | if YOLO_TRT_QUANTIZE_MODE == 'INT8':
39 |     conversion_params = conversion_params._replace(use_calibration=True)
40 | 
41 | converter = trt.TrtGraphConverterV2(input_saved_model_dir=f'./checkpoints/{YOLO_TYPE}-{YOLO_INPUT_SIZE}', conversion_params=conversion_params)
42 | if YOLO_TRT_QUANTIZE_MODE == 'INT8':
43 |     converter.convert(calibration_input_fn=calibration_input)
44 | else:
45 |     converter.convert()
46 | 
47 | converter.save(output_saved_model_dir=f'./checkpoints/{YOLO_TYPE}-trt-{YOLO_TRT_QUANTIZE_MODE}-{YOLO_INPUT_SIZE}')
48 | print(f'Done Converting to TensorRT, model saved to: /checkpoints/{YOLO_TYPE}-trt-{YOLO_TRT_QUANTIZE_MODE}-{YOLO_INPUT_SIZE}')
49 | 
--------------------------------------------------------------------------------
/tools/Convert_to_pb.py:
--------------------------------------------------------------------------------
1 | #================================================================
2 | #
3 | # File name : Convert_to_pb.py
4 | # Author : PyLessons
5 | # Created date: 2020-08-17
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : used to freeze tf model to .pb model
9 | #
10 | #================================================================
11 | import os
12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
13 | import sys
14 | 
15 | foldername = os.path.basename(os.getcwd())
16 | if foldername == "tools":
17 |     os.chdir("..")
18 |     sys.path.insert(1, os.getcwd())
19 | 
20 | import tensorflow as tf
21 | from yolov3.yolov4 import Create_Yolo
22 | from yolov3.utils import load_yolo_weights
23 | from yolov3.configs import *
24 | 
25 | if YOLO_TYPE == "yolov4":
26 |     Darknet_weights = YOLO_V4_TINY_WEIGHTS if TRAIN_YOLO_TINY else YOLO_V4_WEIGHTS
27 | if YOLO_TYPE == "yolov3":
28 |     Darknet_weights = YOLO_V3_TINY_WEIGHTS if TRAIN_YOLO_TINY else YOLO_V3_WEIGHTS
29 | 
30 | if YOLO_CUSTOM_WEIGHTS == False:
31 |     yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=YOLO_COCO_CLASSES)
32 |     load_yolo_weights(yolo, Darknet_weights) # use Darknet weights
33 | else:
34 |     yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=TRAIN_CLASSES)
35 |     yolo.load_weights(YOLO_CUSTOM_WEIGHTS) # use custom weights
36 | 
37 | yolo.summary()
38 | yolo.save(f'./checkpoints/{YOLO_TYPE}-{YOLO_INPUT_SIZE}')
39 | 
40 | print(f"model saved to /checkpoints/{YOLO_TYPE}-{YOLO_INPUT_SIZE}")
41 | 
--------------------------------------------------------------------------------
/tools/Detection_to_XML.py:
--------------------------------------------------------------------------------
1 | #================================================================
2 | #
3 | # File name : Detection_to_XML.py
4 | # Author : PyLessons
5 | # Created date: 2020-09-27
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : converts YOLO detection to XML file
9 | #
10 | #================================================================
11 | from textwrap import dedent
12 | from lxml import etree
13 | import glob
14 | import os
15 | import cv2
16 | import time
17 | 
18 | def CreateXMLfile(path, file_name, image, bboxes, NUM_CLASS):
19 |     boxes = []
20 |     for bbox in bboxes:
21 |         boxes.append([bbox[0].astype(int), bbox[1].astype(int), bbox[2].astype(int), bbox[3].astype(int), NUM_CLASS[int(bbox[5])]]) #, bbox[4], NUM_CLASS[int(bbox[5])]])
22 | 
23 |     if not os.path.exists(path):
24 |         os.makedirs(path)
25 |     os.chdir(path)
26 | 
27 |     img_name = "XML_"+file_name+".png"
28 | 
29 |     cv2.imwrite(img_name, image)
30 | 
31 |     annotation = etree.Element("annotation")
32 | 
33 |     folder = etree.Element("folder")
34 |     folder.text = os.path.basename(os.getcwd())
35 |     annotation.append(folder)
36 | 
37 |     filename_xml = etree.Element("filename")
38 |     filename_str = img_name.split(".")[0]
39 |     filename_xml.text = img_name
40 |     annotation.append(filename_xml)
41 | 
42 |     path = etree.Element("path")
43 |     path.text = os.path.join(os.getcwd(), filename_str + ".png") # match the .png image written above
44 |     annotation.append(path)
45 | 
46 |     source = etree.Element("source")
47 |     annotation.append(source)
48 | 
49 |     database = etree.Element("database")
50 |     database.text = "Unknown"
51 |     source.append(database)
52 | 
53 |     size = etree.Element("size")
54 |     annotation.append(size)
55 | 
56 |     width = etree.Element("width")
57 |     height = etree.Element("height")
58 |     depth = etree.Element("depth")
59 | 
60 |     img = cv2.imread(filename_xml.text)
61 | 
62 |     width.text = str(img.shape[1])
63 |     height.text = str(img.shape[0])
64 |     depth.text = str(img.shape[2])
65 | 
66 |     size.append(width)
67 |     size.append(height)
68 |     size.append(depth)
69 | 
70 |     segmented = etree.Element("segmented")
71 |     segmented.text = "0"
72 |     annotation.append(segmented)
73 | 
74 |     for Object in boxes:
75 |         class_name = Object[4]
76 |         xmin_l = str(int(float(Object[0])))
77 |         ymin_l = str(int(float(Object[1])))
78 |         xmax_l = str(int(float(Object[2])))
79 |         ymax_l = str(int(float(Object[3])))
80 | 
81 |         obj = etree.Element("object")
82 |         annotation.append(obj)
83 | 
84 |         name = etree.Element("name")
85 |         name.text = class_name
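# class label resolved from the detection's class index when the boxes list was built above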
86 |         obj.append(name)
87 | 
88 |         pose = etree.Element("pose")
89 |         pose.text = "Unspecified"
90 |         obj.append(pose)
91 | 
92 |         truncated = etree.Element("truncated")
93 |         truncated.text = "0"
94 |         obj.append(truncated)
95 | 
96 |         difficult = etree.Element("difficult")
97 |         difficult.text = "0"
98 |         obj.append(difficult)
99 | 
100 |         bndbox = etree.Element("bndbox")
101 |         obj.append(bndbox)
102 | 
103 |         xmin = etree.Element("xmin")
104 |         xmin.text = xmin_l
105 |         bndbox.append(xmin)
106 | 
107 |         ymin = etree.Element("ymin")
108 |         ymin.text = ymin_l
109 |         bndbox.append(ymin)
110 | 
111 |         xmax = etree.Element("xmax")
112 |         xmax.text = xmax_l
113 |         bndbox.append(xmax)
114 | 
115 |         ymax = etree.Element("ymax")
116 |         ymax.text = ymax_l
117 |         bndbox.append(ymax)
118 | 
119 |     # write xml to file
120 |     s = etree.tostring(annotation, pretty_print=True)
121 |     with open(filename_str + ".xml", 'wb') as f:
122 |         f.write(s)
123 |     # no explicit close needed; the with-statement closes the file
124 | 
125 |     os.chdir("..")
126 | 
--------------------------------------------------------------------------------
/tools/XML_to_YOLOv3.py:
--------------------------------------------------------------------------------
1 | #================================================================
2 | #
3 | # File name : XML_to_YOLOv3.py
4 | # Author : PyLessons
5 | # Created date: 2020-06-04
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : used to convert XML labels to YOLOv3 training labels
9 | #
10 | #================================================================
11 | import xml.etree.ElementTree as ET
12 | import os
13 | import glob
14 | 
15 | foldername = os.path.basename(os.getcwd())
16 | if foldername == "tools": os.chdir("..")
17 | 
18 | 
19 | data_dir = '/custom_dataset/'
20 | Dataset_names_path = "model_data/license_plate_names.txt"
21 | Dataset_train = "model_data/license_plate_train.txt"
22 | Dataset_test = "model_data/license_plate_test.txt"
23 | is_subfolder = False
24 | 
25 | Dataset_names = []
26 | 
27 | def ParseXML(img_folder, file):
28 |     for xml_file in glob.glob(img_folder+'/*.xml'):
29 |         tree = ET.parse(xml_file) # ET.parse accepts a path directly, so no file handle is leaked
30 |         root = tree.getroot()
31 |         image_name = root.find('filename').text
32 |         img_path = img_folder+'/'+image_name
33 |         for i, obj in enumerate(root.iter('object')):
34 |             difficult = obj.find('difficult').text
35 |             cls = obj.find('name').text
36 |             if cls not in Dataset_names:
37 |                 Dataset_names.append(cls)
38 |             cls_id = Dataset_names.index(cls)
39 |             xmlbox = obj.find('bndbox')
40 |             OBJECT = (str(int(float(xmlbox.find('xmin').text)))+','
41 |                       +str(int(float(xmlbox.find('ymin').text)))+','
42 |                       +str(int(float(xmlbox.find('xmax').text)))+','
43 |                       +str(int(float(xmlbox.find('ymax').text)))+','
44 |                       +str(cls_id))
45 |             img_path += ' '+OBJECT
46 |         print(img_path)
47 |         file.write(img_path+'\n')
48 | 
49 | def run_XML_to_YOLOv3():
50 |     for i, folder in enumerate(['train','test']):
51 |         with open([Dataset_train,Dataset_test][i], "w") as file:
52 |             print(os.getcwd()+data_dir+folder)
53 |             img_path = os.path.join(os.getcwd()+data_dir+folder)
54 |             if is_subfolder:
55 |                 for directory in os.listdir(img_path):
56 |                     xml_path = os.path.join(img_path, directory)
57 |                     ParseXML(xml_path, file)
58 |             else:
59 |                 ParseXML(img_path, file)
60 | 
61 |     print("Dataset_names:", Dataset_names)
62 |     with open(Dataset_names_path, "w") as file:
63 |         for name in Dataset_names:
64 |             file.write(str(name)+'\n')
65 | 
66 | run_XML_to_YOLOv3()
67 |
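# Each line of the generated train/test files pairs an image path with
68 | # "xmin,ymin,xmax,ymax,class_id" boxes, e.g. (hypothetical values):
69 | # /custom_dataset/train/plate_1.jpg 48,240,195,371,0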
--------------------------------------------------------------------------------
/tools/oid_to_pascal_voc_xml.py:
--------------------------------------------------------------------------------
1 | #================================================================
2 | #
3 | # File name : oid_to_pascal_voc_xml.py
4 | # Author : PyLessons
5 | # Created date: 2020-06-04
6 | # Website : https://pylessons.com/
7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
8 | # Description : used to convert OID labels to PASCAL VOC XML
9 | #
10 | #================================================================
11 | import os
12 | from tqdm import tqdm
13 | from sys import exit
14 | import argparse
15 | import cv2
16 | from textwrap import dedent
17 | from lxml import etree
18 | 
19 | foldername = os.path.basename(os.getcwd())
20 | if foldername == "tools": os.chdir("..")
21 | 
22 | Dataset_path = "OIDv4_ToolKit/OID/Dataset"
23 | 
24 | def convert_to_xml():
25 |     current_path = os.getcwd()
26 |     os.chdir(Dataset_path)
27 |     DIRS = os.listdir(os.getcwd())
28 | 
29 |     for DIR in DIRS:
30 |         if os.path.isdir(DIR):
31 |             os.chdir(DIR)
32 | 
33 |             print("Currently in Subdirectory:", DIR)
34 |             CLASS_DIRS = os.listdir(os.getcwd())
35 |             for CLASS_DIR in CLASS_DIRS:
36 |                 if " " in CLASS_DIR:
37 |                     os.rename(CLASS_DIR, CLASS_DIR.replace(" ", "_"))
38 | 
39 |             CLASS_DIRS = os.listdir(os.getcwd())
40 |             for CLASS_DIR in CLASS_DIRS:
41 |                 if os.path.isdir(CLASS_DIR):
42 |                     os.chdir(CLASS_DIR)
43 | 
44 |                     print("\n" + "Creating PASCAL VOC XML Files for Class:", CLASS_DIR)
45 |                     # Create Directory for annotations if it does not exist yet
46 | 
47 |                     # Read Labels from OIDv4 ToolKit
48 |                     os.chdir("Label")
49 | 
50 |                     # Create PASCAL XML
51 |                     for filename in tqdm(os.listdir(os.getcwd())):
52 |                         if filename.endswith(".txt"):
53 |                             filename_str = str.split(filename, ".")[0]
54 | 
55 | 
56 |                             annotation = etree.Element("annotation")
57 | 
58 |                             os.chdir("..")
59 |                             folder = etree.Element("folder")
60 |                             folder.text = os.path.basename(os.getcwd())
61 |                             annotation.append(folder)
62 | 
63 |                             filename_xml = etree.Element("filename")
64 |                             filename_xml.text = filename_str + ".jpg"
65 |                             annotation.append(filename_xml)
66 | 
67 |                             path = etree.Element("path")
68 |                             path.text = os.path.join(os.path.dirname(os.path.abspath(filename)), filename_str + ".jpg")
69 |                             annotation.append(path)
70 | 
71 |                             source = etree.Element("source")
72 |                             annotation.append(source)
73 | 
74 |                             database = etree.Element("database")
75 |                             database.text = "Unknown"
76 |                             source.append(database)
77 | 
78 |                             size = etree.Element("size")
79 |                             annotation.append(size)
80 | 
81 |                             width = etree.Element("width")
82 |                             height = etree.Element("height")
83 |                             depth = etree.Element("depth")
84 | 
85 |                             img = cv2.imread(filename_xml.text)
86 | 
87 |                             try:
88 |                                 width.text = str(img.shape[1])
89 |                             except AttributeError:
90 |                                 os.chdir("Label")
91 |                                 continue
92 |                             height.text = str(img.shape[0])
93 |                             depth.text = str(img.shape[2])
94 | 
95 |                             size.append(width)
96 |                             size.append(height)
97 |                             size.append(depth)
98 | 
99 |                             segmented = etree.Element("segmented")
100 |                             segmented.text = "0"
101 |                             annotation.append(segmented)
102 | 
103 |                             os.chdir("Label")
104 |                             label_original = open(filename, 'r')
105 | 
106 |                             # Labels from OIDv4 Toolkit: name_of_class X_min Y_min X_max Y_max
107 |                             for line in label_original:
108 |                                 line = line.strip()
109 |                                 l = line.split(' ')
110 | 
111 |                                 class_name_len = len(l) - 4 # 4 coordinates
112 |                                 class_name = l[0]
113 |                                 for i in range(1,class_name_len):
114 |                                     class_name = f"{class_name}_{l[i]}"
115 |
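# multi-token OID class names were joined with underscores above; the last four tokens of the line are the box coordinates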
116 | addi = class_name_len 117 | 118 | xmin_l = str(int(round(float(l[0+addi])))) 119 | ymin_l = str(int(round(float(l[1+addi])))) 120 | xmax_l = str(int(round(float(l[2+addi])))) 121 | ymax_l = str(int(round(float(l[3+addi])))) 122 | 123 | obj = etree.Element("object") 124 | annotation.append(obj) 125 | 126 | name = etree.Element("name") 127 | name.text = class_name 128 | obj.append(name) 129 | 130 | pose = etree.Element("pose") 131 | pose.text = "Unspecified" 132 | obj.append(pose) 133 | 134 | truncated = etree.Element("truncated") 135 | truncated.text = "0" 136 | obj.append(truncated) 137 | 138 | difficult = etree.Element("difficult") 139 | difficult.text = "0" 140 | obj.append(difficult) 141 | 142 | bndbox = etree.Element("bndbox") 143 | obj.append(bndbox) 144 | 145 | xmin = etree.Element("xmin") 146 | xmin.text = xmin_l 147 | bndbox.append(xmin) 148 | 149 | ymin = etree.Element("ymin") 150 | ymin.text = ymin_l 151 | bndbox.append(ymin) 152 | 153 | xmax = etree.Element("xmax") 154 | xmax.text = xmax_l 155 | bndbox.append(xmax) 156 | 157 | ymax = etree.Element("ymax") 158 | ymax.text = ymax_l 159 | bndbox.append(ymax) 160 | 161 | os.chdir("..") 162 | # write xml to file 163 | s = etree.tostring(annotation, pretty_print=True) 164 | with open(filename_str + ".xml", 'wb') as f: 165 | f.write(s) 166 | f.close() 167 | 168 | os.chdir("Label") 169 | 170 | os.chdir("..") 171 | os.chdir("..") 172 | 173 | os.chdir("..") 174 | 175 | convert_to_xml() 176 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : train.py 4 | # Author : PyLessons 5 | # Created date: 2020-08-06 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : used to train custom object detector 9 | # 10 | #================================================================ 11 | import os 12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 13 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 14 | from tensorflow.python.client import device_lib 15 | print(device_lib.list_local_devices()) 16 | import shutil 17 | import numpy as np 18 | import tensorflow as tf 19 | #from tensorflow.keras.utils import plot_model 20 | from yolov3.dataset import Dataset 21 | from yolov3.yolov4 import Create_Yolo, compute_loss 22 | from yolov3.utils import load_yolo_weights 23 | from yolov3.configs import * 24 | from evaluate_mAP import get_mAP 25 | 26 | if YOLO_TYPE == "yolov4": 27 | Darknet_weights = YOLO_V4_TINY_WEIGHTS if TRAIN_YOLO_TINY else YOLO_V4_WEIGHTS 28 | if YOLO_TYPE == "yolov3": 29 | Darknet_weights = YOLO_V3_TINY_WEIGHTS if TRAIN_YOLO_TINY else YOLO_V3_WEIGHTS 30 | if TRAIN_YOLO_TINY: TRAIN_MODEL_NAME += "_Tiny" 31 | 32 | def main(): 33 | global TRAIN_FROM_CHECKPOINT 34 | 35 | gpus = tf.config.experimental.list_physical_devices('GPU') 36 | print(f'GPUs {gpus}') 37 | if len(gpus) > 0: 38 | try: tf.config.experimental.set_memory_growth(gpus[0], True) 39 | except RuntimeError: pass 40 | 41 | if os.path.exists(TRAIN_LOGDIR): shutil.rmtree(TRAIN_LOGDIR) 42 | writer = tf.summary.create_file_writer(TRAIN_LOGDIR) 43 | 44 | trainset = Dataset('train') 45 | testset = Dataset('test') 46 | 47 | steps_per_epoch = len(trainset) 48 | global_steps = tf.Variable(1, trainable=False, dtype=tf.int64) 49 | warmup_steps = TRAIN_WARMUP_EPOCHS * steps_per_epoch 50 | total_steps = 
TRAIN_EPOCHS * steps_per_epoch
51 | 
52 |     if TRAIN_TRANSFER:
53 |         Darknet = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=YOLO_COCO_CLASSES)
54 |         load_yolo_weights(Darknet, Darknet_weights) # use darknet weights
55 | 
56 |     yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, training=True, CLASSES=TRAIN_CLASSES)
57 |     if TRAIN_FROM_CHECKPOINT:
58 |         try:
59 |             yolo.load_weights(f"./checkpoints/{TRAIN_MODEL_NAME}")
60 |         except ValueError:
61 |             print("Shapes are incompatible, transferring Darknet weights")
62 |             TRAIN_FROM_CHECKPOINT = False
63 | 
64 |     if TRAIN_TRANSFER and not TRAIN_FROM_CHECKPOINT:
65 |         for i, l in enumerate(Darknet.layers):
66 |             layer_weights = l.get_weights()
67 |             if layer_weights != []:
68 |                 try:
69 |                     yolo.layers[i].set_weights(layer_weights)
70 |                 except ValueError: # incompatible shapes, e.g. the class-dependent head layers
71 |                     print("skipping", yolo.layers[i].name)
72 | 
73 |     optimizer = tf.keras.optimizers.Adam()
74 | 
75 | 
76 |     def train_step(image_data, target):
77 |         with tf.GradientTape() as tape:
78 |             pred_result = yolo(image_data, training=True)
79 |             giou_loss=conf_loss=prob_loss=0
80 | 
81 |             # optimizing process
82 |             grid = 3 if not TRAIN_YOLO_TINY else 2
83 |             for i in range(grid):
84 |                 conv, pred = pred_result[i*2], pred_result[i*2+1]
85 |                 loss_items = compute_loss(pred, conv, *target[i], i, CLASSES=TRAIN_CLASSES)
86 |                 giou_loss += loss_items[0]
87 |                 conf_loss += loss_items[1]
88 |                 prob_loss += loss_items[2]
89 | 
90 |             total_loss = giou_loss + conf_loss + prob_loss
91 | 
92 |             gradients = tape.gradient(total_loss, yolo.trainable_variables)
93 |             optimizer.apply_gradients(zip(gradients, yolo.trainable_variables))
94 | 
95 |             # update learning rate
96 |             # about warmup: https://arxiv.org/pdf/1812.01187.pdf
97 |             global_steps.assign_add(1)
98 |             if global_steps < warmup_steps: # and not TRAIN_TRANSFER:
99 |                 lr = global_steps / warmup_steps * TRAIN_LR_INIT
100 |             else:
101 |                 lr = TRAIN_LR_END + 0.5 * (TRAIN_LR_INIT - TRAIN_LR_END)*(
102 |                     (1 + tf.cos((global_steps - warmup_steps) / (total_steps - warmup_steps) * np.pi)))
103 |             optimizer.lr.assign(lr.numpy())
104 | 
105 |             # writing summary data
106 |             with writer.as_default():
107 |                 tf.summary.scalar("lr", optimizer.lr, step=global_steps)
108 |                 tf.summary.scalar("loss/total_loss", total_loss, step=global_steps)
109 |                 tf.summary.scalar("loss/giou_loss", giou_loss, step=global_steps)
110 |                 tf.summary.scalar("loss/conf_loss", conf_loss, step=global_steps)
111 |                 tf.summary.scalar("loss/prob_loss", prob_loss, step=global_steps)
112 |             writer.flush()
113 | 
114 |         return global_steps.numpy(), optimizer.lr.numpy(), giou_loss.numpy(), conf_loss.numpy(), prob_loss.numpy(), total_loss.numpy()
115 | 
116 |     validate_writer = tf.summary.create_file_writer(TRAIN_LOGDIR)
117 |     def validate_step(image_data, target):
118 |         # no gradient tape needed here; validation only computes the losses
119 |         pred_result = yolo(image_data, training=False)
120 |         giou_loss=conf_loss=prob_loss=0
121 | 
122 |         # loss computation (mirrors train_step, without the optimizer update)
123 |         grid = 3 if not TRAIN_YOLO_TINY else 2
124 |         for i in range(grid):
125 |             conv, pred = pred_result[i*2], pred_result[i*2+1]
126 |             loss_items = compute_loss(pred, conv, *target[i], i, CLASSES=TRAIN_CLASSES)
127 |             giou_loss += loss_items[0]
128 |             conf_loss += loss_items[1]
129 |             prob_loss += loss_items[2]
130 | 
131 |         total_loss = giou_loss + conf_loss + prob_loss
132 | 
133 |         return giou_loss.numpy(), conf_loss.numpy(), prob_loss.numpy(), total_loss.numpy()
134 | 
135 |     mAP_model = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=TRAIN_CLASSES) # create second model to measure mAP
136 | 
137 |     best_val_loss = 1000 # should be large at the start
be large at start 138 | for epoch in range(TRAIN_EPOCHS): 139 | for image_data, target in trainset: 140 | results = train_step(image_data, target) 141 | cur_step = results[0]%steps_per_epoch 142 | print("epoch:{:2.0f} step:{:5.0f}/{}, lr:{:.6f}, giou_loss:{:7.2f}, conf_loss:{:7.2f}, prob_loss:{:7.2f}, total_loss:{:7.2f}" 143 | .format(epoch, cur_step, steps_per_epoch, results[1], results[2], results[3], results[4], results[5])) 144 | 145 | if len(testset) == 0: 146 | print("configure TEST options to validate model") 147 | yolo.save_weights(os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME)) 148 | continue 149 | 150 | count, giou_val, conf_val, prob_val, total_val = 0., 0, 0, 0, 0 151 | for image_data, target in testset: 152 | results = validate_step(image_data, target) 153 | count += 1 154 | giou_val += results[0] 155 | conf_val += results[1] 156 | prob_val += results[2] 157 | total_val += results[3] 158 | # writing validate summary data 159 | with validate_writer.as_default(): 160 | tf.summary.scalar("validate_loss/total_val", total_val/count, step=epoch) 161 | tf.summary.scalar("validate_loss/giou_val", giou_val/count, step=epoch) 162 | tf.summary.scalar("validate_loss/conf_val", conf_val/count, step=epoch) 163 | tf.summary.scalar("validate_loss/prob_val", prob_val/count, step=epoch) 164 | validate_writer.flush() 165 | 166 | print("\n\ngiou_val_loss:{:7.2f}, conf_val_loss:{:7.2f}, prob_val_loss:{:7.2f}, total_val_loss:{:7.2f}\n\n". 167 | format(giou_val/count, conf_val/count, prob_val/count, total_val/count)) 168 | 169 | if TRAIN_SAVE_CHECKPOINT and not TRAIN_SAVE_BEST_ONLY: 170 | save_directory = os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME+"_val_loss_{:7.2f}".format(total_val/count)) 171 | yolo.save_weights(save_directory) 172 | if TRAIN_SAVE_BEST_ONLY and best_val_loss>total_val/count: 173 | save_directory = os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME) 174 | yolo.save_weights(save_directory) 175 | best_val_loss = total_val/count 176 | if not TRAIN_SAVE_BEST_ONLY and not TRAIN_SAVE_CHECKPOINT: 177 | save_directory = os.path.join(TRAIN_CHECKPOINTS_FOLDER, TRAIN_MODEL_NAME) 178 | yolo.save_weights(save_directory) 179 | 180 | # measure mAP of trained custom model 181 | try: 182 | mAP_model.load_weights(save_directory) # use keras weights 183 | get_mAP(mAP_model, testset, score_threshold=TEST_SCORE_THRESHOLD, iou_threshold=TEST_IOU_THRESHOLD) 184 | except UnboundLocalError: 185 | print("You don't have saved model weights to measure mAP, check TRAIN_SAVE_BEST_ONLY and TRAIN_SAVE_CHECKPOINT lines in configs.py") 186 | 187 | if __name__ == '__main__': 188 | main() 189 | -------------------------------------------------------------------------------- /yolov3/__ init __.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /yolov3/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/yolov3/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /yolov3/__pycache__/dataset.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/yolov3/__pycache__/dataset.cpython-36.pyc -------------------------------------------------------------------------------- /yolov3/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/yolov3/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /yolov3/__pycache__/yolov3.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/TensorFlow-2.x-YOLOv3/9f29d73ee24cd5db4ead280f95ff06f66d538fc2/yolov3/__pycache__/yolov3.cpython-36.pyc -------------------------------------------------------------------------------- /yolov3/configs.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : configs.py 4 | # Author : PyLessons 5 | # Created date: 2020-08-18 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : yolov3 configuration file 9 | # 10 | #================================================================ 11 | 12 | # YOLO options 13 | YOLO_TYPE = "yolov3" # yolov4 or yolov3 14 | YOLO_FRAMEWORK = "tf" # "tf" or "trt" 15 | YOLO_V3_WEIGHTS = "model_data/yolov3.weights" 16 | YOLO_V4_WEIGHTS = "model_data/yolov4.weights" 17 | YOLO_V3_TINY_WEIGHTS = "model_data/yolov3-tiny.weights" 18 | YOLO_V4_TINY_WEIGHTS = "model_data/yolov4-tiny.weights" 19 | YOLO_TRT_QUANTIZE_MODE = "INT8" # INT8, FP16, FP32 20 | YOLO_CUSTOM_WEIGHTS = False # "checkpoints/yolov3_custom" # used in evaluate_mAP.py and custom model detection; leave False if not used 21 | # YOLO_CUSTOM_WEIGHTS is also used with TensorRT and custom model detection 22 | YOLO_COCO_CLASSES = "model_data/coco/coco.names" 23 | YOLO_STRIDES = [8, 16, 32] 24 | YOLO_IOU_LOSS_THRESH = 0.5 25 | YOLO_ANCHOR_PER_SCALE = 3 26 | YOLO_MAX_BBOX_PER_SCALE = 100 27 | YOLO_INPUT_SIZE = 416 28 | if YOLO_TYPE == "yolov4": 29 | YOLO_ANCHORS = [[[12, 16], [19, 36], [40, 28]], 30 | [[36, 75], [76, 55], [72, 146]], 31 | [[142,110], [192, 243], [459, 401]]] 32 | if YOLO_TYPE == "yolov3": 33 | YOLO_ANCHORS = [[[10, 13], [16, 30], [33, 23]], 34 | [[30, 61], [62, 45], [59, 119]], 35 | [[116, 90], [156, 198], [373, 326]]] 36 | # Train options 37 | TRAIN_YOLO_TINY = False 38 | TRAIN_SAVE_BEST_ONLY = True # saves only the best model according to validation loss (True recommended) 39 | TRAIN_SAVE_CHECKPOINT = False # saves every best-validated checkpoint during training (may require a lot of disk space) (False recommended) 40 | TRAIN_CLASSES = "mnist/mnist.names" 41 | TRAIN_ANNOT_PATH = "mnist/mnist_train.txt" 42 | TRAIN_LOGDIR = "log" 43 | TRAIN_CHECKPOINTS_FOLDER = "checkpoints" 44 | TRAIN_MODEL_NAME = f"{YOLO_TYPE}_custom" 45 | TRAIN_LOAD_IMAGES_TO_RAM = True # True gives faster training but needs more RAM 46 | TRAIN_BATCH_SIZE = 4 47 | TRAIN_INPUT_SIZE = 416 48 | TRAIN_DATA_AUG = True 49 | TRAIN_TRANSFER = True 50 | TRAIN_FROM_CHECKPOINT = False # "checkpoints/yolov3_custom" 51 | TRAIN_LR_INIT = 1e-4 52 | TRAIN_LR_END = 1e-6 53 | TRAIN_WARMUP_EPOCHS = 2 54 | TRAIN_EPOCHS = 100 55 | 56 | # TEST options 57 | TEST_ANNOT_PATH = "mnist/mnist_test.txt" 58 | TEST_BATCH_SIZE = 4
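# The TEST_* values mirror their TRAIN_* counterparts; Dataset('test') in
# yolov3/dataset.py picks them up for the validation split.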
59 | TEST_INPUT_SIZE = 416 60 | TEST_DATA_AUG = False 61 | TEST_DECTECTED_IMAGE_PATH = "" 62 | TEST_SCORE_THRESHOLD = 0.3 63 | TEST_IOU_THRESHOLD = 0.45 64 | 65 | if TRAIN_YOLO_TINY: 66 | YOLO_STRIDES = [16, 32] 67 | # YOLO_ANCHORS = [[[23, 27], [37, 58], [81, 82]], # this line can be uncommented for default coco weights 68 | YOLO_ANCHORS = [[[10, 14], [23, 27], [37, 58]], 69 | [[81, 82], [135, 169], [344, 319]]] 70 | -------------------------------------------------------------------------------- /yolov3/dataset.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : dataset.py 4 | # Author : PyLessons 5 | # Created date: 2020-07-31 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : functions used to prepare dataset for custom training 9 | # 10 | #================================================================ 11 | # TODO: transfer numpy to tensorflow operations 12 | import os 13 | import cv2 14 | import random 15 | import numpy as np 16 | import tensorflow as tf 17 | from yolov3.utils import read_class_names, image_preprocess 18 | from yolov3.yolov3 import bbox_iou 19 | from yolov3.configs import * 20 | 21 | 22 | class Dataset(object): 23 | # Dataset preprocess implementation 24 | def __init__(self, dataset_type, TEST_INPUT_SIZE=TEST_INPUT_SIZE): 25 | self.annot_path = TRAIN_ANNOT_PATH if dataset_type == 'train' else TEST_ANNOT_PATH 26 | self.input_sizes = TRAIN_INPUT_SIZE if dataset_type == 'train' else TEST_INPUT_SIZE 27 | self.batch_size = TRAIN_BATCH_SIZE if dataset_type == 'train' else TEST_BATCH_SIZE 28 | self.data_aug = TRAIN_DATA_AUG if dataset_type == 'train' else TEST_DATA_AUG 29 | 30 | self.train_yolo_tiny = TRAIN_YOLO_TINY 31 | self.train_input_sizes = TRAIN_INPUT_SIZE 32 | self.strides = np.array(YOLO_STRIDES) 33 | self.classes = read_class_names(TRAIN_CLASSES) 34 | self.num_classes = len(self.classes) 35 | self.anchors = (np.array(YOLO_ANCHORS).T/self.strides).T 36 | self.anchor_per_scale = YOLO_ANCHOR_PER_SCALE 37 | self.max_bbox_per_scale = YOLO_MAX_BBOX_PER_SCALE 38 | 39 | self.annotations = self.load_annotations(dataset_type) 40 | self.num_samples = len(self.annotations) 41 | self.num_batchs = int(np.ceil(self.num_samples / self.batch_size)) 42 | self.batch_count = 0 43 | 44 | 45 | def load_annotations(self, dataset_type): 46 | final_annotations = [] 47 | with open(self.annot_path, 'r') as f: 48 | txt = f.read().splitlines() 49 | annotations = [line.strip() for line in txt if len(line.strip().split()[1:]) != 0] 50 | np.random.shuffle(annotations) 51 | 52 | # for annotation in annotations: 53 | # image_extension = '.jpg' 54 | # extension_index = annotation.find(image_extension) 55 | # image_path = annotation[:extension_index+len(image_extension)] 56 | # line = annotation[extension_index+len(image_extension):].split() 57 | # if not os.path.exists(image_path): 58 | # raise KeyError("%s does not exist ... 
" %image_path) 59 | # if TRAIN_LOAD_IMAGES_TO_RAM: 60 | # image = cv2.imread(image_path) 61 | # else: 62 | # image = '' 63 | # final_annotations.append([image_path, line, image]) 64 | # return final_annotations 65 | for annotation in annotations: 66 | # fully parse annotations 67 | line = annotation.split() 68 | image_path, index = "", 1 69 | for i, one_line in enumerate(line): 70 | if not one_line.replace(",","").isnumeric(): 71 | if image_path != "": image_path += " " 72 | image_path += one_line 73 | else: 74 | index = i 75 | break 76 | if not os.path.exists(image_path): 77 | raise KeyError("%s does not exist ... " %image_path) 78 | if TRAIN_LOAD_IMAGES_TO_RAM: 79 | image = cv2.imread(image_path) 80 | else: 81 | image = '' 82 | final_annotations.append([image_path, line[index:], image]) 83 | return final_annotations 84 | 85 | def __iter__(self): 86 | return self 87 | 88 | def Delete_bad_annotation(self, bad_annotation): 89 | print(f'Deleting {bad_annotation} annotation line') 90 | bad_image_path = bad_annotation[0] 91 | bad_image_name = bad_annotation[0].split('/')[-1] # can be used to delete bad image 92 | bad_xml_path = bad_annotation[0][:-3]+'xml' # can be used to delete bad xml file 93 | 94 | # remove bad annotation line from annotation file 95 | with open(self.annot_path, "r+") as f: 96 | d = f.readlines() 97 | f.seek(0) 98 | for i in d: 99 | if bad_image_name not in i: 100 | f.write(i) 101 | f.truncate() 102 | 103 | def __next__(self): 104 | with tf.device('/cpu:0'): 105 | self.train_input_size = random.choice([self.train_input_sizes]) 106 | self.train_output_sizes = self.train_input_size // self.strides 107 | 108 | batch_image = np.zeros((self.batch_size, self.train_input_size, self.train_input_size, 3), dtype=np.float32) 109 | 110 | if self.train_yolo_tiny: 111 | batch_label_mbbox = np.zeros((self.batch_size, self.train_output_sizes[0], self.train_output_sizes[0], self.anchor_per_scale, 5 + self.num_classes), dtype=np.float32) 112 | batch_label_lbbox = np.zeros((self.batch_size, self.train_output_sizes[1], self.train_output_sizes[1], self.anchor_per_scale, 5 + self.num_classes), dtype=np.float32) 113 | else: 114 | batch_label_sbbox = np.zeros((self.batch_size, self.train_output_sizes[0], self.train_output_sizes[0], self.anchor_per_scale, 5 + self.num_classes), dtype=np.float32) 115 | batch_label_mbbox = np.zeros((self.batch_size, self.train_output_sizes[1], self.train_output_sizes[1], self.anchor_per_scale, 5 + self.num_classes), dtype=np.float32) 116 | batch_label_lbbox = np.zeros((self.batch_size, self.train_output_sizes[2], self.train_output_sizes[2], self.anchor_per_scale, 5 + self.num_classes), dtype=np.float32) 117 | 118 | batch_sbboxes = np.zeros((self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32) 119 | 120 | batch_mbboxes = np.zeros((self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32) 121 | batch_lbboxes = np.zeros((self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32) 122 | 123 | exceptions = False 124 | num = 0 125 | if self.batch_count < self.num_batchs: 126 | while num < self.batch_size: 127 | index = self.batch_count * self.batch_size + num 128 | if index >= self.num_samples: index -= self.num_samples 129 | annotation = self.annotations[index] 130 | image, bboxes = self.parse_annotation(annotation) 131 | try: 132 | if self.train_yolo_tiny: 133 | label_mbbox, label_lbbox, mbboxes, lbboxes = self.preprocess_true_boxes(bboxes) 134 | else: 135 | label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = 
self.preprocess_true_boxes(bboxes) 136 | except IndexError: 137 | exceptions = True 138 | self.Delete_bad_annotation(annotation) 139 | print("IndexError, something is wrong with", annotation[0], "- removed this line from the annotation file") 140 | num += 1; continue # skip the bad sample (its labels were never set); its batch slot stays zero-filled and training is restarted below 141 | batch_image[num, :, :, :] = image 142 | batch_label_mbbox[num, :, :, :, :] = label_mbbox 143 | batch_label_lbbox[num, :, :, :, :] = label_lbbox 144 | batch_mbboxes[num, :, :] = mbboxes 145 | batch_lbboxes[num, :, :] = lbboxes 146 | if not self.train_yolo_tiny: 147 | batch_label_sbbox[num, :, :, :, :] = label_sbbox 148 | batch_sbboxes[num, :, :] = sbboxes 149 | 150 | num += 1 151 | 152 | if exceptions: 153 | print('\n') 154 | raise Exception("There were problems with the dataset; the bad annotation lines were removed - restart the training process.") 155 | self.batch_count += 1 156 | batch_medium_target = batch_label_mbbox, batch_mbboxes 157 | batch_larger_target = batch_label_lbbox, batch_lbboxes 158 | if not self.train_yolo_tiny: # the small-object scale exists only in the full model 159 | batch_smaller_target = batch_label_sbbox, batch_sbboxes 160 | 161 | if self.train_yolo_tiny: 162 | return batch_image, (batch_medium_target, batch_larger_target) 163 | return batch_image, (batch_smaller_target, batch_medium_target, batch_larger_target) 164 | else: 165 | self.batch_count = 0 166 | np.random.shuffle(self.annotations) 167 | raise StopIteration 168 | 169 | def random_horizontal_flip(self, image, bboxes): 170 | if random.random() < 0.5: 171 | _, w, _ = image.shape 172 | image = image[:, ::-1, :] 173 | bboxes[:, [0,2]] = w - bboxes[:, [2,0]] 174 | 175 | return image, bboxes 176 | 177 | def random_crop(self, image, bboxes): 178 | if random.random() < 0.5: 179 | h, w, _ = image.shape 180 | max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1) 181 | 182 | max_l_trans = max_bbox[0] 183 | max_u_trans = max_bbox[1] 184 | max_r_trans = w - max_bbox[2] 185 | max_d_trans = h - max_bbox[3] 186 | 187 | crop_xmin = max(0, int(max_bbox[0] - random.uniform(0, max_l_trans))) 188 | crop_ymin = max(0, int(max_bbox[1] - random.uniform(0, max_u_trans))) 189 | crop_xmax = min(w, int(max_bbox[2] + random.uniform(0, max_r_trans))) # clamp with min, not max, so the right edge can actually be cropped 190 | crop_ymax = min(h, int(max_bbox[3] + random.uniform(0, max_d_trans))) # same for the bottom edge 191 | 192 | image = image[crop_ymin : crop_ymax, crop_xmin : crop_xmax] 193 | 194 | bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin 195 | bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin 196 | 197 | return image, bboxes 198 | 199 | def random_translate(self, image, bboxes): 200 | if random.random() < 0.5: 201 | h, w, _ = image.shape 202 | max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1) 203 | 204 | max_l_trans = max_bbox[0] 205 | max_u_trans = max_bbox[1] 206 | max_r_trans = w - max_bbox[2] 207 | max_d_trans = h - max_bbox[3] 208 | 209 | tx = random.uniform(-(max_l_trans - 1), (max_r_trans - 1)) 210 | ty = random.uniform(-(max_u_trans - 1), (max_d_trans - 1)) 211 | 212 | M = np.array([[1, 0, tx], [0, 1, ty]]) 213 | image = cv2.warpAffine(image, M, (w, h)) 214 | 215 | bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + tx 216 | bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + ty 217 | 218 | return image, bboxes 219 | 220 | def parse_annotation(self, annotation, mAP=False): 221 | if TRAIN_LOAD_IMAGES_TO_RAM: 222 | image_path = annotation[0] 223 | image = annotation[2] 224 | else: 225 | image_path = annotation[0] 226 | image = cv2.imread(image_path) 227 | 228 | bboxes = np.array([list(map(int, box.split(','))) for box in annotation[1]]) 229 | 230 | if self.data_aug: 231 | image, bboxes = self.random_horizontal_flip(np.copy(image), np.copy(bboxes)) 232 | image, bboxes = self.random_crop(np.copy(image), np.copy(bboxes)) 233 | image, bboxes = self.random_translate(np.copy(image), np.copy(bboxes)) 234 | 235 | #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 236 | if mAP: 237 | return image, bboxes 238 | 239 | image, bboxes = image_preprocess(np.copy(image), [self.input_sizes, self.input_sizes], np.copy(bboxes)) 240 | return image, bboxes 241 | 242 | def preprocess_true_boxes(self, bboxes): 243 | OUTPUT_LEVELS = len(self.strides) 244 | 245 | label = [np.zeros((self.train_output_sizes[i], self.train_output_sizes[i], self.anchor_per_scale, 246 | 5 + self.num_classes)) for i in range(OUTPUT_LEVELS)] 247 | bboxes_xywh = [np.zeros((self.max_bbox_per_scale, 4)) for _ in range(OUTPUT_LEVELS)] 248 | bbox_count = np.zeros((OUTPUT_LEVELS,)) 249 | 250 | for bbox in bboxes: 251 | bbox_coor = bbox[:4] 252 | bbox_class_ind = bbox[4] 253 | 254 | onehot = np.zeros(self.num_classes, dtype=np.float32) # np.float is deprecated in NumPy 255 | onehot[bbox_class_ind] = 1.0 256 | uniform_distribution = np.full(self.num_classes, 1.0 / self.num_classes) 257 | delta = 0.01 258 | smooth_onehot = onehot * (1 - delta) + delta * uniform_distribution 259 | 260 | bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5, bbox_coor[2:] - bbox_coor[:2]], axis=-1) 261 | bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / self.strides[:, np.newaxis] 262 | 263 | iou = [] 264 | exist_positive = False 265 | for i in range(OUTPUT_LEVELS): 266 | anchors_xywh = np.zeros((self.anchor_per_scale, 4)) 267 | anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5 268 | anchors_xywh[:, 2:4] = self.anchors[i] 269 | 270 | iou_scale = bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh) 271 | iou.append(iou_scale) 272 | iou_mask = iou_scale > 0.3 273 | 274 | if np.any(iou_mask): 275 | xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) 276 | 277 | label[i][yind, xind, iou_mask, :] = 0 278 | label[i][yind, xind, iou_mask, 0:4] = bbox_xywh 279 | label[i][yind, xind, iou_mask, 4:5] = 1.0 280 | label[i][yind, xind, iou_mask, 5:] = smooth_onehot 281 | 282 | bbox_ind = int(bbox_count[i] % self.max_bbox_per_scale) 283 | bboxes_xywh[i][bbox_ind, :4] = bbox_xywh 284 | bbox_count[i] += 1 285 | 286 | exist_positive = True 287 | 288 | if not exist_positive: 289 | best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1) 290 | best_detect = int(best_anchor_ind / self.anchor_per_scale) 291 | best_anchor = int(best_anchor_ind % self.anchor_per_scale) 292 | xind, yind = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32) 293 | 294 | label[best_detect][yind, xind, best_anchor, :] = 0 295 | label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh 296 | label[best_detect][yind, xind, best_anchor, 4:5] = 1.0 297 | label[best_detect][yind, xind, best_anchor, 5:] = smooth_onehot 298 | 299 | bbox_ind = int(bbox_count[best_detect] % self.max_bbox_per_scale) 300 | bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh 301 | bbox_count[best_detect] += 1 302 | 303 | if self.train_yolo_tiny: 304 | label_mbbox, label_lbbox = label 305 | mbboxes, lbboxes = bboxes_xywh 306 | return label_mbbox, label_lbbox, mbboxes, lbboxes 307 | 308 | label_sbbox, label_mbbox, label_lbbox = label 309 | sbboxes, mbboxes, lbboxes = bboxes_xywh 310 | return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes 311 | 312 | def __len__(self): 313 | return self.num_batchs 314 |
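# --- Illustrative usage sketch (not part of the original file) ---
# Assuming TRAIN_ANNOT_PATH and TRAIN_CLASSES in yolov3/configs.py point at a
# prepared dataset, the loader can be exercised on its own:
#
#   from yolov3.dataset import Dataset
#   trainset = Dataset('train')
#   print(len(trainset))                  # number of batches per epoch
#   batch_image, targets = next(trainset) # one preprocessed, augmented batch
#   print(batch_image.shape)              # (TRAIN_BATCH_SIZE, TRAIN_INPUT_SIZE, TRAIN_INPUT_SIZE, 3)
#   # targets holds one (label, bboxes) pair per output scale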
-------------------------------------------------------------------------------- /yolov3/utils.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : utils.py 4 | # Author : PyLessons 5 | # Created date: 2020-09-27 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : additional yolov3 and yolov4 functions 9 | # 10 | #================================================================ 11 | from multiprocessing import Process, Queue, Pipe 12 | import cv2 13 | import time 14 | import random 15 | import colorsys 16 | import numpy as np 17 | import tensorflow as tf 18 | from yolov3.configs import * 19 | from yolov3.yolov4 import * 20 | from tensorflow.python.saved_model import tag_constants 21 | 22 | def load_yolo_weights(model, weights_file): 23 | tf.keras.backend.clear_session() # used to reset layer names 24 | # load Darknet original weights to TensorFlow model 25 | if YOLO_TYPE == "yolov3": 26 | range1 = 75 if not TRAIN_YOLO_TINY else 13 27 | range2 = [58, 66, 74] if not TRAIN_YOLO_TINY else [9, 12] 28 | if YOLO_TYPE == "yolov4": 29 | range1 = 110 if not TRAIN_YOLO_TINY else 21 30 | range2 = [93, 101, 109] if not TRAIN_YOLO_TINY else [17, 20] 31 | 32 | with open(weights_file, 'rb') as wf: 33 | major, minor, revision, seen, _ = np.fromfile(wf, dtype=np.int32, count=5) 34 | 35 | j = 0 36 | for i in range(range1): 37 | if i > 0: 38 | conv_layer_name = 'conv2d_%d' %i 39 | else: 40 | conv_layer_name = 'conv2d' 41 | 42 | if j > 0: 43 | bn_layer_name = 'batch_normalization_%d' %j 44 | else: 45 | bn_layer_name = 'batch_normalization' 46 | 47 | conv_layer = model.get_layer(conv_layer_name) 48 | filters = conv_layer.filters 49 | k_size = conv_layer.kernel_size[0] 50 | in_dim = conv_layer.input_shape[-1] 51 | 52 | if i not in range2: 53 | # darknet weights: [beta, gamma, mean, variance] 54 | bn_weights = np.fromfile(wf, dtype=np.float32, count=4 * filters) 55 | # tf weights: [gamma, beta, mean, variance] 56 | bn_weights = bn_weights.reshape((4, filters))[[1, 0, 2, 3]] 57 | bn_layer = model.get_layer(bn_layer_name) 58 | j += 1 59 | else: 60 | conv_bias = np.fromfile(wf, dtype=np.float32, count=filters) 61 | 62 | # darknet shape (out_dim, in_dim, height, width) 63 | conv_shape = (filters, in_dim, k_size, k_size) 64 | conv_weights = np.fromfile(wf, dtype=np.float32, count=np.product(conv_shape)) 65 | # tf shape (height, width, in_dim, out_dim) 66 | conv_weights = conv_weights.reshape(conv_shape).transpose([2, 3, 1, 0]) 67 | 68 | if i not in range2: 69 | conv_layer.set_weights([conv_weights]) 70 | bn_layer.set_weights(bn_weights) 71 | else: 72 | conv_layer.set_weights([conv_weights, conv_bias]) 73 | 74 | assert len(wf.read()) == 0, 'failed to read all data' 75 | 76 | def Load_Yolo_model(): 77 | gpus = tf.config.experimental.list_physical_devices('GPU') 78 | if len(gpus) > 0: 79 | print(f'GPUs {gpus}') 80 | try: tf.config.experimental.set_memory_growth(gpus[0], True) 81 | except RuntimeError: pass 82 | 83 | if YOLO_FRAMEWORK == "tf": # TensorFlow detection 84 | if YOLO_TYPE == "yolov4": 85 | Darknet_weights = YOLO_V4_TINY_WEIGHTS if TRAIN_YOLO_TINY else YOLO_V4_WEIGHTS 86 | if YOLO_TYPE == "yolov3": 87 | Darknet_weights = YOLO_V3_TINY_WEIGHTS if TRAIN_YOLO_TINY else YOLO_V3_WEIGHTS 88 | 89 | if YOLO_CUSTOM_WEIGHTS == False: 90 | print("Loading Darknet_weights from:", Darknet_weights) 91 | yolo = 
Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=YOLO_COCO_CLASSES) 92 | load_yolo_weights(yolo, Darknet_weights) # use Darknet weights 93 | else: 94 | checkpoint = f"./checkpoints/{TRAIN_MODEL_NAME}" 95 | if TRAIN_YOLO_TINY: 96 | checkpoint += "_Tiny" 97 | print("Loading custom weights from:", checkpoint) 98 | yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=TRAIN_CLASSES) 99 | yolo.load_weights(checkpoint) # use custom weights 100 | 101 | elif YOLO_FRAMEWORK == "trt": # TensorRT detection 102 | saved_model_loaded = tf.saved_model.load(YOLO_CUSTOM_WEIGHTS, tags=[tag_constants.SERVING]) 103 | signature_keys = list(saved_model_loaded.signatures.keys()) 104 | yolo = saved_model_loaded.signatures['serving_default'] 105 | 106 | return yolo 107 | 108 | def image_preprocess(image, target_size, gt_boxes=None): 109 | ih, iw = target_size 110 | h, w, _ = image.shape 111 | 112 | scale = min(iw/w, ih/h) 113 | nw, nh = int(scale * w), int(scale * h) 114 | image_resized = cv2.resize(image, (nw, nh)) 115 | 116 | image_paded = np.full(shape=[ih, iw, 3], fill_value=128.0) 117 | dw, dh = (iw - nw) // 2, (ih-nh) // 2 118 | image_paded[dh:nh+dh, dw:nw+dw, :] = image_resized 119 | image_paded = image_paded / 255. 120 | 121 | if gt_boxes is None: 122 | return image_paded 123 | 124 | else: 125 | gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * scale + dw 126 | gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + dh 127 | return image_paded, gt_boxes 128 | 129 | 130 | def draw_bbox(image, bboxes, CLASSES=YOLO_COCO_CLASSES, show_label=True, show_confidence = True, Text_colors=(255,255,0), rectangle_colors='', tracking=False): 131 | NUM_CLASS = read_class_names(CLASSES) 132 | num_classes = len(NUM_CLASS) 133 | image_h, image_w, _ = image.shape 134 | hsv_tuples = [(1.0 * x / num_classes, 1., 1.) 
for x in range(num_classes)] 135 | #print("hsv_tuples", hsv_tuples) 136 | colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples)) 137 | colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), colors)) 138 | 139 | random.seed(0) 140 | random.shuffle(colors) 141 | random.seed(None) 142 | 143 | for i, bbox in enumerate(bboxes): 144 | coor = np.array(bbox[:4], dtype=np.int32) 145 | score = bbox[4] 146 | class_ind = int(bbox[5]) 147 | bbox_color = rectangle_colors if rectangle_colors != '' else colors[class_ind] 148 | bbox_thick = int(0.6 * (image_h + image_w) / 1000) 149 | if bbox_thick < 1: bbox_thick = 1 150 | fontScale = 0.75 * bbox_thick 151 | (x1, y1), (x2, y2) = (coor[0], coor[1]), (coor[2], coor[3]) 152 | 153 | # put object rectangle 154 | cv2.rectangle(image, (x1, y1), (x2, y2), bbox_color, bbox_thick*2) 155 | 156 | if show_label: 157 | # get text label 158 | score_str = " {:.2f}".format(score) if show_confidence else "" 159 | 160 | if tracking: score_str = " "+str(score) 161 | 162 | try: 163 | label = "{}".format(NUM_CLASS[class_ind]) + score_str 164 | except KeyError: 165 | print("You received a KeyError; this usually means the original YOLO weights are being run") 166 | print("with custom classes - if you use a custom model, set YOLO_CUSTOM_WEIGHTS in configs.py") 167 | label = str(class_ind) + score_str # fall back to the raw class index so drawing does not crash 168 | # get text size 169 | (text_width, text_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_COMPLEX_SMALL, 170 | fontScale, thickness=bbox_thick) 171 | # put filled text rectangle 172 | cv2.rectangle(image, (x1, y1), (x1 + text_width, y1 - text_height - baseline), bbox_color, thickness=cv2.FILLED) 173 | 174 | # put text above rectangle 175 | cv2.putText(image, label, (x1, y1-4), cv2.FONT_HERSHEY_COMPLEX_SMALL, 176 | fontScale, Text_colors, bbox_thick, lineType=cv2.LINE_AA) 177 | 178 | return image 179 | 180 | 181 | def bboxes_iou(boxes1, boxes2): 182 | boxes1 = np.array(boxes1) 183 | boxes2 = np.array(boxes2) 184 | 185 | boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1]) 186 | boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1]) 187 | 188 | left_up = np.maximum(boxes1[..., :2], boxes2[..., :2]) 189 | right_down = np.minimum(boxes1[..., 2:], boxes2[..., 2:]) 190 | 191 | inter_section = np.maximum(right_down - left_up, 0.0) 192 | inter_area = inter_section[..., 0] * inter_section[..., 1] 193 | union_area = boxes1_area + boxes2_area - inter_area 194 | ious = np.maximum(1.0 * inter_area / union_area, np.finfo(np.float32).eps) 195 | 196 | return ious 197 | 198 | 199 | def nms(bboxes, iou_threshold, sigma=0.3, method='nms'): 200 | """ 201 | :param bboxes: (xmin, ymin, xmax, ymax, score, class) 202 | 203 | Note: soft-nms, https://arxiv.org/pdf/1704.04503.pdf 204 | https://github.com/bharatsingh430/soft-nms 205 | """ 206 | classes_in_img = list(set(bboxes[:, 5])) 207 | best_bboxes = [] 208 | 209 | for cls in classes_in_img: 210 | cls_mask = (bboxes[:, 5] == cls) 211 | cls_bboxes = bboxes[cls_mask] 212 | # Process 1: keep looping while bounding boxes of this class remain 213 | while len(cls_bboxes) > 0: 214 | # Process 2: select the bounding box with the highest score (box A) 215 | max_ind = np.argmax(cls_bboxes[:, 4]) 216 | best_bbox = cls_bboxes[max_ind] 217 | best_bboxes.append(best_bbox) 218 | cls_bboxes = np.concatenate([cls_bboxes[: max_ind], cls_bboxes[max_ind + 1:]]) 219 | # Process 3: compute the IoU between box A and every remaining box, then 220 | # suppress (or down-weight, for soft-nms) the boxes whose IoU exceeds the threshold 221 | iou = bboxes_iou(best_bbox[np.newaxis, :4], cls_bboxes[:, :4]) 222 | weight = np.ones((len(iou),), dtype=np.float32) 223 | 224 | assert method in ['nms', 'soft-nms'] 225 | 226 | if method == 'nms': 227 | iou_mask = iou > iou_threshold 228 | weight[iou_mask] = 0.0 229 | 230 | if method == 'soft-nms': 231 | weight = np.exp(-(1.0 * iou ** 2 / sigma)) 232 | 233 | cls_bboxes[:, 4] = cls_bboxes[:, 4] * weight 234 | score_mask = cls_bboxes[:, 4] > 0. 235 | cls_bboxes = cls_bboxes[score_mask] 236 | 237 | return best_bboxes 238 | 239 | 240 | def postprocess_boxes(pred_bbox, original_image, input_size, score_threshold): 241 | valid_scale=[0, np.inf] 242 | pred_bbox = np.array(pred_bbox) 243 | 244 | pred_xywh = pred_bbox[:, 0:4] 245 | pred_conf = pred_bbox[:, 4] 246 | pred_prob = pred_bbox[:, 5:] 247 | 248 | # 1. (x, y, w, h) --> (xmin, ymin, xmax, ymax) 249 | pred_coor = np.concatenate([pred_xywh[:, :2] - pred_xywh[:, 2:] * 0.5, 250 | pred_xywh[:, :2] + pred_xywh[:, 2:] * 0.5], axis=-1) 251 | # 2. (xmin, ymin, xmax, ymax) -> (xmin_org, ymin_org, xmax_org, ymax_org) 252 | org_h, org_w = original_image.shape[:2] 253 | resize_ratio = min(input_size / org_w, input_size / org_h) 254 | 255 | dw = (input_size - resize_ratio * org_w) / 2 256 | dh = (input_size - resize_ratio * org_h) / 2 257 | 258 | pred_coor[:, 0::2] = 1.0 * (pred_coor[:, 0::2] - dw) / resize_ratio 259 | pred_coor[:, 1::2] = 1.0 * (pred_coor[:, 1::2] - dh) / resize_ratio 260 | 261 | # 3. clip boxes that reach outside the original image 262 | pred_coor = np.concatenate([np.maximum(pred_coor[:, :2], [0, 0]), 263 | np.minimum(pred_coor[:, 2:], [org_w - 1, org_h - 1])], axis=-1) 264 | invalid_mask = np.logical_or((pred_coor[:, 0] > pred_coor[:, 2]), (pred_coor[:, 1] > pred_coor[:, 3])) 265 | pred_coor[invalid_mask] = 0 266 | 267 | # 4. discard boxes that became degenerate after clipping 268 | bboxes_scale = np.sqrt(np.multiply.reduce(pred_coor[:, 2:4] - pred_coor[:, 0:2], axis=-1)) 269 | scale_mask = np.logical_and((valid_scale[0] < bboxes_scale), (bboxes_scale < valid_scale[1])) 270 | 271 | # 5. discard boxes with low scores 272 | classes = np.argmax(pred_prob, axis=-1) 273 | scores = pred_conf * pred_prob[np.arange(len(pred_coor)), classes] 274 | score_mask = scores > score_threshold 275 | mask = np.logical_and(scale_mask, score_mask) 276 | coors, scores, classes = pred_coor[mask], scores[mask], classes[mask] 277 | 278 | return np.concatenate([coors, scores[:, np.newaxis], classes[:, np.newaxis]], axis=-1) 279 | 280 | 281 | def detect_image(Yolo, image_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors=''): 282 | original_image = cv2.imread(image_path) 283 | # the original applied cv2.COLOR_BGR2RGB twice in a row here, which cancels out; 284 | # the redundant pair is dropped, so the image stays BGR end-to-end, exactly as before 285 | 286 | image_data = image_preprocess(np.copy(original_image), [input_size, input_size]) 287 | image_data = image_data[np.newaxis, ...].astype(np.float32) 288 | 289 | if YOLO_FRAMEWORK == "tf": 290 | pred_bbox = Yolo.predict(image_data) 291 | elif YOLO_FRAMEWORK == "trt": 292 | batched_input = tf.constant(image_data) 293 | result = Yolo(batched_input) 294 | pred_bbox = [] 295 | for key, value in result.items(): 296 | value = value.numpy() 297 | pred_bbox.append(value) 298 | 299 | pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox] 300 | pred_bbox = tf.concat(pred_bbox, axis=0) 301 | 302 | bboxes = postprocess_boxes(pred_bbox, original_image, input_size, score_threshold) 303 | bboxes = nms(bboxes, iou_threshold, method='nms') 304 | 305 | image = draw_bbox(original_image, bboxes, CLASSES=CLASSES, rectangle_colors=rectangle_colors) 306 | # CreateXMLfile("XML_Detections", str(int(time.time())), original_image, bboxes, read_class_names(CLASSES)) 307 | 308 | if output_path != '': cv2.imwrite(output_path, image) 309 | if show: 310 | # show the image in a window 311 | cv2.imshow("predicted image", image) 312 | # block until any key is pressed 313 | cv2.waitKey(0) 314 | # then close the window 315 | cv2.destroyAllWindows() 316 | 317 | return image 318 | 319 | def Predict_bbox_mp(Frames_data, Predicted_data, Processing_times): 320 | gpus = tf.config.experimental.list_physical_devices('GPU') 321 | if len(gpus) > 0: 322 | try: tf.config.experimental.set_memory_growth(gpus[0], True) 323 | except RuntimeError: print("RuntimeError in tf.config.experimental.set_memory_growth") 324 | Yolo = Load_Yolo_model() 325 | times = [] 326 | while True: 327 | if Frames_data.qsize()>0: 328 | image_data = Frames_data.get() 329 | t1 = time.time() 330 | Processing_times.put(time.time()) 331 | 332 | if YOLO_FRAMEWORK == "tf": 333 | if tf.__version__ > '2.4.0': # note: lexicographic compare - misorders versions like '2.10.0' 334 | pred_bbox = Yolo(image_data) 335 | else: 336 | pred_bbox = Yolo.predict(image_data) 337 | elif YOLO_FRAMEWORK == "trt": 338 | batched_input = tf.constant(image_data) 339 | result = Yolo(batched_input) 340 | pred_bbox = [] 341 | for key, value in result.items(): 342 | value = value.numpy() 343 | pred_bbox.append(value) 344 | 345 | pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox] 346 | pred_bbox = tf.concat(pred_bbox, axis=0) 347 | 348 | Predicted_data.put(pred_bbox) 349 | 350 | 351 | def postprocess_mp(Predicted_data, original_frames, Processed_frames, Processing_times, input_size, CLASSES, score_threshold, iou_threshold, rectangle_colors, realtime): 352 | times = [] 353 | while True: 354 | if Predicted_data.qsize()>0: 355 | pred_bbox = Predicted_data.get() 356 | if realtime: 357 | while
original_frames.qsize() > 1: 358 | original_image = original_frames.get() 359 | else: 360 | original_image = original_frames.get() 361 | 362 | bboxes = postprocess_boxes(pred_bbox, original_image, input_size, score_threshold) 363 | bboxes = nms(bboxes, iou_threshold, method='nms') 364 | image = draw_bbox(original_image, bboxes, CLASSES=CLASSES, rectangle_colors=rectangle_colors) 365 | times.append(time.time()-Processing_times.get()) 366 | times = times[-20:] 367 | 368 | ms = sum(times)/len(times)*1000 369 | fps = 1000 / ms 370 | image = cv2.putText(image, "Time: {:.1f}FPS".format(fps), (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2) 371 | #print("Time: {:.2f}ms, Final FPS: {:.1f}".format(ms, fps)) 372 | 373 | Processed_frames.put(image) 374 | 375 | def Show_Image_mp(Processed_frames, show, Final_frames): 376 | while True: 377 | if Processed_frames.qsize()>0: 378 | image = Processed_frames.get() 379 | Final_frames.put(image) 380 | if show: 381 | cv2.imshow('output', image) 382 | if cv2.waitKey(25) & 0xFF == ord("q"): 383 | cv2.destroyAllWindows() 384 | break 385 | 386 | # detect from a webcam or a video file using three worker processes (predict, post-process, display) 387 | def detect_video_realtime_mp(video_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors='', realtime=False): 388 | if realtime: 389 | vid = cv2.VideoCapture(0) 390 | else: 391 | vid = cv2.VideoCapture(video_path) 392 | 393 | # by default VideoCapture returns float instead of int 394 | width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)) 395 | height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)) 396 | fps = int(vid.get(cv2.CAP_PROP_FPS)) 397 | codec = cv2.VideoWriter_fourcc(*'XVID') 398 | out = cv2.VideoWriter(output_path, codec, fps, (width, height)) # output_path must be .mp4 399 | no_of_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT)) 400 | 401 | original_frames = Queue() 402 | Frames_data = Queue() 403 | Predicted_data = Queue() 404 | Processed_frames = Queue() 405 | Processing_times = Queue() 406 | Final_frames = Queue() 407 | 408 | p1 = Process(target=Predict_bbox_mp, args=(Frames_data, Predicted_data, Processing_times)) 409 | p2 = Process(target=postprocess_mp, args=(Predicted_data, original_frames, Processed_frames, Processing_times, input_size, CLASSES, score_threshold, iou_threshold, rectangle_colors, realtime)) 410 | p3 = Process(target=Show_Image_mp, args=(Processed_frames, show, Final_frames)) 411 | p1.start() 412 | p2.start() 413 | p3.start() 414 | 415 | while True: 416 | ret, img = vid.read() 417 | if not ret: 418 | break 419 | 420 | original_image = img # the back-to-back BGR2RGB conversions in the original cancelled out, so the frame is used as-is 421 | 422 | original_frames.put(original_image) 423 | 424 | image_data = image_preprocess(np.copy(original_image), [input_size, input_size]) 425 | image_data = image_data[np.newaxis, ...].astype(np.float32) 426 | Frames_data.put(image_data) 427 | 428 | while True: 429 | if original_frames.qsize() == 0 and Frames_data.qsize() == 0 and Predicted_data.qsize() == 0 and Processed_frames.qsize() == 0 and Processing_times.qsize() == 0 and Final_frames.qsize() == 0: 430 | p1.terminate() 431 | p2.terminate() 432 | p3.terminate() 433 | break 434 | elif Final_frames.qsize()>0: 435 | image = Final_frames.get() 436 | if output_path != '': out.write(image) 437 | 438 | cv2.destroyAllWindows() 439 | 440 | def detect_video(Yolo, video_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors=''): 
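# times below tracks pure model inference (t2 - t1), while times_2 also includes
# post-processing and drawing (t3 - t1); both keep only the last 20 frames so the
# FPS printed and drawn on the output is a smoothed moving average.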
times, times_2 = [], [] 442 | vid = cv2.VideoCapture(video_path) 443 | 444 | # by default VideoCapture returns float instead of int 445 | width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)) 446 | height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)) 447 | fps = int(vid.get(cv2.CAP_PROP_FPS)) 448 | codec = cv2.VideoWriter_fourcc(*'XVID') 449 | out = cv2.VideoWriter(output_path, codec, fps, (width, height)) # output_path must be .mp4 450 | 451 | while True: 452 | ret, img = vid.read() 453 | 454 | # stop when the stream ends (the original wrapped a no-op double BGR2RGB conversion in a bare try/except for this) 455 | if not ret or img is None: 456 | break 457 | original_image = img 458 | 459 | 460 | image_data = image_preprocess(np.copy(original_image), [input_size, input_size]) 461 | image_data = image_data[np.newaxis, ...].astype(np.float32) 462 | 463 | t1 = time.time() 464 | if YOLO_FRAMEWORK == "tf": 465 | if tf.__version__ > '2.4.0': # note: lexicographic compare - misorders versions like '2.10.0' 466 | pred_bbox = Yolo(image_data, training=False) 467 | else: 468 | pred_bbox = Yolo.predict(image_data) 469 | elif YOLO_FRAMEWORK == "trt": 470 | batched_input = tf.constant(image_data) 471 | result = Yolo(batched_input) 472 | pred_bbox = [] 473 | for key, value in result.items(): 474 | value = value.numpy() 475 | pred_bbox.append(value) 476 | 477 | t2 = time.time() 478 | 479 | pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox] 480 | pred_bbox = tf.concat(pred_bbox, axis=0) 481 | 482 | bboxes = postprocess_boxes(pred_bbox, original_image, input_size, score_threshold) 483 | bboxes = nms(bboxes, iou_threshold, method='nms') 484 | 485 | image = draw_bbox(original_image, bboxes, CLASSES=CLASSES, rectangle_colors=rectangle_colors) 486 | 487 | t3 = time.time() 488 | times.append(t2-t1) 489 | times_2.append(t3-t1) 490 | 491 | times = times[-20:] 492 | times_2 = times_2[-20:] 493 | 494 | ms = sum(times)/len(times)*1000 495 | fps = 1000 / ms 496 | fps2 = 1000 / (sum(times_2)/len(times_2)*1000) 497 | 498 | image = cv2.putText(image, "Time: {:.1f}FPS".format(fps), (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2) 499 | # CreateXMLfile("XML_Detections", str(int(time.time())), original_image, bboxes, read_class_names(CLASSES)) 500 | 501 | print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2)) 502 | if output_path != '': out.write(image) 503 | if show: 504 | cv2.imshow('output', image) 505 | if cv2.waitKey(25) & 0xFF == ord("q"): 506 | cv2.destroyAllWindows() 507 | break 508 | 509 | cv2.destroyAllWindows() 510 | 511 | # detect from webcam 512 | def detect_realtime(Yolo, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors=''): 513 | times = [] 514 | vid = cv2.VideoCapture(1) # camera index 1; use 0 for the default webcam 515 | 516 | if output_path: 517 | # by default VideoCapture returns float instead of int 518 | width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)) 519 | height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)) 520 | fps = int(vid.get(cv2.CAP_PROP_FPS)) 521 | codec = cv2.VideoWriter_fourcc(*'XVID') 522 | out = cv2.VideoWriter(output_path, codec, fps, (width, height)) # output_path must be .mp4 523 | 524 | while True: 525 | ret, frame = vid.read() 526 | 527 | # stop when the camera stops delivering frames 528 | if not ret or frame is None: 529 | break 530 | original_frame = frame # the duplicated BGR2RGB pair cancelled out, so keep the frame as-is 531 | 532 | image_data = image_preprocess(np.copy(original_frame), [input_size, input_size]) 533 | image_data = image_data[np.newaxis, ...].astype(np.float32) 534 | 535 | t1 = time.time() 536 | if YOLO_FRAMEWORK == 
"tf": 537 | if tf.__version__ > '2.4.0': 538 | pred_bbox = Yolo(image_data, training=False) 539 | else: 540 | pred_bbox = Yolo.predict(image_data) 541 | # if True: 542 | # pred_bbox = Yolo.predict(image_data) 543 | elif YOLO_FRAMEWORK == "trt": 544 | batched_input = tf.constant(image_data) 545 | result = Yolo(batched_input) 546 | pred_bbox = [] 547 | for key, value in result.items(): 548 | value = value.numpy() 549 | pred_bbox.append(value) 550 | 551 | t2 = time.time() 552 | 553 | pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox] 554 | pred_bbox = tf.concat(pred_bbox, axis=0) 555 | 556 | bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold) 557 | bboxes = nms(bboxes, iou_threshold, method='nms') 558 | 559 | times.append(t2-t1) 560 | times = times[-20:] 561 | 562 | ms = sum(times)/len(times)*1000 563 | fps = 1000 / ms 564 | 565 | print("Time: {:.2f}ms, {:.1f} FPS".format(ms, fps)) 566 | 567 | frame = draw_bbox(original_frame, bboxes, CLASSES=CLASSES, rectangle_colors=rectangle_colors) 568 | # CreateXMLfile("XML_Detections", str(int(time.time())), original_frame, bboxes, read_class_names(CLASSES)) 569 | image = cv2.putText(frame, "Time: {:.1f}FPS".format(fps), (0, 30), 570 | cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2) 571 | 572 | if output_path != '': out.write(frame) 573 | if show: 574 | cv2.imshow('output', frame) 575 | if cv2.waitKey(25) & 0xFF == ord("q"): 576 | cv2.destroyAllWindows() 577 | break 578 | 579 | cv2.destroyAllWindows() 580 | -------------------------------------------------------------------------------- /yolov3/yolov3.py: -------------------------------------------------------------------------------- 1 | #================================================================ 2 | # 3 | # File name : yolov3.py 4 | # Author : PyLessons 5 | # Created date: 2020-06-04 6 | # Website : https://pylessons.com/ 7 | # GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3 8 | # Description : main yolov3 functions 9 | # 10 | #================================================================ 11 | import numpy as np 12 | import tensorflow as tf 13 | from tensorflow.keras.layers import Conv2D, Input, LeakyReLU, ZeroPadding2D, BatchNormalization, MaxPool2D 14 | from tensorflow.keras.regularizers import l2 15 | from yolov3.utils import read_class_names 16 | from yolov3.configs import * 17 | 18 | STRIDES = np.array(YOLO_STRIDES) 19 | ANCHORS = (np.array(YOLO_ANCHORS).T/STRIDES).T 20 | 21 | class BatchNormalization(BatchNormalization): 22 | # "Frozen state" and "inference mode" are two separate concepts. 23 | # `layer.trainable = False` is to freeze the layer, so the layer will use 24 | # stored moving `var` and `mean` in the "inference mode", and both `gama` 25 | # and `beta` will not be updated ! 
26 | def call(self, x, training=False): 27 | if not training: 28 | training = tf.constant(False) 29 | training = tf.logical_and(training, self.trainable) 30 | return super().call(x, training) 31 | 32 | def convolutional(input_layer, filters_shape, downsample=False, activate=True, bn=True): 33 | if downsample: 34 | input_layer = ZeroPadding2D(((1, 0), (1, 0)))(input_layer) 35 | padding = 'valid' 36 | strides = 2 37 | else: 38 | strides = 1 39 | padding = 'same' 40 | 41 | conv = Conv2D(filters=filters_shape[-1], kernel_size = filters_shape[0], strides=strides, 42 | padding=padding, use_bias=not bn, kernel_regularizer=l2(0.0005), 43 | kernel_initializer=tf.random_normal_initializer(stddev=0.01), 44 | bias_initializer=tf.constant_initializer(0.))(input_layer) 45 | if bn: 46 | conv = BatchNormalization()(conv) 47 | if activate == True: 48 | conv = LeakyReLU(alpha=0.1)(conv) 49 | 50 | return conv 51 | 52 | def residual_block(input_layer, input_channel, filter_num1, filter_num2): 53 | short_cut = input_layer 54 | conv = convolutional(input_layer, filters_shape=(1, 1, input_channel, filter_num1)) 55 | conv = convolutional(conv , filters_shape=(3, 3, filter_num1, filter_num2)) 56 | 57 | residual_output = short_cut + conv 58 | return residual_output 59 | 60 | def upsample(input_layer): 61 | return tf.image.resize(input_layer, (input_layer.shape[1] * 2, input_layer.shape[2] * 2), method='nearest') 62 | 63 | 64 | def darknet53(input_data): 65 | input_data = convolutional(input_data, (3, 3, 3, 32)) 66 | input_data = convolutional(input_data, (3, 3, 32, 64), downsample=True) 67 | 68 | for i in range(1): 69 | input_data = residual_block(input_data, 64, 32, 64) 70 | 71 | input_data = convolutional(input_data, (3, 3, 64, 128), downsample=True) 72 | 73 | for i in range(2): 74 | input_data = residual_block(input_data, 128, 64, 128) 75 | 76 | input_data = convolutional(input_data, (3, 3, 128, 256), downsample=True) 77 | 78 | for i in range(8): 79 | input_data = residual_block(input_data, 256, 128, 256) 80 | 81 | route_1 = input_data 82 | input_data = convolutional(input_data, (3, 3, 256, 512), downsample=True) 83 | 84 | for i in range(8): 85 | input_data = residual_block(input_data, 512, 256, 512) 86 | 87 | route_2 = input_data 88 | input_data = convolutional(input_data, (3, 3, 512, 1024), downsample=True) 89 | 90 | for i in range(4): 91 | input_data = residual_block(input_data, 1024, 512, 1024) 92 | 93 | return route_1, route_2, input_data 94 | 95 | def darknet19_tiny(input_data): 96 | input_data = convolutional(input_data, (3, 3, 3, 16)) 97 | input_data = MaxPool2D(2, 2, 'same')(input_data) 98 | input_data = convolutional(input_data, (3, 3, 16, 32)) 99 | input_data = MaxPool2D(2, 2, 'same')(input_data) 100 | input_data = convolutional(input_data, (3, 3, 32, 64)) 101 | input_data = MaxPool2D(2, 2, 'same')(input_data) 102 | input_data = convolutional(input_data, (3, 3, 64, 128)) 103 | input_data = MaxPool2D(2, 2, 'same')(input_data) 104 | input_data = convolutional(input_data, (3, 3, 128, 256)) 105 | route_1 = input_data 106 | input_data = MaxPool2D(2, 2, 'same')(input_data) 107 | input_data = convolutional(input_data, (3, 3, 256, 512)) 108 | input_data = MaxPool2D(2, 1, 'same')(input_data) 109 | input_data = convolutional(input_data, (3, 3, 512, 1024)) 110 | 111 | return route_1, input_data 112 | 113 | def YOLOv3(input_layer, NUM_CLASS): 114 | # After the input layer enters the Darknet-53 network, we get three branches 115 | route_1, route_2, conv = darknet53(input_layer) 116 | # See the orange module (DBL) 
in the reference diagram: five successive convolution operations in total 117 | conv = convolutional(conv, (1, 1, 1024, 512)) 118 | conv = convolutional(conv, (3, 3, 512, 1024)) 119 | conv = convolutional(conv, (1, 1, 1024, 512)) 120 | conv = convolutional(conv, (3, 3, 512, 1024)) 121 | conv = convolutional(conv, (1, 1, 1024, 512)) 122 | conv_lobj_branch = convolutional(conv, (3, 3, 512, 1024)) 123 | 124 | # conv_lbbox is used to predict large-sized objects, shape = [None, 13, 13, 255] 125 | conv_lbbox = convolutional(conv_lobj_branch, (1, 1, 1024, 3*(NUM_CLASS + 5)), activate=False, bn=False) 126 | 127 | conv = convolutional(conv, (1, 1, 512, 256)) 128 | # upsample here uses the nearest neighbor interpolation method, which has the advantage that the 129 | # upsampling process does not need to learn, thereby reducing the network parameters 130 | conv = upsample(conv) 131 | 132 | conv = tf.concat([conv, route_2], axis=-1) 133 | conv = convolutional(conv, (1, 1, 768, 256)) 134 | conv = convolutional(conv, (3, 3, 256, 512)) 135 | conv = convolutional(conv, (1, 1, 512, 256)) 136 | conv = convolutional(conv, (3, 3, 256, 512)) 137 | conv = convolutional(conv, (1, 1, 512, 256)) 138 | conv_mobj_branch = convolutional(conv, (3, 3, 256, 512)) 139 | 140 | # conv_mbbox is used to predict medium-sized objects, shape = [None, 26, 26, 255] 141 | conv_mbbox = convolutional(conv_mobj_branch, (1, 1, 512, 3*(NUM_CLASS + 5)), activate=False, bn=False) 142 | 143 | conv = convolutional(conv, (1, 1, 256, 128)) 144 | conv = upsample(conv) 145 | 146 | conv = tf.concat([conv, route_1], axis=-1) 147 | conv = convolutional(conv, (1, 1, 384, 128)) 148 | conv = convolutional(conv, (3, 3, 128, 256)) 149 | conv = convolutional(conv, (1, 1, 256, 128)) 150 | conv = convolutional(conv, (3, 3, 128, 256)) 151 | conv = convolutional(conv, (1, 1, 256, 128)) 152 | conv_sobj_branch = convolutional(conv, (3, 3, 128, 256)) 153 | 154 | # conv_sbbox is used to predict small-sized objects, shape = [None, 52, 52, 255] 155 | conv_sbbox = convolutional(conv_sobj_branch, (1, 1, 256, 3*(NUM_CLASS + 5)), activate=False, bn=False) 156 | 157 | return [conv_sbbox, conv_mbbox, conv_lbbox] 158 | 159 | def YOLOv3_tiny(input_layer, NUM_CLASS): 160 | # After the input layer enters the Darknet19-tiny network, we get two branches 161 | route_1, conv = darknet19_tiny(input_layer) 162 | 163 | conv = convolutional(conv, (1, 1, 1024, 256)) 164 | conv_lobj_branch = convolutional(conv, (3, 3, 256, 512)) 165 | 166 | # conv_lbbox is used to predict large-sized objects, shape = [None, 13, 13, 255] (stride 32 on a 416 input gives a 13x13 grid) 167 | conv_lbbox = convolutional(conv_lobj_branch, (1, 1, 512, 3*(NUM_CLASS + 5)), activate=False, bn=False) 168 | 169 | conv = convolutional(conv, (1, 1, 256, 128)) 170 | # upsample here uses the nearest neighbor interpolation method, which has the advantage that the 171 | # upsampling process does not need to learn, thereby reducing the network parameters 172 | conv = upsample(conv) 173 | 174 | conv = tf.concat([conv, route_1], axis=-1) 175 | conv_mobj_branch = convolutional(conv, (3, 3, 128, 256)) 176 | # conv_mbbox is used to predict medium-sized objects, shape = [None, 26, 26, 255] 177 | conv_mbbox = convolutional(conv_mobj_branch, (1, 1, 256, 3 * (NUM_CLASS + 5)), activate=False, bn=False) 178 | 179 | return [conv_mbbox, conv_lbbox] 180 | 181 | def Create_Yolov3(input_size=416, channels=3, training=False, CLASSES=YOLO_COCO_CLASSES): 182 | NUM_CLASS = len(read_class_names(CLASSES)) 183 | input_layer = Input([input_size, input_size, channels]) 184 | 185 | if TRAIN_YOLO_TINY: 186 | 
conv_tensors = YOLOv3_tiny(input_layer, NUM_CLASS) 187 | else: 188 | conv_tensors = YOLOv3(input_layer, NUM_CLASS) 189 | 190 | output_tensors = [] 191 | for i, conv_tensor in enumerate(conv_tensors): 192 | pred_tensor = decode(conv_tensor, NUM_CLASS, i) 193 | if training: output_tensors.append(conv_tensor) 194 | output_tensors.append(pred_tensor) 195 | 196 | YoloV3 = tf.keras.Model(input_layer, output_tensors) 197 | return YoloV3 198 | 199 | def decode(conv_output, NUM_CLASS, i=0): 200 | # where i = 0, 1 or 2 to correspond to the three grid scales 201 | conv_shape = tf.shape(conv_output) 202 | batch_size = conv_shape[0] 203 | output_size = conv_shape[1] 204 | 205 | conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS)) 206 | 207 | conv_raw_dxdy = conv_output[:, :, :, :, 0:2] # offset of center position 208 | conv_raw_dwdh = conv_output[:, :, :, :, 2:4] # Prediction box length and width offset 209 | conv_raw_conf = conv_output[:, :, :, :, 4:5] # confidence of the prediction box 210 | conv_raw_prob = conv_output[:, :, :, :, 5: ] # category probability of the prediction box 211 | 212 | # next need Draw the grid. Where output_size is equal to 13, 26 or 52 213 | y = tf.range(output_size, dtype=tf.int32) 214 | y = tf.expand_dims(y, -1) 215 | y = tf.tile(y, [1, output_size]) 216 | x = tf.range(output_size,dtype=tf.int32) 217 | x = tf.expand_dims(x, 0) 218 | x = tf.tile(x, [output_size, 1]) 219 | 220 | xy_grid = tf.concat([x[:, :, tf.newaxis], y[:, :, tf.newaxis]], axis=-1) 221 | xy_grid = tf.tile(xy_grid[tf.newaxis, :, :, tf.newaxis, :], [batch_size, 1, 1, 3, 1]) 222 | xy_grid = tf.cast(xy_grid, tf.float32) 223 | 224 | # Calculate the center position of the prediction box: 225 | pred_xy = (tf.sigmoid(conv_raw_dxdy) + xy_grid) * STRIDES[i] 226 | # Calculate the length and width of the prediction box: 227 | pred_wh = (tf.exp(conv_raw_dwdh) * ANCHORS[i]) * STRIDES[i] 228 | 229 | pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1) 230 | pred_conf = tf.sigmoid(conv_raw_conf) # object box calculates the predicted confidence 231 | pred_prob = tf.sigmoid(conv_raw_prob) # calculating the predicted probability category box object 232 | 233 | # calculating the predicted probability category box object 234 | return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1) 235 | 236 | def bbox_iou(boxes1, boxes2): 237 | boxes1_area = boxes1[..., 2] * boxes1[..., 3] 238 | boxes2_area = boxes2[..., 2] * boxes2[..., 3] 239 | 240 | boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5, 241 | boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1) 242 | boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5, 243 | boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1) 244 | 245 | left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2]) 246 | right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:]) 247 | 248 | inter_section = tf.maximum(right_down - left_up, 0.0) 249 | inter_area = inter_section[..., 0] * inter_section[..., 1] 250 | union_area = boxes1_area + boxes2_area - inter_area 251 | 252 | return 1.0 * inter_area / union_area 253 | 254 | def bbox_giou(boxes1, boxes2): 255 | boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5, 256 | boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1) 257 | boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5, 258 | boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1) 259 | 260 | boxes1 = tf.concat([tf.minimum(boxes1[..., :2], boxes1[..., 2:]), 261 | tf.maximum(boxes1[..., :2], boxes1[..., 2:])], axis=-1) 262 | boxes2 = 
tf.concat([tf.minimum(boxes2[..., :2], boxes2[..., 2:]), 263 | tf.maximum(boxes2[..., :2], boxes2[..., 2:])], axis=-1) 264 | 265 | boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1]) 266 | boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1]) 267 | 268 | left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2]) 269 | right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:]) 270 | 271 | inter_section = tf.maximum(right_down - left_up, 0.0) 272 | inter_area = inter_section[..., 0] * inter_section[..., 1] 273 | union_area = boxes1_area + boxes2_area - inter_area 274 | 275 | # Calculate the iou value between the two bounding boxes 276 | iou = inter_area / union_area 277 | 278 | # Calculate the coordinates of the upper left corner and the lower right corner of the smallest closed convex surface 279 | enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2]) 280 | enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:]) 281 | enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0) 282 | 283 | # Calculate the area of the smallest closed convex surface C 284 | enclose_area = enclose[..., 0] * enclose[..., 1] 285 | 286 | # Calculate the GIoU value according to the GioU formula 287 | giou = iou - 1.0 * (enclose_area - union_area) / enclose_area 288 | 289 | return giou 290 | 291 | # testing (should be better than giou) 292 | def bbox_ciou(boxes1, boxes2): 293 | boxes1_coor = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5, 294 | boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1) 295 | boxes2_coor = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5, 296 | boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1) 297 | 298 | left = tf.maximum(boxes1_coor[..., 0], boxes2_coor[..., 0]) 299 | up = tf.maximum(boxes1_coor[..., 1], boxes2_coor[..., 1]) 300 | right = tf.maximum(boxes1_coor[..., 2], boxes2_coor[..., 2]) 301 | down = tf.maximum(boxes1_coor[..., 3], boxes2_coor[..., 3]) 302 | 303 | c = (right - left) * (right - left) + (up - down) * (up - down) 304 | iou = bbox_iou(boxes1, boxes2) 305 | 306 | u = (boxes1[..., 0] - boxes2[..., 0]) * (boxes1[..., 0] - boxes2[..., 0]) + (boxes1[..., 1] - boxes2[..., 1]) * (boxes1[..., 1] - boxes2[..., 1]) 307 | d = u / c 308 | 309 | ar_gt = boxes2[..., 2] / boxes2[..., 3] 310 | ar_pred = boxes1[..., 2] / boxes1[..., 3] 311 | 312 | ar_loss = 4 / (np.pi * np.pi) * (tf.atan(ar_gt) - tf.atan(ar_pred)) * (tf.atan(ar_gt) - tf.atan(ar_pred)) 313 | alpha = ar_loss / (1 - iou + ar_loss + 0.000001) 314 | ciou_term = d + alpha * ar_loss 315 | 316 | return iou - ciou_term 317 | 318 | 319 | def compute_loss(pred, conv, label, bboxes, i=0, CLASSES=YOLO_COCO_CLASSES): 320 | NUM_CLASS = len(read_class_names(CLASSES)) 321 | conv_shape = tf.shape(conv) 322 | batch_size = conv_shape[0] 323 | output_size = conv_shape[1] 324 | input_size = STRIDES[i] * output_size 325 | conv = tf.reshape(conv, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS)) 326 | 327 | conv_raw_conf = conv[:, :, :, :, 4:5] 328 | conv_raw_prob = conv[:, :, :, :, 5:] 329 | 330 | pred_xywh = pred[:, :, :, :, 0:4] 331 | pred_conf = pred[:, :, :, :, 4:5] 332 | 333 | label_xywh = label[:, :, :, :, 0:4] 334 | respond_bbox = label[:, :, :, :, 4:5] 335 | label_prob = label[:, :, :, :, 5:] 336 | 337 | giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1) 338 | input_size = tf.cast(input_size, tf.float32) 339 | 340 | bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2) 341 | 
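# bbox_loss_scale weights small boxes more heavily: it equals 2 - (ground-truth box
# area / image area), so tiny ground-truth boxes contribute up to twice as much to
# the GIoU term computed next.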
def compute_loss(pred, conv, label, bboxes, i=0, CLASSES=YOLO_COCO_CLASSES):
    NUM_CLASS = len(read_class_names(CLASSES))
    conv_shape  = tf.shape(conv)
    batch_size  = conv_shape[0]
    output_size = conv_shape[1]
    input_size  = STRIDES[i] * output_size
    conv = tf.reshape(conv, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    conv_raw_conf = conv[:, :, :, :, 4:5]
    conv_raw_prob = conv[:, :, :, :, 5:]

    pred_xywh = pred[:, :, :, :, 0:4]
    pred_conf = pred[:, :, :, :, 4:5]

    label_xywh   = label[:, :, :, :, 0:4]
    respond_bbox = label[:, :, :, :, 4:5]
    label_prob   = label[:, :, :, :, 5:]

    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    input_size = tf.cast(input_size, tf.float32)

    # give smaller boxes a larger loss weight so they are not drowned out by large ones
    bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    iou = bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :], bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    # For each predicted box, find the largest IoU with any ground-truth box
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    # If the largest IoU is below the threshold, the predicted box is treated as
    # containing no object, i.e. as background
    respond_bgd = (1.0 - respond_bbox) * tf.cast( max_iou < YOLO_IOU_LOSS_THRESH, tf.float32 )

    # focal-style weighting that down-weights easy, confident examples
    conf_focal = tf.pow(respond_bbox - pred_conf, 2)

    # Confidence loss: the network should predict a confidence of 1 when the grid
    # cell contains an object, and 0 when it does not.
    conf_loss = conf_focal * (
            respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
            +
            respond_bgd * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
    )

    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob, logits=conv_raw_prob)

    giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis=[1,2,3,4]))
    conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis=[1,2,3,4]))
    prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis=[1,2,3,4]))

    return giou_loss, conf_loss, prob_loss
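# Sketch of how compute_loss is typically driven from a custom training step.
# Names such as `yolo`, `image_data` and `target` are illustrative; the repo's
# train.py wires this up with its own dataset objects. With training=True the
# model returns raw conv outputs and decoded predictions in alternating pairs.
def _train_step_sketch(yolo, optimizer, image_data, target):
    import tensorflow as tf
    with tf.GradientTape() as tape:
        pred_result = yolo(image_data, training=True)
        giou_loss = conf_loss = prob_loss = 0.0
        for i in range(3):  # three grid scales (two for the tiny variants)
            conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
            loss_items = compute_loss(pred, conv, *target[i], i, CLASSES=TRAIN_CLASSES)
            giou_loss += loss_items[0]
            conf_loss += loss_items[1]
            prob_loss += loss_items[2]
        total_loss = giou_loss + conf_loss + prob_loss
    gradients = tape.gradient(total_loss, yolo.trainable_variables)
    optimizer.apply_gradients(zip(gradients, yolo.trainable_variables))
    return total_loss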
--------------------------------------------------------------------------------
/yolov3/yolov4.py:
--------------------------------------------------------------------------------
#================================================================
#
#   File name   : yolov4.py
#   Author      : PyLessons
#   Created date: 2020-09-31
#   Website     : https://pylessons.com/
#   GitHub      : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
#   Description : main yolov3 & yolov4 functions
#
#================================================================
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Input, LeakyReLU, ZeroPadding2D, BatchNormalization, MaxPool2D
from tensorflow.keras.regularizers import l2
from yolov3.configs import *

STRIDES = np.array(YOLO_STRIDES)
ANCHORS = (np.array(YOLO_ANCHORS).T / STRIDES).T

def read_class_names(class_file_name):
    # loads class names from a file
    names = {}
    with open(class_file_name, 'r') as data:
        for ID, name in enumerate(data):
            names[ID] = name.strip('\n')
    return names

class BatchNormalization(BatchNormalization):
    # "Frozen state" and "inference mode" are two separate concepts.
    # `layer.trainable = False` freezes the layer, so in "inference mode" it
    # uses the stored moving `var` and `mean`, and neither `gamma` nor `beta`
    # will be updated!
    def call(self, x, training=False):
        if not training:
            training = tf.constant(False)
        training = tf.logical_and(training, self.trainable)
        return super().call(x, training)
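# Minimal sketch of what the BatchNormalization override buys us (illustrative,
# not part of the model code): with trainable=False the layer always runs in
# inference mode with frozen statistics, even when the surrounding model is
# called with training=True -- the behavior wanted when fine-tuning from
# pretrained darknet weights.
def _frozen_bn_sketch():
    import tensorflow as tf
    bn = BatchNormalization()
    bn.trainable = False
    x = tf.random.normal((1, 4, 4, 8))
    # training is forced to False internally, so moving mean/var are used
    return bn(x, training=True)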
def convolutional(input_layer, filters_shape, downsample=False, activate=True, bn=True, activate_type='leaky'):
    if downsample:
        input_layer = ZeroPadding2D(((1, 0), (1, 0)))(input_layer)
        padding = 'valid'
        strides = 2
    else:
        strides = 1
        padding = 'same'

    conv = Conv2D(filters=filters_shape[-1], kernel_size=filters_shape[0], strides=strides,
                  padding=padding, use_bias=not bn, kernel_regularizer=l2(0.0005),
                  kernel_initializer=tf.random_normal_initializer(stddev=0.01),
                  bias_initializer=tf.constant_initializer(0.))(input_layer)
    if bn:
        conv = BatchNormalization()(conv)
    if activate == True:
        if activate_type == "leaky":
            conv = LeakyReLU(alpha=0.1)(conv)
        elif activate_type == "mish":
            conv = mish(conv)

    return conv

def mish(x):
    return x * tf.math.tanh(tf.math.softplus(x))

def residual_block(input_layer, input_channel, filter_num1, filter_num2, activate_type='leaky'):
    short_cut = input_layer
    conv = convolutional(input_layer, filters_shape=(1, 1, input_channel, filter_num1), activate_type=activate_type)
    conv = convolutional(conv, filters_shape=(3, 3, filter_num1, filter_num2), activate_type=activate_type)

    residual_output = short_cut + conv
    return residual_output

def upsample(input_layer):
    return tf.image.resize(input_layer, (input_layer.shape[1] * 2, input_layer.shape[2] * 2), method='nearest')

def route_group(input_layer, groups, group_id):
    convs = tf.split(input_layer, num_or_size_splits=groups, axis=-1)
    return convs[group_id]

def darknet53(input_data):
    input_data = convolutional(input_data, (3, 3, 3, 32))
    input_data = convolutional(input_data, (3, 3, 32, 64), downsample=True)

    for i in range(1):
        input_data = residual_block(input_data, 64, 32, 64)

    input_data = convolutional(input_data, (3, 3, 64, 128), downsample=True)

    for i in range(2):
        input_data = residual_block(input_data, 128, 64, 128)

    input_data = convolutional(input_data, (3, 3, 128, 256), downsample=True)

    for i in range(8):
        input_data = residual_block(input_data, 256, 128, 256)

    route_1 = input_data
    input_data = convolutional(input_data, (3, 3, 256, 512), downsample=True)

    for i in range(8):
        input_data = residual_block(input_data, 512, 256, 512)

    route_2 = input_data
    input_data = convolutional(input_data, (3, 3, 512, 1024), downsample=True)

    for i in range(4):
        input_data = residual_block(input_data, 1024, 512, 1024)

    return route_1, route_2, input_data

def cspdarknet53(input_data):
    input_data = convolutional(input_data, (3, 3, 3, 32), activate_type="mish")
    input_data = convolutional(input_data, (3, 3, 32, 64), downsample=True, activate_type="mish")

    route = input_data
    route = convolutional(route, (1, 1, 64, 64), activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 64, 64), activate_type="mish")
    for i in range(1):
        input_data = residual_block(input_data, 64, 32, 64, activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 64, 64), activate_type="mish")

    input_data = tf.concat([input_data, route], axis=-1)
    input_data = convolutional(input_data, (1, 1, 128, 64), activate_type="mish")
    input_data = convolutional(input_data, (3, 3, 64, 128), downsample=True, activate_type="mish")
    route = input_data
    route = convolutional(route, (1, 1, 128, 64), activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 128, 64), activate_type="mish")
    for i in range(2):
        input_data = residual_block(input_data, 64, 64, 64, activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 64, 64), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)

    input_data = convolutional(input_data, (1, 1, 128, 128), activate_type="mish")
    input_data = convolutional(input_data, (3, 3, 128, 256), downsample=True, activate_type="mish")
    route = input_data
    route = convolutional(route, (1, 1, 256, 128), activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 256, 128), activate_type="mish")
    for i in range(8):
        input_data = residual_block(input_data, 128, 128, 128, activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 128, 128), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)

    input_data = convolutional(input_data, (1, 1, 256, 256), activate_type="mish")
    route_1 = input_data
    input_data = convolutional(input_data, (3, 3, 256, 512), downsample=True, activate_type="mish")
    route = input_data
    route = convolutional(route, (1, 1, 512, 256), activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 512, 256), activate_type="mish")
    for i in range(8):
        input_data = residual_block(input_data, 256, 256, 256, activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 256, 256), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)

    input_data = convolutional(input_data, (1, 1, 512, 512), activate_type="mish")
    route_2 = input_data
    input_data = convolutional(input_data, (3, 3, 512, 1024), downsample=True, activate_type="mish")
    route = input_data
    route = convolutional(route, (1, 1, 1024, 512), activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 1024, 512), activate_type="mish")
    for i in range(4):
        input_data = residual_block(input_data, 512, 512, 512, activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 512, 512), activate_type="mish")
    input_data = tf.concat([input_data, route], axis=-1)

    input_data = convolutional(input_data, (1, 1, 1024, 1024), activate_type="mish")
    input_data = convolutional(input_data, (1, 1, 1024, 512))
    input_data = convolutional(input_data, (3, 3, 512, 1024))
    input_data = convolutional(input_data, (1, 1, 1024, 512))

    # SPP block: pool at three scales and concatenate with the input
    max_pooling_1 = tf.keras.layers.MaxPool2D(pool_size=13, padding='SAME', strides=1)(input_data)
    max_pooling_2 = tf.keras.layers.MaxPool2D(pool_size=9, padding='SAME', strides=1)(input_data)
    max_pooling_3 = tf.keras.layers.MaxPool2D(pool_size=5, padding='SAME', strides=1)(input_data)
    input_data = tf.concat([max_pooling_1, max_pooling_2, max_pooling_3, input_data], axis=-1)

    input_data = convolutional(input_data, (1, 1, 2048, 512))
    input_data = convolutional(input_data, (3, 3, 512, 1024))
    input_data = convolutional(input_data, (1, 1, 1024, 512))

    return route_1, route_2, input_data
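# Quick shape sketch (illustrative): for a 416x416x3 input, the three branches
# returned by darknet53 feed the 52x52, 26x26 and 13x13 detection scales.
def _darknet53_shapes_sketch():
    import tensorflow as tf
    inputs = tf.keras.layers.Input([416, 416, 3])
    route_1, route_2, conv = darknet53(inputs)
    # route_1: (None, 52, 52, 256), route_2: (None, 26, 26, 512), conv: (None, 13, 13, 1024)
    return route_1.shape, route_2.shape, conv.shape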
def darknet19_tiny(input_data):
    input_data = convolutional(input_data, (3, 3, 3, 16))
    input_data = MaxPool2D(2, 2, 'same')(input_data)
    input_data = convolutional(input_data, (3, 3, 16, 32))
    input_data = MaxPool2D(2, 2, 'same')(input_data)
    input_data = convolutional(input_data, (3, 3, 32, 64))
    input_data = MaxPool2D(2, 2, 'same')(input_data)
    input_data = convolutional(input_data, (3, 3, 64, 128))
    input_data = MaxPool2D(2, 2, 'same')(input_data)
    input_data = convolutional(input_data, (3, 3, 128, 256))
    route_1 = input_data
    input_data = MaxPool2D(2, 2, 'same')(input_data)
    input_data = convolutional(input_data, (3, 3, 256, 512))
    input_data = MaxPool2D(2, 1, 'same')(input_data)
    input_data = convolutional(input_data, (3, 3, 512, 1024))

    return route_1, input_data

def cspdarknet53_tiny(input_data): # not sure how this should be called
    input_data = convolutional(input_data, (3, 3, 3, 32), downsample=True)
    input_data = convolutional(input_data, (3, 3, 32, 64), downsample=True)
    input_data = convolutional(input_data, (3, 3, 64, 64))

    route = input_data
    input_data = route_group(input_data, 2, 1)
    input_data = convolutional(input_data, (3, 3, 32, 32))
    route_1 = input_data
    input_data = convolutional(input_data, (3, 3, 32, 32))
    input_data = tf.concat([input_data, route_1], axis=-1)
    input_data = convolutional(input_data, (1, 1, 32, 64))
    input_data = tf.concat([route, input_data], axis=-1)
    input_data = MaxPool2D(2, 2, 'same')(input_data)

    input_data = convolutional(input_data, (3, 3, 64, 128))
    route = input_data
    input_data = route_group(input_data, 2, 1)
    input_data = convolutional(input_data, (3, 3, 64, 64))
    route_1 = input_data
    input_data = convolutional(input_data, (3, 3, 64, 64))
    input_data = tf.concat([input_data, route_1], axis=-1)
    input_data = convolutional(input_data, (1, 1, 64, 128))
    input_data = tf.concat([route, input_data], axis=-1)
    input_data = MaxPool2D(2, 2, 'same')(input_data)

    input_data = convolutional(input_data, (3, 3, 128, 256))
    route = input_data
    input_data = route_group(input_data, 2, 1)
    input_data = convolutional(input_data, (3, 3, 128, 128))
    route_1 = input_data
    input_data = convolutional(input_data, (3, 3, 128, 128))
    input_data = tf.concat([input_data, route_1], axis=-1)
    input_data = convolutional(input_data, (1, 1, 128, 256))
    route_1 = input_data
    input_data = tf.concat([route, input_data], axis=-1)
    input_data = MaxPool2D(2, 2, 'same')(input_data)

    input_data = convolutional(input_data, (3, 3, 512, 512))

    return route_1, input_data
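# Small sketch of what route_group does in the tiny CSP blocks above: it splits
# the feature map along the channel axis and keeps one group, here the second
# half of a 4-channel tensor (illustrative values).
def _route_group_sketch():
    import tensorflow as tf
    x = tf.reshape(tf.range(16, dtype=tf.float32), (1, 2, 2, 4))
    half = route_group(x, 2, 1)  # keeps channels 2..3 of 0..3
    return half.shape             # (1, 2, 2, 2)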
def YOLOv3(input_layer, NUM_CLASS):
    # After the input layer passes through the Darknet-53 backbone, we get three branches
    route_1, route_2, conv = darknet53(input_layer)
    # See the orange modules (DBL) in the reference figure: a total of 5 convolution operations
    conv = convolutional(conv, (1, 1, 1024, 512))
    conv = convolutional(conv, (3, 3, 512, 1024))
    conv = convolutional(conv, (1, 1, 1024, 512))
    conv = convolutional(conv, (3, 3, 512, 1024))
    conv = convolutional(conv, (1, 1, 1024, 512))
    conv_lobj_branch = convolutional(conv, (3, 3, 512, 1024))

    # conv_lbbox is used to predict large objects, shape = [None, 13, 13, 255]
    conv_lbbox = convolutional(conv_lobj_branch, (1, 1, 1024, 3*(NUM_CLASS + 5)), activate=False, bn=False)

    conv = convolutional(conv, (1, 1, 512, 256))
    # upsample uses nearest-neighbor interpolation, which has the advantage that
    # the upsampling does not need to be learned, reducing the number of network parameters
    conv = upsample(conv)

    conv = tf.concat([conv, route_2], axis=-1)
    conv = convolutional(conv, (1, 1, 768, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))
    conv_mobj_branch = convolutional(conv, (3, 3, 256, 512))

    # conv_mbbox is used to predict medium objects, shape = [None, 26, 26, 255]
    conv_mbbox = convolutional(conv_mobj_branch, (1, 1, 512, 3*(NUM_CLASS + 5)), activate=False, bn=False)

    conv = convolutional(conv, (1, 1, 256, 128))
    conv = upsample(conv)

    conv = tf.concat([conv, route_1], axis=-1)
    conv = convolutional(conv, (1, 1, 384, 128))
    conv = convolutional(conv, (3, 3, 128, 256))
    conv = convolutional(conv, (1, 1, 256, 128))
    conv = convolutional(conv, (3, 3, 128, 256))
    conv = convolutional(conv, (1, 1, 256, 128))
    conv_sobj_branch = convolutional(conv, (3, 3, 128, 256))

    # conv_sbbox is used to predict small objects, shape = [None, 52, 52, 255]
    conv_sbbox = convolutional(conv_sobj_branch, (1, 1, 256, 3*(NUM_CLASS + 5)), activate=False, bn=False)

    return [conv_sbbox, conv_mbbox, conv_lbbox]

def YOLOv4(input_layer, NUM_CLASS):
    route_1, route_2, conv = cspdarknet53(input_layer)

    route = conv
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = upsample(conv)
    route_2 = convolutional(route_2, (1, 1, 512, 256))
    conv = tf.concat([route_2, conv], axis=-1)

    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))

    route_2 = conv
    conv = convolutional(conv, (1, 1, 256, 128))
    conv = upsample(conv)
    route_1 = convolutional(route_1, (1, 1, 256, 128))
    conv = tf.concat([route_1, conv], axis=-1)

    conv = convolutional(conv, (1, 1, 256, 128))
    conv = convolutional(conv, (3, 3, 128, 256))
    conv = convolutional(conv, (1, 1, 256, 128))
    conv = convolutional(conv, (3, 3, 128, 256))
    conv = convolutional(conv, (1, 1, 256, 128))

    route_1 = conv
    conv = convolutional(conv, (3, 3, 128, 256))
    conv_sbbox = convolutional(conv, (1, 1, 256, 3 * (NUM_CLASS + 5)), activate=False, bn=False)

    conv = convolutional(route_1, (3, 3, 128, 256), downsample=True)
    conv = tf.concat([conv, route_2], axis=-1)

    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = convolutional(conv, (3, 3, 256, 512))
    conv = convolutional(conv, (1, 1, 512, 256))

    route_2 = conv
    conv = convolutional(conv, (3, 3, 256, 512))
    conv_mbbox = convolutional(conv, (1, 1, 512, 3 * (NUM_CLASS + 5)), activate=False, bn=False)

    conv = convolutional(route_2, (3, 3, 256, 512), downsample=True)
    conv = tf.concat([conv, route], axis=-1)

    conv = convolutional(conv, (1, 1, 1024, 512))
    conv = convolutional(conv, (3, 3, 512, 1024))
    conv = convolutional(conv, (1, 1, 1024, 512))
    conv = convolutional(conv, (3, 3, 512, 1024))
    conv = convolutional(conv, (1, 1, 1024, 512))

    conv = convolutional(conv, (3, 3, 512, 1024))
    conv_lbbox = convolutional(conv, (1, 1, 1024, 3 * (NUM_CLASS + 5)), activate=False, bn=False)

    return [conv_sbbox, conv_mbbox, conv_lbbox]
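# Where the "255" in the shape comments above comes from: each scale predicts 3
# anchor boxes per cell, and every anchor carries 4 box offsets, 1 objectness
# score and NUM_CLASS class scores (illustrative helper, not used elsewhere).
def _head_channels(num_class=80):
    return 3 * (num_class + 5)  # 3 * 85 = 255 channels per detection scale for COCO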
def YOLOv3_tiny(input_layer, NUM_CLASS):
    # After the input layer passes through the Darknet19-tiny backbone, we get two branches
    route_1, conv = darknet19_tiny(input_layer)

    conv = convolutional(conv, (1, 1, 1024, 256))
    conv_lobj_branch = convolutional(conv, (3, 3, 256, 512))

    # conv_lbbox is used to predict large objects, shape = [None, 13, 13, 255]
    conv_lbbox = convolutional(conv_lobj_branch, (1, 1, 512, 3*(NUM_CLASS + 5)), activate=False, bn=False)

    conv = convolutional(conv, (1, 1, 256, 128))
    # upsample uses nearest-neighbor interpolation, which has the advantage that
    # the upsampling does not need to be learned, reducing the number of network parameters
    conv = upsample(conv)

    conv = tf.concat([conv, route_1], axis=-1)
    conv_mobj_branch = convolutional(conv, (3, 3, 128, 256))
    # conv_mbbox is used to predict medium objects, shape = [None, 26, 26, 255]
    conv_mbbox = convolutional(conv_mobj_branch, (1, 1, 256, 3 * (NUM_CLASS + 5)), activate=False, bn=False)

    return [conv_mbbox, conv_lbbox]

def YOLOv4_tiny(input_layer, NUM_CLASS):
    route_1, conv = cspdarknet53_tiny(input_layer)

    conv = convolutional(conv, (1, 1, 512, 256))

    conv_lobj_branch = convolutional(conv, (3, 3, 256, 512))
    conv_lbbox = convolutional(conv_lobj_branch, (1, 1, 512, 3 * (NUM_CLASS + 5)), activate=False, bn=False)

    conv = convolutional(conv, (1, 1, 256, 128))
    conv = upsample(conv)
    conv = tf.concat([conv, route_1], axis=-1)

    conv_mobj_branch = convolutional(conv, (3, 3, 128, 256))
    conv_mbbox = convolutional(conv_mobj_branch, (1, 1, 256, 3 * (NUM_CLASS + 5)), activate=False, bn=False)

    return [conv_mbbox, conv_lbbox]

def Create_Yolo(input_size=416, channels=3, training=False, CLASSES=YOLO_COCO_CLASSES):
    NUM_CLASS = len(read_class_names(CLASSES))
    input_layer = Input([input_size, input_size, channels])

    if TRAIN_YOLO_TINY:
        if YOLO_TYPE == "yolov4":
            conv_tensors = YOLOv4_tiny(input_layer, NUM_CLASS)
        if YOLO_TYPE == "yolov3":
            conv_tensors = YOLOv3_tiny(input_layer, NUM_CLASS)
    else:
        if YOLO_TYPE == "yolov4":
            conv_tensors = YOLOv4(input_layer, NUM_CLASS)
        if YOLO_TYPE == "yolov3":
            conv_tensors = YOLOv3(input_layer, NUM_CLASS)

    output_tensors = []
    for i, conv_tensor in enumerate(conv_tensors):
        pred_tensor = decode(conv_tensor, NUM_CLASS, i)
        if training: output_tensors.append(conv_tensor)
        output_tensors.append(pred_tensor)

    Yolo = tf.keras.Model(input_layer, output_tensors)
    return Yolo
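# Illustrative usage sketch for Create_Yolo (assumes YOLO_TYPE and friends are
# set in yolov3/configs.py): build an inference model and push a dummy image
# through it to inspect the decoded output shapes.
def _create_yolo_sketch():
    import numpy as np
    yolo = Create_Yolo(input_size=416, training=False)
    dummy = np.zeros((1, 416, 416, 3), dtype=np.float32)
    preds = yolo(dummy)  # one decoded tensor per grid scale
    return [p.shape for p in preds]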
def decode(conv_output, NUM_CLASS, i=0):
    # where i = 0, 1 or 2 corresponds to the three grid scales
    conv_shape = tf.shape(conv_output)
    batch_size = conv_shape[0]
    output_size = conv_shape[1]

    conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    # split into center offset, width/height offset, objectness and class scores
    conv_raw_dxdy, conv_raw_dwdh, conv_raw_conf, conv_raw_prob = tf.split(conv_output, (2, 2, 1, NUM_CLASS), axis=-1)

    # Next, draw the grid, where output_size is 13, 26 or 52
    xy_grid = tf.meshgrid(tf.range(output_size), tf.range(output_size))
    xy_grid = tf.expand_dims(tf.stack(xy_grid, axis=-1), axis=2)  # [gx, gy, 1, 2]
    xy_grid = tf.tile(tf.expand_dims(xy_grid, axis=0), [batch_size, 1, 1, 3, 1])
    xy_grid = tf.cast(xy_grid, tf.float32)

    # Calculate the center position of the predicted box:
    pred_xy = (tf.sigmoid(conv_raw_dxdy) + xy_grid) * STRIDES[i]
    # Calculate the width and height of the predicted box:
    pred_wh = (tf.exp(conv_raw_dwdh) * ANCHORS[i]) * STRIDES[i]

    pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)
    pred_conf = tf.sigmoid(conv_raw_conf)  # predicted confidence that the box contains an object
    pred_prob = tf.sigmoid(conv_raw_prob)  # predicted class probabilities

    return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)
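# Worked numeric sketch of the decoding above (illustrative values only): at
# scale i=0 (stride 8), a cell at grid position (5, 7) with raw center offsets
# of 0.0 decodes to ((sigmoid(0) + 5) * 8, (sigmoid(0) + 7) * 8) = (44, 60) in
# input-image pixels; raw width/height offsets of 0.0 give exp(0) = 1, i.e.
# the anchor size times the stride.
def _decode_math_sketch():
    import numpy as np
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    cx, cy, stride = 5, 7, 8
    px = (sigmoid(0.0) + cx) * stride  # 44.0
    py = (sigmoid(0.0) + cy) * stride  # 60.0
    return px, py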
def bbox_iou(boxes1, boxes2):
    # boxes are given as (x_center, y_center, w, h)
    boxes1_area = boxes1[..., 2] * boxes1[..., 3]
    boxes2_area = boxes2[..., 2] * boxes2[..., 3]

    # convert to (x_min, y_min, x_max, y_max)
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    left_up    = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])

    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area

    return 1.0 * inter_area / union_area

def bbox_giou(boxes1, boxes2):
    # convert from (x_center, y_center, w, h) to (x_min, y_min, x_max, y_max)
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    # make sure (x_min, y_min) really is the upper-left corner and (x_max, y_max) the lower-right
    boxes1 = tf.concat([tf.minimum(boxes1[..., :2], boxes1[..., 2:]),
                        tf.maximum(boxes1[..., :2], boxes1[..., 2:])], axis=-1)
    boxes2 = tf.concat([tf.minimum(boxes2[..., :2], boxes2[..., 2:]),
                        tf.maximum(boxes2[..., :2], boxes2[..., 2:])], axis=-1)

    boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])

    left_up    = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])

    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area

    # Calculate the IoU value between the two bounding boxes
    iou = inter_area / union_area

    # Calculate the corners of the smallest enclosing box
    enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
    enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
    enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)

    # Calculate the area of the smallest enclosing box C
    enclose_area = enclose[..., 0] * enclose[..., 1]

    # Calculate GIoU according to the GIoU formula
    giou = iou - 1.0 * (enclose_area - union_area) / enclose_area

    return giou

# experimental (expected to work better than GIoU)
def bbox_ciou(boxes1, boxes2):
    boxes1_coor = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                             boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2_coor = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                             boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    # corners of the smallest enclosing box
    left  = tf.minimum(boxes1_coor[..., 0], boxes2_coor[..., 0])
    up    = tf.minimum(boxes1_coor[..., 1], boxes2_coor[..., 1])
    right = tf.maximum(boxes1_coor[..., 2], boxes2_coor[..., 2])
    down  = tf.maximum(boxes1_coor[..., 3], boxes2_coor[..., 3])

    # squared diagonal length of the enclosing box
    c = (right - left) * (right - left) + (up - down) * (up - down)
    iou = bbox_iou(boxes1, boxes2)

    # squared distance between the two box centers
    u = (boxes1[..., 0] - boxes2[..., 0]) * (boxes1[..., 0] - boxes2[..., 0]) + (boxes1[..., 1] - boxes2[..., 1]) * (boxes1[..., 1] - boxes2[..., 1])
    d = u / c

    ar_gt = boxes2[..., 2] / boxes2[..., 3]
    ar_pred = boxes1[..., 2] / boxes1[..., 3]

    # aspect-ratio consistency term: 4/pi^2 * (arctan(ar_gt) - arctan(ar_pred))^2
    ar_loss = 4 / (np.pi * np.pi) * (tf.atan(ar_gt) - tf.atan(ar_pred)) * (tf.atan(ar_gt) - tf.atan(ar_pred))
    alpha = ar_loss / (1 - iou + ar_loss + 0.000001)
    ciou_term = d + alpha * ar_loss

    return iou - ciou_term
def compute_loss(pred, conv, label, bboxes, i=0, CLASSES=YOLO_COCO_CLASSES):
    NUM_CLASS = len(read_class_names(CLASSES))
    conv_shape  = tf.shape(conv)
    batch_size  = conv_shape[0]
    output_size = conv_shape[1]
    input_size  = STRIDES[i] * output_size
    conv = tf.reshape(conv, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    conv_raw_conf = conv[:, :, :, :, 4:5]
    conv_raw_prob = conv[:, :, :, :, 5:]

    pred_xywh = pred[:, :, :, :, 0:4]
    pred_conf = pred[:, :, :, :, 4:5]

    label_xywh   = label[:, :, :, :, 0:4]
    respond_bbox = label[:, :, :, :, 4:5]
    label_prob   = label[:, :, :, :, 5:]

    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    input_size = tf.cast(input_size, tf.float32)

    # give smaller boxes a larger loss weight so they are not drowned out by large ones
    bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    iou = bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :], bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    # For each predicted box, find the largest IoU with any ground-truth box
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    # If the largest IoU is below the threshold, the predicted box is treated as
    # containing no object, i.e. as background
    respond_bgd = (1.0 - respond_bbox) * tf.cast( max_iou < YOLO_IOU_LOSS_THRESH, tf.float32 )

    # focal-style weighting that down-weights easy, confident examples
    conf_focal = tf.pow(respond_bbox - pred_conf, 2)

    # Confidence loss: the network should predict a confidence of 1 when the grid
    # cell contains an object, and 0 when it does not.
    conf_loss = conf_focal * (
            respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
            +
            respond_bgd * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
    )

    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob, logits=conv_raw_prob)

    giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis=[1,2,3,4]))
    conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis=[1,2,3,4]))
    prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis=[1,2,3,4]))

    return giou_loss, conf_loss, prob_loss
--------------------------------------------------------------------------------