├── .gitignore
├── LICENSE
├── README.md
├── checkpoints
│   └── .gitkeep
├── colab_gpu.ipynb
├── conda-cpu.yml
├── conda-gpu.yml
├── convert.py
├── data
│   ├── checkpoint
│   ├── coco.names
│   ├── girl.png
│   ├── meme.jpg
│   ├── meme2.jpeg
│   ├── meme_out.jpg
│   ├── street.jpg
│   ├── street_out.jpg
│   └── voc2012.names
├── detect.py
├── detect_video.py
├── docs
│   └── training_voc.md
├── requirements-gpu.txt
├── requirements.txt
├── setup.py
├── tools
│   ├── export_tflite.py
│   ├── export_tfserving.py
│   ├── visualize_dataset.py
│   └── voc2012.py
├── train.py
└── yolov3_tf2
    ├── __init__.py
    ├── dataset.py
    ├── models.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.h5
2 | *.weights
3 | *.tar
4 | *.tfrecord
5 | /checkpoints/*
6 | /serving/*
7 | /logs/
8 | /Untitled.ipynb
9 | /output.jpg
10 | /data/voc2012_raw/
11 |
12 | # Created by https://www.gitignore.io/api/python
13 | # Edit at https://www.gitignore.io/?templates=python
14 |
15 | ### Python ###
16 | # Byte-compiled / optimized / DLL files
17 | __pycache__/
18 | *.py[cod]
19 | *$py.class
20 |
21 | # C extensions
22 | *.so
23 |
24 | # Distribution / packaging
25 | .Python
26 | build/
27 | develop-eggs/
28 | dist/
29 | downloads/
30 | eggs/
31 | .eggs/
32 | lib/
33 | lib64/
34 | parts/
35 | sdist/
36 | var/
37 | wheels/
38 | pip-wheel-metadata/
39 | share/python-wheels/
40 | *.egg-info/
41 | .installed.cfg
42 | *.egg
43 | MANIFEST
44 |
45 | # PyInstaller
46 | # Usually these files are written by a python script from a template
47 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
48 | *.manifest
49 | *.spec
50 |
51 | # Installer logs
52 | pip-log.txt
53 | pip-delete-this-directory.txt
54 |
55 | # Unit test / coverage reports
56 | htmlcov/
57 | .tox/
58 | .nox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *.cover
65 | .hypothesis/
66 | .pytest_cache/
67 |
68 | # Translations
69 | *.mo
70 | *.pot
71 |
72 | # Django stuff:
73 | *.log
74 | local_settings.py
75 | db.sqlite3
76 |
77 | # Flask stuff:
78 | instance/
79 | .webassets-cache
80 |
81 | # Scrapy stuff:
82 | .scrapy
83 |
84 | # Sphinx documentation
85 | docs/_build/
86 |
87 | # PyBuilder
88 | target/
89 |
90 | # Jupyter Notebook
91 | .ipynb_checkpoints
92 |
93 | # IPython
94 | profile_default/
95 | ipython_config.py
96 |
97 | # pyenv
98 | .python-version
99 |
100 | # pipenv
101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
103 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not
104 | # install all needed dependencies.
105 | #Pipfile.lock 106 | 107 | # celery beat schedule file 108 | celerybeat-schedule 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # End of https://www.gitignore.io/api/python 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Zihao Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YoloV3 Implemented in TensorFlow 2.0 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zzh8829/yolov3-tf2/blob/master/colab_gpu.ipynb) 4 | 5 | This repo provides a clean implementation of YoloV3 in TensorFlow 2.0 using all the best practices. 
6 |
7 | ## Key Features
8 |
9 | - [x] TensorFlow 2.0
10 | - [x] `yolov3` with pre-trained Weights
11 | - [x] `yolov3-tiny` with pre-trained Weights
12 | - [x] Inference example
13 | - [x] Transfer learning example
14 | - [x] Eager mode training with `tf.GradientTape`
15 | - [x] Graph mode training with `model.fit`
16 | - [x] Functional model with `tf.keras.layers`
17 | - [x] Input pipeline using `tf.data`
18 | - [x] TensorFlow Serving
19 | - [x] Vectorized transformations
20 | - [x] GPU accelerated
21 | - [x] Fully integrated with `absl-py` from [abseil.io](https://abseil.io)
22 | - [x] Clean implementation
23 | - [x] Following the best practices
24 | - [x] MIT License
25 |
26 | ![demo](https://raw.githubusercontent.com/zzh8829/yolov3-tf2/master/data/meme_out.jpg)
27 | ![demo](https://raw.githubusercontent.com/zzh8829/yolov3-tf2/master/data/street_out.jpg)
28 |
29 | ## Usage
30 |
31 | ### Installation
32 |
33 | #### Conda (Recommended)
34 |
35 | ```bash
36 | # TensorFlow CPU
37 | conda env create -f conda-cpu.yml
38 | conda activate yolov3-tf2-cpu
39 |
40 | # TensorFlow GPU
41 | conda env create -f conda-gpu.yml
42 | conda activate yolov3-tf2-gpu
43 | ```
44 |
45 | #### Pip
46 |
47 | ```bash
48 | pip install -r requirements.txt
49 | ```
50 |
51 | ### Nvidia Driver (For GPU)
52 |
53 | ```bash
54 | # Ubuntu 18.04
55 | sudo apt-add-repository -r ppa:graphics-drivers/ppa
56 | sudo apt install nvidia-driver-430
57 | # Windows/Other
58 | https://www.nvidia.com/Download/index.aspx
59 | ```
60 |
61 | ### Convert pre-trained Darknet weights
62 |
63 | ```bash
64 | # yolov3
65 | wget https://pjreddie.com/media/files/yolov3.weights -O data/yolov3.weights
66 | python convert.py --weights ./data/yolov3.weights --output ./checkpoints/yolov3.tf
67 |
68 | # yolov3-tiny
69 | wget https://pjreddie.com/media/files/yolov3-tiny.weights -O data/yolov3-tiny.weights
70 | python convert.py --weights ./data/yolov3-tiny.weights --output ./checkpoints/yolov3-tiny.tf --tiny
71 | ```
72 |
73 | ### Detection
74 |
75 | ```bash
76 | # yolov3
77 | python detect.py --image ./data/meme.jpg
78 |
79 | # yolov3-tiny
80 | python detect.py --weights ./checkpoints/yolov3-tiny.tf --tiny --image ./data/street.jpg
81 |
82 | # webcam
83 | python detect_video.py --video 0
84 |
85 | # video file
86 | python detect_video.py --video path_to_file.mp4 --weights ./checkpoints/yolov3-tiny.tf --tiny
87 |
88 | # video file with output
89 | python detect_video.py --video path_to_file.mp4 --output ./output.avi
90 | ```
91 |
92 | ### Training
93 |
94 | I have created a complete tutorial on how to train from scratch using the VOC2012 Dataset.
95 | See the documentation here: https://github.com/zzh8829/yolov3-tf2/blob/master/docs/training_voc.md
96 |
97 | For customized training, you need to generate a tfrecord following the TensorFlow Object Detection API.
98 | For example, you can use [Microsoft VOTT](https://github.com/Microsoft/VoTT) to generate such a dataset.
99 | You can also use this [script](https://github.com/tensorflow/models/blob/master/research/object_detection/dataset_tools/create_pascal_tf_record.py) to create the Pascal VOC dataset.
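If you label data with your own tool instead, note that the reader in `yolov3_tf2/dataset.py` (`IMAGE_FEATURE_MAP`) only parses the encoded image bytes, the box corners normalized by image width/height, and the class text. Below is a minimal sketch of writing one compatible record; the image path, box, and class name are made-up placeholders, and `tools/voc2012.py` remains the reference implementation.

```python
import tensorflow as tf

# Minimal sketch: write one record with just the features that
# yolov3_tf2/dataset.py (IMAGE_FEATURE_MAP) actually parses.
# The path, box, and class below are placeholders.
def make_example(img_path, boxes, class_names):
    # boxes: list of (xmin, ymin, xmax, ymax), normalized to [0, 1]
    img_raw = open(img_path, 'rb').read()
    xmin, ymin, xmax, ymax = [list(c) for c in zip(*boxes)]
    return tf.train.Example(features=tf.train.Features(feature={
        'image/encoded': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[img_raw])),
        'image/object/bbox/xmin': tf.train.Feature(float_list=tf.train.FloatList(value=xmin)),
        'image/object/bbox/ymin': tf.train.Feature(float_list=tf.train.FloatList(value=ymin)),
        'image/object/bbox/xmax': tf.train.Feature(float_list=tf.train.FloatList(value=xmax)),
        'image/object/bbox/ymax': tf.train.Feature(float_list=tf.train.FloatList(value=ymax)),
        'image/object/class/text': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[n.encode('utf8') for n in class_names])),
    }))

with tf.io.TFRecordWriter('./data/custom_train.tfrecord') as writer:
    example = make_example('./data/girl.png', [(0.18, 0.03, 0.94, 0.96)], ['person'])
    writer.write(example.SerializeToString())
```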
100 |
101 | Example command line arguments for training
102 | ```bash
103 | python train.py --batch_size 8 --dataset ~/Data/voc2012.tfrecord --val_dataset ~/Data/voc2012_val.tfrecord --epochs 100 --mode eager_tf --transfer fine_tune
104 |
105 | python train.py --batch_size 8 --dataset ~/Data/voc2012.tfrecord --val_dataset ~/Data/voc2012_val.tfrecord --epochs 100 --mode fit --transfer none
106 |
107 | python train.py --batch_size 8 --dataset ~/Data/voc2012.tfrecord --val_dataset ~/Data/voc2012_val.tfrecord --epochs 100 --mode fit --transfer no_output
108 |
109 | python train.py --batch_size 8 --dataset ~/Data/voc2012.tfrecord --val_dataset ~/Data/voc2012_val.tfrecord --epochs 10 --mode eager_fit --transfer fine_tune --weights ./checkpoints/yolov3-tiny.tf --tiny
110 | ```
111 |
112 | ### TensorFlow Serving
113 | You can export the model for TensorFlow Serving:
114 | ```
115 | python export_tfserving.py --output serving/yolov3/1/
116 | # verify tfserving graph
117 | saved_model_cli show --dir serving/yolov3/1/ --tag_set serve --signature_def serving_default
118 | ```
119 |
120 | The inputs are preprocessed images (see `dataset.transform_images`)
121 |
122 | The outputs are:
123 | ```
124 | yolo_nms_0: bounding boxes
125 | yolo_nms_1: scores
126 | yolo_nms_2: classes
127 | yolo_nms_3: numbers of valid detections
128 | ```
129 |
130 | ## Benchmark (No Training Yet)
131 |
132 | Numbers are obtained with rough calculations from `detect_video.py`
133 |
134 | ### Macbook Pro 13 (2.7GHz i5)
135 |
136 | | Detection   | 416x416 | 320x320 | 608x608 |
137 | |-------------|---------|---------|---------|
138 | | YoloV3      | 1000ms  | 500ms   | 1546ms  |
139 | | YoloV3-Tiny | 100ms   | 58ms    | 208ms   |
140 |
141 | ### Desktop PC (GTX 970)
142 |
143 | | Detection   | 416x416 | 320x320 | 608x608 |
144 | |-------------|---------|---------|---------|
145 | | YoloV3      | 74ms    | 57ms    | 129ms   |
146 | | YoloV3-Tiny | 18ms    | 15ms    | 28ms    |
147 |
148 | ### AWS g3.4xlarge (Tesla M60)
149 |
150 | | Detection   | 416x416 | 320x320 | 608x608 |
151 | |-------------|---------|---------|---------|
152 | | YoloV3      | 66ms    | 50ms    | 123ms   |
153 | | YoloV3-Tiny | 15ms    | 10ms    | 24ms    |
154 |
155 | ### RTX 2070 (credit to @AnaRhisT94)
156 |
157 | | Detection                          | 416x416 |
158 | |------------------------------------|---------|
159 | | YoloV3 predict_on_batch            | 29-32ms |
160 | | YoloV3 predict_on_batch + TensorRT | 22-28ms |
161 |
162 |
163 | Darknet version of YoloV3 at 416x416 takes 29ms on Titan X.
164 | Considering the Titan X benchmarks at roughly double a Tesla M60,
165 | performance-wise this implementation is pretty comparable.
166 |
167 | ## Implementation Details
168 |
169 | ### Eager execution
170 |
171 | Eager execution is a great addition for existing TensorFlow experts,
172 | but it is not very easy to use without some intermediate understanding of TensorFlow graphs.
173 | It is annoying when you accidentally use incompatible features like tensor.shape[0]
174 | or some sort of Python control flow that works fine in eager mode, but
175 | totally breaks down when you try to compile the model to a graph.
176 |
177 | ### model(x) vs. model.predict(x)
178 |
179 | When calling model(x) directly, we are executing the graph in eager mode. For
180 | `model.predict`, tf actually compiles the graph on the first run and then
181 | executes it in graph mode. So if you are only running the model once, `model(x)` is
182 | faster since there is no compilation needed. Otherwise, `model.predict` or
183 | using the exported SavedModel graph is much faster (by 2x). For non-real-time usage,
184 | `model.predict_on_batch` is even faster, as tested by @AnaRhisT94.
185 |
186 | ### GradientTape
187 |
188 | Extremely useful for debugging purposes; you can set breakpoints anywhere.
189 | You can combine all the Keras fitting functionalities with gradient tape using the
190 | `run_eagerly` argument in model.compile. From my limited testing, all training methods,
191 | including GradientTape and keras.fit, eager or not, yield similar performance. But graph
192 | mode is still preferred since it's a tiny bit more efficient.
193 |
194 | ### @tf.function
195 |
196 | @tf.function is very cool. It's like an in-between version of eager and graph.
197 | You can step through the function by disabling tf.function and then gain
198 | performance when you enable it in production. Important note: you should not
199 | pass any non-tensor parameter to a @tf.function; it will cause re-compilation
200 | on every call. I am not sure what the best way around this is, other than using globals.
201 |
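A quick self-contained demonstration of that re-compilation pitfall (this snippet is my own illustration, not code from this repo): a Python scalar argument is baked into the traced graph, so every new value triggers a retrace, while a tensor argument reuses one graph per dtype/shape signature.

```python
import tensorflow as tf

@tf.function
def scale(x, factor):
    # This print only runs while tracing, so it reveals each re-compilation.
    print('tracing with factor =', factor)
    return x * factor

x = tf.constant([1.0, 2.0])
scale(x, 2)                  # traces: the Python int 2 is baked into the graph
scale(x, 3)                  # traces again: a new Python value means a new graph
scale(x, tf.constant(2.0))   # traces once for the float32 scalar signature
scale(x, tf.constant(3.0))   # no print: the existing graph is reused
```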
202 | ### absl.py (abseil)
203 |
204 | Absolutely amazing. If you don't know already, absl.py is officially used by
205 | internal projects at Google. It standardizes the application interface for Python
206 | and many other languages. After using it within Google, I was so excited
207 | to hear abseil going open source. It includes many decades of best practices
208 | learned from creating large, scalable applications. I literally have
209 | nothing bad to say about it; I strongly recommend absl.py to everybody.
210 |
211 | ### Loading pre-trained Darknet weights
212 |
213 | This is very hard to do with a pure functional API because the layer ordering is different in
214 | tf.keras and darknet. The clean solution here is creating sub-models in Keras.
215 | Keras is not able to save nested models in h5 format properly, so TF Checkpoint is
216 | recommended since it's officially supported by TensorFlow.
217 |
218 | ### tf.keras.layers.BatchNormalization
219 |
220 | It doesn't work very well for transfer learning. There are many articles and
221 | GitHub issues all over the internet. I used a simple hack to make it work nicer
222 | for transfer learning with small batches.
223 |
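The hack is along the lines of the sketch below: subclass `BatchNormalization` so that a layer frozen with `trainable = False` also runs in inference mode and uses its stored moving statistics, instead of updating them with noisy small-batch statistics. (The `models.py` listing later in this file is truncated before the layer definitions, so treat this as an illustration of the idea rather than the verbatim implementation.)

```python
import tensorflow as tf

class BatchNormalization(tf.keras.layers.BatchNormalization):
    """Make trainable=False freeze BN for real (use the moving statistics)."""

    def call(self, x, training=False):
        if training is None:
            training = tf.constant(False)
        # A frozen layer behaves as in inference mode even during training.
        training = tf.logical_and(training, self.trainable)
        return super().call(x, training=training)
```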
224 | ### What is the output of transform_targets ???
225 |
226 | I know it's very confusing, but the output is a tuple of shapes
227 | ```
228 | (
229 |   [N, 13, 13, 3, 6],
230 |   [N, 26, 26, 3, 6],
231 |   [N, 52, 52, 3, 6]
232 | )
233 | ```
234 | where N is the number of labels in the batch and the last dimension "6" represents
235 | `[x, y, w, h, obj, class]` of the bounding boxes.
236 |
237 | ### IOU and Score Threshold
238 |
239 | The default threshold is 0.5 for both IOU and score; you can adjust them
240 | to your needs by setting the `--yolo_iou_threshold` and
241 | `--yolo_score_threshold` flags.
242 |
243 | ### Maximum number of boxes
244 |
245 | By default there can be at most 100 bounding boxes per image;
246 | if for some reason you would like more boxes, you can use the `--yolo_max_boxes` flag.
247 |
248 | ### NAN Loss / Training Failed / Doesn't Converge
249 |
250 | Many people, including me, have succeeded in training, so the code definitely works.
251 | @LongxingTan in https://github.com/zzh8829/yolov3-tf2/issues/128 provided some of his insights, summarized here:
252 |
253 | 1. For NaN loss, try to make the learning rate smaller.
254 | 2. Double-check the format of your input data. Data labelled by VoTT and labelImg is different, so make sure the input boxes are in the right format; check carefully that the format is `x1/width,y1/height,x2/width,y2/height` and **NOT** x1,y1,x2,y2 or x,y,w,h.
255 |
256 | Make sure to visualize your custom dataset using this tool:
257 | ```
258 | python tools/visualize_dataset.py --classes=./data/voc2012.names
259 | ```
260 |
261 | It will output one random image from your dataset with labels to `output.jpg`.
262 | Training definitely won't work if the rendered labels don't look correct.
263 |
264 | ## Command Line Args Reference
265 |
266 | ```bash
267 | convert.py:
268 |   --output: path to output
269 |     (default: './checkpoints/yolov3.tf')
270 |   --[no]tiny: yolov3 or yolov3-tiny
271 |     (default: 'false')
272 |   --weights: path to weights file
273 |     (default: './data/yolov3.weights')
274 |   --num_classes: number of classes in the model
275 |     (default: '80')
276 |     (an integer)
277 |
278 | detect.py:
279 |   --classes: path to classes file
280 |     (default: './data/coco.names')
281 |   --image: path to input image
282 |     (default: './data/girl.png')
283 |   --output: path to output image
284 |     (default: './output.jpg')
285 |   --[no]tiny: yolov3 or yolov3-tiny
286 |     (default: 'false')
287 |   --weights: path to weights file
288 |     (default: './checkpoints/yolov3.tf')
289 |   --num_classes: number of classes in the model
290 |     (default: '80')
291 |     (an integer)
292 |
293 | detect_video.py:
294 |   --classes: path to classes file
295 |     (default: './data/coco.names')
296 |   --video: path to input video (use 0 for cam)
297 |     (default: './data/video.mp4')
298 |   --output: path to output video (remember to set right codec for given format, e.g. XVID for .avi)
299 |     (default: None)
300 |   --output_format: codec used in VideoWriter when saving video to file
301 |     (default: 'XVID')
302 |   --[no]tiny: yolov3 or yolov3-tiny
303 |     (default: 'false')
304 |   --weights: path to weights file
305 |     (default: './checkpoints/yolov3.tf')
306 |   --num_classes: number of classes in the model
307 |     (default: '80')
308 |     (an integer)
309 |
310 | train.py:
311 |   --batch_size: batch size
312 |     (default: '8')
313 |     (an integer)
314 |   --classes: path to classes file
315 |     (default: './data/coco.names')
316 |   --dataset: path to dataset
317 |     (default: '')
318 |   --epochs: number of epochs
319 |     (default: '2')
320 |     (an integer)
321 |   --learning_rate: learning rate
322 |     (default: '0.001')
323 |     (a number)
324 |   --mode: <fit|eager_fit|eager_tf>: fit: model.fit, eager_fit: model.fit(run_eagerly=True), eager_tf: custom GradientTape
325 |     (default: 'fit')
326 |   --num_classes: number of classes in the model
327 |     (default: '80')
328 |     (an integer)
329 |   --size: image size
330 |     (default: '416')
331 |     (an integer)
332 |   --[no]tiny: yolov3 or yolov3-tiny
333 |     (default: 'false')
334 |   --transfer: <none|darknet|no_output|frozen|fine_tune>: none: Training from scratch, darknet: Transfer darknet, no_output: Transfer all but output, frozen: Transfer and freeze all,
335 |     fine_tune: Transfer all and freeze darknet only
336 |     (default: 'none')
337 |   --val_dataset: path to validation dataset
338 |     (default: '')
339 |   --weights: path to weights file
340 |     (default: './checkpoints/yolov3.tf')
341 | ```
342 |
343 | ## Change Log
344 |
345 | #### October 1, 2019
346 |
347 | - Updated TensorFlow to the v2.0.0 release
348 |
349 |
350 | ## References
351 |
352 | It is pretty much impossible to implement this from the yolov3 paper alone. I had to reference the official (very hard to understand) and many unofficial (many minor errors) repos to piece together the complete picture.
353 |
354 | - https://github.com/pjreddie/darknet
355 |     - official yolov3 implementation
356 | - https://github.com/AlexeyAB
357 |     - explanations of parameters
358 | - https://github.com/qqwweee/keras-yolo3
359 |     - models
360 |     - loss functions
361 | - https://github.com/YunYang1994/tensorflow-yolov3
362 |     - data transformations
363 |     - loss functions
364 | - https://github.com/ayooshkathuria/pytorch-yolo-v3
365 |     - models
366 | - https://github.com/broadinstitute/keras-resnet
367 |     - batch normalization fix
368 |
--------------------------------------------------------------------------------
/checkpoints/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zzh8829/yolov3-tf2/2784050f2fc1ff060f4c8b3ac2db231370569aa8/checkpoints/.gitkeep
--------------------------------------------------------------------------------
/conda-cpu.yml:
--------------------------------------------------------------------------------
1 | name: yolov3-tf2-cpu
2 |
3 | dependencies:
4 |   - python==3.7
5 |   - pip
6 |   - matplotlib
7 |   - opencv
8 |   - pip:
9 |     - tensorflow==2.1.0rc1
10 |     - lxml
11 |     - tqdm
12 |     - -e .
13 |
--------------------------------------------------------------------------------
/conda-gpu.yml:
--------------------------------------------------------------------------------
1 | name: yolov3-tf2-gpu
2 |
3 | dependencies:
4 |   - python==3.7
5 |   - pip
6 |   - matplotlib
7 |   - opencv
8 |   - cudnn
9 |   - cudatoolkit==10.1.243
10 |   - pip:
11 |     - tensorflow-gpu==2.1.0rc1
12 |     - lxml
13 |     - tqdm
14 |     - -e .
15 |
--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
1 | from absl import app, flags, logging
2 | from absl.flags import FLAGS
3 | import numpy as np
4 | from yolov3_tf2.models import YoloV3, YoloV3Tiny
5 | from yolov3_tf2.utils import load_darknet_weights
6 | import tensorflow as tf
7 |
8 | flags.DEFINE_string('weights', './data/yolov3.weights', 'path to weights file')
9 | flags.DEFINE_string('output', './checkpoints/yolov3.tf', 'path to output')
10 | flags.DEFINE_boolean('tiny', False, 'yolov3 or yolov3-tiny')
11 | flags.DEFINE_integer('num_classes', 80, 'number of classes in the model')
12 |
13 |
14 | def main(_argv):
15 |     physical_devices = tf.config.experimental.list_physical_devices('GPU')
16 |     if len(physical_devices) > 0:
17 |         tf.config.experimental.set_memory_growth(physical_devices[0], True)
18 |
19 |     if FLAGS.tiny:
20 |         yolo = YoloV3Tiny(classes=FLAGS.num_classes)
21 |     else:
22 |         yolo = YoloV3(classes=FLAGS.num_classes)
23 |     yolo.summary()
24 |     logging.info('model created')
25 |
26 |     load_darknet_weights(yolo, FLAGS.weights, FLAGS.tiny)
27 |     logging.info('weights loaded')
28 |
29 |     img = np.random.random((1, 320, 320, 3)).astype(np.float32)
30 |     output = yolo(img)
31 |     logging.info('sanity check passed')
32 |
33 |     yolo.save_weights(FLAGS.output)
34 |     logging.info('weights saved')
35 |
36 |
37 | if __name__ == '__main__':
38 |     try:
39 |         app.run(main)
40 |     except SystemExit:
41 |         pass
42 |
--------------------------------------------------------------------------------
/data/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "yolov3.tf"
2 | all_model_checkpoint_paths: "yolov3.tf"
3 |
--------------------------------------------------------------------------------
/data/coco.names:
-------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/girl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzh8829/yolov3-tf2/2784050f2fc1ff060f4c8b3ac2db231370569aa8/data/girl.png -------------------------------------------------------------------------------- /data/meme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzh8829/yolov3-tf2/2784050f2fc1ff060f4c8b3ac2db231370569aa8/data/meme.jpg -------------------------------------------------------------------------------- /data/meme2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzh8829/yolov3-tf2/2784050f2fc1ff060f4c8b3ac2db231370569aa8/data/meme2.jpeg -------------------------------------------------------------------------------- /data/meme_out.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzh8829/yolov3-tf2/2784050f2fc1ff060f4c8b3ac2db231370569aa8/data/meme_out.jpg -------------------------------------------------------------------------------- /data/street.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzh8829/yolov3-tf2/2784050f2fc1ff060f4c8b3ac2db231370569aa8/data/street.jpg -------------------------------------------------------------------------------- /data/street_out.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzh8829/yolov3-tf2/2784050f2fc1ff060f4c8b3ac2db231370569aa8/data/street_out.jpg -------------------------------------------------------------------------------- /data/voc2012.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | import time 2 | from absl import app, flags, logging 3 | from absl.flags import FLAGS 4 | import cv2 5 | import 
numpy as np
6 | import tensorflow as tf
7 | from yolov3_tf2.models import (
8 |     YoloV3, YoloV3Tiny
9 | )
10 | from yolov3_tf2.dataset import transform_images, load_tfrecord_dataset
11 | from yolov3_tf2.utils import draw_outputs
12 |
13 | flags.DEFINE_string('classes', './data/coco.names', 'path to classes file')
14 | flags.DEFINE_string('weights', './checkpoints/yolov3.tf',
15 |                     'path to weights file')
16 | flags.DEFINE_boolean('tiny', False, 'yolov3 or yolov3-tiny')
17 | flags.DEFINE_integer('size', 416, 'resize images to')
18 | flags.DEFINE_string('image', './data/girl.png', 'path to input image')
19 | flags.DEFINE_string('tfrecord', None, 'tfrecord instead of image')
20 | flags.DEFINE_string('output', './output.jpg', 'path to output image')
21 | flags.DEFINE_integer('num_classes', 80, 'number of classes in the model')
22 |
23 |
24 | def main(_argv):
25 |     physical_devices = tf.config.experimental.list_physical_devices('GPU')
26 |     for physical_device in physical_devices:
27 |         tf.config.experimental.set_memory_growth(physical_device, True)
28 |
29 |     if FLAGS.tiny:
30 |         yolo = YoloV3Tiny(classes=FLAGS.num_classes)
31 |     else:
32 |         yolo = YoloV3(classes=FLAGS.num_classes)
33 |
34 |     yolo.load_weights(FLAGS.weights).expect_partial()
35 |     logging.info('weights loaded')
36 |
37 |     class_names = [c.strip() for c in open(FLAGS.classes).readlines()]
38 |     logging.info('classes loaded')
39 |
40 |     if FLAGS.tfrecord:
41 |         dataset = load_tfrecord_dataset(
42 |             FLAGS.tfrecord, FLAGS.classes, FLAGS.size)
43 |         dataset = dataset.shuffle(512)
44 |         img_raw, _label = next(iter(dataset.take(1)))
45 |     else:
46 |         img_raw = tf.image.decode_image(
47 |             open(FLAGS.image, 'rb').read(), channels=3)
48 |
49 |     img = tf.expand_dims(img_raw, 0)
50 |     img = transform_images(img, FLAGS.size)
51 |
52 |     t1 = time.time()
53 |     boxes, scores, classes, nums = yolo(img)
54 |     t2 = time.time()
55 |     logging.info('time: {}'.format(t2 - t1))
56 |
57 |     logging.info('detections:')
58 |     for i in range(nums[0]):
59 |         logging.info('\t{}, {}, {}'.format(class_names[int(classes[0][i])],
60 |                                            np.array(scores[0][i]),
61 |                                            np.array(boxes[0][i])))
62 |
63 |     img = cv2.cvtColor(img_raw.numpy(), cv2.COLOR_RGB2BGR)
64 |     img = draw_outputs(img, (boxes, scores, classes, nums), class_names)
65 |     cv2.imwrite(FLAGS.output, img)
66 |     logging.info('output saved to: {}'.format(FLAGS.output))
67 |
68 |
69 | if __name__ == '__main__':
70 |     try:
71 |         app.run(main)
72 |     except SystemExit:
73 |         pass
74 |
--------------------------------------------------------------------------------
/detect_video.py:
--------------------------------------------------------------------------------
1 | import time
2 | from absl import app, flags, logging
3 | from absl.flags import FLAGS
4 | import cv2
5 | import tensorflow as tf
6 | from yolov3_tf2.models import (
7 |     YoloV3, YoloV3Tiny
8 | )
9 | from yolov3_tf2.dataset import transform_images
10 | from yolov3_tf2.utils import draw_outputs
11 |
12 |
13 | flags.DEFINE_string('classes', './data/coco.names', 'path to classes file')
14 | flags.DEFINE_string('weights', './checkpoints/yolov3.tf',
15 |                     'path to weights file')
16 | flags.DEFINE_boolean('tiny', False, 'yolov3 or yolov3-tiny')
17 | flags.DEFINE_integer('size', 416, 'resize images to')
18 | flags.DEFINE_string('video', './data/video.mp4',
19 |                     'path to video file or number for webcam')
20 | flags.DEFINE_string('output', None, 'path to output video')
21 | flags.DEFINE_string('output_format', 'XVID', 'codec used in VideoWriter when saving video to file')
22 | flags.DEFINE_integer('num_classes', 80, 'number of classes in the model')
23 |
24 |
25 | def main(_argv):
26 |     physical_devices = tf.config.experimental.list_physical_devices('GPU')
27 |     for physical_device in physical_devices:
28 |         tf.config.experimental.set_memory_growth(physical_device, True)
29 |
30 |     if FLAGS.tiny:
31 |         yolo = YoloV3Tiny(classes=FLAGS.num_classes)
32 |     else:
33 |         yolo = YoloV3(classes=FLAGS.num_classes)
34 |
35 |     yolo.load_weights(FLAGS.weights)
36 |     logging.info('weights loaded')
37 |
38 |     class_names = [c.strip() for c in open(FLAGS.classes).readlines()]
39 |     logging.info('classes loaded')
40 |
41 |     times = []
42 |
43 |     try:
44 |         vid = cv2.VideoCapture(int(FLAGS.video))
45 |     except ValueError:
46 |         vid = cv2.VideoCapture(FLAGS.video)
47 |
48 |     out = None
49 |
50 |     if FLAGS.output:
51 |         # by default VideoCapture returns float instead of int
52 |         width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
53 |         height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
54 |         fps = int(vid.get(cv2.CAP_PROP_FPS))
55 |         codec = cv2.VideoWriter_fourcc(*FLAGS.output_format)
56 |         out = cv2.VideoWriter(FLAGS.output, codec, fps, (width, height))
57 |
58 |     while True:
59 |         _, img = vid.read()
60 |
61 |         if img is None:
62 |             logging.warning("Empty Frame")
63 |             time.sleep(0.1)
64 |             continue
65 |
66 |         img_in = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
67 |         img_in = tf.expand_dims(img_in, 0)
68 |         img_in = transform_images(img_in, FLAGS.size)
69 |
70 |         t1 = time.time()
71 |         boxes, scores, classes, nums = yolo.predict(img_in)
72 |         t2 = time.time()
73 |         times.append(t2-t1)
74 |         times = times[-20:]
75 |
76 |         img = draw_outputs(img, (boxes, scores, classes, nums), class_names)
77 |         img = cv2.putText(img, "Time: {:.2f}ms".format(sum(times)/len(times)*1000), (0, 30),
78 |                           cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
79 |         if FLAGS.output:
80 |             out.write(img)
81 |         cv2.imshow('output', img)
82 |         if cv2.waitKey(1) == ord('q'):
83 |             break
84 |
85 |     cv2.destroyAllWindows()
86 |
87 |
88 | if __name__ == '__main__':
89 |     try:
90 |         app.run(main)
91 |     except SystemExit:
92 |         pass
93 |
--------------------------------------------------------------------------------
/docs/training_voc.md:
--------------------------------------------------------------------------------
1 | # Training Instructions
2 |
3 | ## VOC 2012 Dataset from Scratch
4 |
5 | Full instructions on how to train using VOC 2012 from scratch
6 |
7 | Requirements:
8 | 1. Able to detect images using the pretrained darknet model
9 | 2. Many Gigabytes of Disk Space
10 | 3. High Speed Internet Connection Preferred
11 | 4. GPU Preferred
12 |
13 |
14 | ### 1. Download Dataset
15 |
16 | You can read the full description of the dataset [here](http://host.robots.ox.ac.uk/pascal/VOC/)
17 | ```bash
18 | wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar -O ./data/voc2012_raw.tar
19 | mkdir -p ./data/voc2012_raw
20 | tar -xf ./data/voc2012_raw.tar -C ./data/voc2012_raw
21 | ls ./data/voc2012_raw/VOCdevkit/VOC2012 # Explore the dataset
22 | ```
23 |
24 | ### 2. Transform Dataset
25 |
26 | See tools/voc2012.py for the implementation; this format is based on the [tensorflow object detection API](https://github.com/tensorflow/models/tree/master/research/object_detection). Many fields
27 | are not required; I left them there for compatibility with the official API.
28 |
29 | ```bash
30 | python tools/voc2012.py \
31 |   --data_dir './data/voc2012_raw/VOCdevkit/VOC2012' \
32 |   --split train \
33 |   --output_file ./data/voc2012_train.tfrecord
34 |
35 | python tools/voc2012.py \
36 |   --data_dir './data/voc2012_raw/VOCdevkit/VOC2012' \
37 |   --split val \
38 |   --output_file ./data/voc2012_val.tfrecord
39 | ```
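Before moving on, it can help to read one record back and confirm it parses correctly; here is a quick throwaway snippet (not part of the repo's tooling), assuming the output path used above:

```python
import tensorflow as tf

# Read one record back from the generated tfrecord and print the
# fields the training pipeline will consume.
raw_dataset = tf.data.TFRecordDataset('./data/voc2012_train.tfrecord')
for raw_record in raw_dataset.take(1):
    example = tf.train.Example.FromString(raw_record.numpy())
    feature = example.features.feature
    print('filename:', feature['image/filename'].bytes_list.value[0].decode())
    print('classes: ', [t.decode() for t in feature['image/object/class/text'].bytes_list.value])
    print('xmin:    ', list(feature['image/object/bbox/xmin'].float_list.value))
```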
40 |
41 | You can visualize the dataset using this tool:
42 | ```
43 | python tools/visualize_dataset.py --classes=./data/voc2012.names
44 | ```
45 |
46 | It will output one random image with labels to `output.jpg`
47 |
48 | ### 3. Training
49 |
50 | You can adjust the parameters based on your setup.
51 |
52 | #### With Transfer Learning
53 |
54 | This step requires loading the pretrained darknet (feature extractor) weights.
55 | ```
56 | wget https://pjreddie.com/media/files/yolov3.weights -O data/yolov3.weights
57 | python convert.py
58 | python detect.py --image ./data/meme.jpg # Sanity check
59 |
60 | python train.py \
61 |   --dataset ./data/voc2012_train.tfrecord \
62 |   --val_dataset ./data/voc2012_val.tfrecord \
63 |   --classes ./data/voc2012.names \
64 |   --num_classes 20 \
65 |   --mode fit --transfer darknet \
66 |   --batch_size 16 \
67 |   --epochs 10 \
68 |   --weights ./checkpoints/yolov3.tf \
69 |   --weights_num_classes 80
70 | ```
71 |
72 | The original pretrained yolov3 has 80 classes; here we demonstrate how to
73 | do transfer learning on 20 classes.
74 |
75 | #### Training from random weights (NOT RECOMMENDED)
76 | Training from scratch is very difficult to converge.
77 | The original paper trained darknet
78 | on ImageNet before training the whole network as well.
79 |
80 | ```bash
81 | python train.py \
82 |   --dataset ./data/voc2012_train.tfrecord \
83 |   --val_dataset ./data/voc2012_val.tfrecord \
84 |   --classes ./data/voc2012.names \
85 |   --num_classes 20 \
86 |   --mode fit --transfer none \
87 |   --batch_size 16 \
88 |   --epochs 10
89 | ```
90 |
91 | I have tested that this works 100% with correct loss and converging over time.
92 | Each epoch takes around 10 minutes on a single AWS p2.xlarge (Nvidia K80 GPU) instance.
93 |
94 | You might see warnings or error messages during training; they are not critical, so don't worry too much about them.
95 | There might be a long wait time between each epoch because we are calculating validation loss.
96 |
97 | ### 4. Inference
98 |
99 | ```bash
100 | # detect from images
101 | python detect.py \
102 |   --classes ./data/voc2012.names \
103 |   --num_classes 20 \
104 |   --weights ./checkpoints/yolov3_train_5.tf \
105 |   --image ./data/street.jpg
106 |
107 | # detect from validation set
108 | python detect.py \
109 |   --classes ./data/voc2012.names \
110 |   --num_classes 20 \
111 |   --weights ./checkpoints/yolov3_train_5.tf \
112 |   --tfrecord ./data/voc2012_val.tfrecord
113 | ```
114 |
115 | You should see some detected objects in the standard output and the visualization at `output.jpg`.
116 | This is just a proof of concept, so it won't be as good as pretrained models.
117 | In my experience, you might need a lower score threshold if you didn't train it enough.
118 |
--------------------------------------------------------------------------------
/requirements-gpu.txt:
--------------------------------------------------------------------------------
1 | tensorflow-gpu==2.12.0
2 | opencv-python==4.2.0.32
3 | lxml
4 | tqdm
5 |
6 | -e .
7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.11.1 2 | opencv-python==4.2.0.32 3 | lxml 4 | tqdm 5 | 6 | -e . 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='yolov3_tf2', 4 | version='0.1', 5 | url='https://github.com/zzh8829/yolov3-tf2', 6 | author='Zihao Zhang', 7 | author_email='zzh8829@gmail.com', 8 | packages=['yolov3_tf2']) -------------------------------------------------------------------------------- /tools/export_tflite.py: -------------------------------------------------------------------------------- 1 | import time 2 | from absl import app, flags, logging 3 | from absl.flags import FLAGS 4 | import cv2 5 | import numpy as np 6 | import tensorflow as tf 7 | from yolov3_tf2.models import ( 8 | YoloV3, YoloV3Tiny 9 | ) 10 | from yolov3_tf2.dataset import transform_images 11 | 12 | from tensorflow.python.eager import def_function 13 | from tensorflow.python.framework import tensor_spec 14 | from tensorflow.python.util import nest 15 | 16 | flags.DEFINE_string('weights', './checkpoints/yolov3.tf', 17 | 'path to weights file') 18 | flags.DEFINE_boolean('tiny', False, 'yolov3 or yolov3-tiny') 19 | flags.DEFINE_string('output', './checkpoints/yolov3.tflite', 20 | 'path to saved_model') 21 | flags.DEFINE_string('classes', './data/coco.names', 'path to classes file') 22 | flags.DEFINE_string('image', './data/girl.png', 'path to input image') 23 | flags.DEFINE_integer('num_classes', 80, 'number of classes in the model') 24 | flags.DEFINE_integer('size', 416, 'image size') 25 | 26 | 27 | def main(_argv): 28 | if FLAGS.tiny: 29 | yolo = YoloV3Tiny(size=FLAGS.size, classes=FLAGS.num_classes) 30 | else: 31 | yolo = YoloV3(size=FLAGS.size, classes=FLAGS.num_classes) 32 | 33 | yolo.load_weights(FLAGS.weights) 34 | logging.info('weights loaded') 35 | 36 | converter = tf.lite.TFLiteConverter.from_keras_model(yolo) 37 | 38 | # Fix from https://stackoverflow.com/questions/64490203/tf-lite-non-max-suppression 39 | converter.experimental_new_converter = True 40 | converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] 41 | 42 | tflite_model = converter.convert() 43 | open(FLAGS.output, 'wb').write(tflite_model) 44 | logging.info("model saved to: {}".format(FLAGS.output)) 45 | 46 | interpreter = tf.lite.Interpreter(model_path=FLAGS.output) 47 | interpreter.allocate_tensors() 48 | logging.info('tflite model loaded') 49 | 50 | input_details = interpreter.get_input_details() 51 | output_details = interpreter.get_output_details() 52 | 53 | class_names = [c.strip() for c in open(FLAGS.classes).readlines()] 54 | logging.info('classes loaded') 55 | 56 | img = tf.image.decode_image(open(FLAGS.image, 'rb').read(), channels=3) 57 | img = tf.expand_dims(img, 0) 58 | img = transform_images(img, 416) 59 | 60 | t1 = time.time() 61 | outputs = interpreter.set_tensor(input_details[0]['index'], img) 62 | 63 | interpreter.invoke() 64 | 65 | output_data = interpreter.get_tensor(output_details[0]['index']) 66 | 67 | print(output_data) 68 | 69 | if __name__ == '__main__': 70 | app.run(main) 71 | -------------------------------------------------------------------------------- /tools/export_tfserving.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | from absl import app, flags, logging 3 | from absl.flags import FLAGS 4 | import cv2 5 | import numpy as np 6 | import tensorflow as tf 7 | from yolov3_tf2.models import ( 8 | YoloV3, YoloV3Tiny 9 | ) 10 | from yolov3_tf2.dataset import transform_images 11 | 12 | from tensorflow.python.eager import def_function 13 | from tensorflow.python.framework import tensor_spec 14 | from tensorflow.python.util import nest 15 | 16 | flags.DEFINE_string('weights', './checkpoints/yolov3.tf', 17 | 'path to weights file') 18 | flags.DEFINE_boolean('tiny', False, 'yolov3 or yolov3-tiny') 19 | flags.DEFINE_string('output', './serving/yolov3/1', 'path to saved_model') 20 | flags.DEFINE_string('classes', './data/coco.names', 'path to classes file') 21 | flags.DEFINE_string('image', './data/girl.png', 'path to input image') 22 | flags.DEFINE_integer('num_classes', 80, 'number of classes in the model') 23 | 24 | 25 | def main(_argv): 26 | if FLAGS.tiny: 27 | yolo = YoloV3Tiny(classes=FLAGS.num_classes) 28 | else: 29 | yolo = YoloV3(classes=FLAGS.num_classes) 30 | 31 | yolo.load_weights(FLAGS.weights) 32 | logging.info('weights loaded') 33 | 34 | tf.saved_model.save(yolo, FLAGS.output) 35 | logging.info("model saved to: {}".format(FLAGS.output)) 36 | 37 | model = tf.saved_model.load(FLAGS.output) 38 | infer = model.signatures[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY] 39 | logging.info(infer.structured_outputs) 40 | 41 | class_names = [c.strip() for c in open(FLAGS.classes).readlines()] 42 | logging.info('classes loaded') 43 | 44 | img = tf.image.decode_image(open(FLAGS.image, 'rb').read(), channels=3) 45 | img = tf.expand_dims(img, 0) 46 | img = transform_images(img, 416) 47 | 48 | t1 = time.time() 49 | outputs = infer(img) 50 | boxes, scores, classes, nums = outputs["yolo_nms"], outputs[ 51 | "yolo_nms_1"], outputs["yolo_nms_2"], outputs["yolo_nms_3"] 52 | t2 = time.time() 53 | logging.info('time: {}'.format(t2 - t1)) 54 | 55 | logging.info('detections:') 56 | for i in range(nums[0]): 57 | logging.info('\t{}, {}, {}'.format(class_names[int(classes[0][i])], 58 | scores[0][i].numpy(), 59 | boxes[0][i].numpy())) 60 | 61 | 62 | if __name__ == '__main__': 63 | try: 64 | app.run(main) 65 | except SystemExit: 66 | pass 67 | -------------------------------------------------------------------------------- /tools/visualize_dataset.py: -------------------------------------------------------------------------------- 1 | import time 2 | from absl import app, flags, logging 3 | from absl.flags import FLAGS 4 | import cv2 5 | import numpy as np 6 | import tensorflow as tf 7 | from yolov3_tf2.models import ( 8 | YoloV3, YoloV3Tiny 9 | ) 10 | from yolov3_tf2.dataset import load_tfrecord_dataset, transform_images 11 | from yolov3_tf2.utils import draw_outputs 12 | 13 | flags.DEFINE_string('classes', './data/coco.names', 'path to classes file') 14 | flags.DEFINE_integer('size', 416, 'resize images to') 15 | flags.DEFINE_string( 16 | 'dataset', './data/voc2012_train.tfrecord', 'path to dataset') 17 | flags.DEFINE_string('output', './output.jpg', 'path to output image') 18 | 19 | 20 | def main(_argv): 21 | class_names = [c.strip() for c in open(FLAGS.classes).readlines()] 22 | logging.info('classes loaded') 23 | 24 | dataset = load_tfrecord_dataset(FLAGS.dataset, FLAGS.classes, FLAGS.size) 25 | dataset = dataset.shuffle(512) 26 | 27 | for image, labels in dataset.take(1): 28 | boxes = [] 29 | scores = [] 30 | classes = [] 31 
|         for x1, y1, x2, y2, label in labels:
32 |             if x1 == 0 and x2 == 0:
33 |                 continue
34 |
35 |             boxes.append((x1, y1, x2, y2))
36 |             scores.append(1)
37 |             classes.append(label)
38 |         nums = [len(boxes)]
39 |         boxes = [boxes]
40 |         scores = [scores]
41 |         classes = [classes]
42 |
43 |         logging.info('labels:')
44 |         for i in range(nums[0]):
45 |             logging.info('\t{}, {}, {}'.format(class_names[int(classes[0][i])],
46 |                                                np.array(scores[0][i]),
47 |                                                np.array(boxes[0][i])))
48 |
49 |         img = cv2.cvtColor(image.numpy(), cv2.COLOR_RGB2BGR)
50 |         img = draw_outputs(img, (boxes, scores, classes, nums), class_names)
51 |         cv2.imwrite(FLAGS.output, img)
52 |         logging.info('output saved to: {}'.format(FLAGS.output))
53 |
54 |
55 | if __name__ == '__main__':
56 |     app.run(main)
57 |
--------------------------------------------------------------------------------
/tools/voc2012.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import hashlib
4 |
5 | from absl import app, flags, logging
6 | from absl.flags import FLAGS
7 | import tensorflow as tf
8 | import lxml.etree
9 | import tqdm
10 |
11 | flags.DEFINE_string('data_dir', './data/voc2012_raw/VOCdevkit/VOC2012/',
12 |                     'path to raw PASCAL VOC dataset')
13 | flags.DEFINE_enum('split', 'train', [
14 |     'train', 'val'], 'specify train or val split')
15 | flags.DEFINE_string('output_file', './data/voc2012_train.tfrecord', 'output dataset')
16 | flags.DEFINE_string('classes', './data/voc2012.names', 'classes file')
17 |
18 |
19 | def build_example(annotation, class_map):
20 |     img_path = os.path.join(
21 |         FLAGS.data_dir, 'JPEGImages', annotation['filename'])
22 |     img_raw = open(img_path, 'rb').read()
23 |     key = hashlib.sha256(img_raw).hexdigest()
24 |
25 |     width = int(annotation['size']['width'])
26 |     height = int(annotation['size']['height'])
27 |
28 |     xmin = []
29 |     ymin = []
30 |     xmax = []
31 |     ymax = []
32 |     classes = []
33 |     classes_text = []
34 |     truncated = []
35 |     views = []
36 |     difficult_obj = []
37 |     if 'object' in annotation:
38 |         for obj in annotation['object']:
39 |             difficult = bool(int(obj['difficult']))
40 |             difficult_obj.append(int(difficult))
41 |
42 |             xmin.append(float(obj['bndbox']['xmin']) / width)
43 |             ymin.append(float(obj['bndbox']['ymin']) / height)
44 |             xmax.append(float(obj['bndbox']['xmax']) / width)
45 |             ymax.append(float(obj['bndbox']['ymax']) / height)
46 |             classes_text.append(obj['name'].encode('utf8'))
47 |             classes.append(class_map[obj['name']])
48 |             truncated.append(int(obj['truncated']))
49 |             views.append(obj['pose'].encode('utf8'))
50 |
51 |     example = tf.train.Example(features=tf.train.Features(feature={
52 |         'image/height': tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
53 |         'image/width': tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
54 |         'image/filename': tf.train.Feature(bytes_list=tf.train.BytesList(value=[
55 |             annotation['filename'].encode('utf8')])),
56 |         'image/source_id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[
57 |             annotation['filename'].encode('utf8')])),
58 |         'image/key/sha256': tf.train.Feature(bytes_list=tf.train.BytesList(value=[key.encode('utf8')])),
59 |         'image/encoded': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_raw])),
60 |         'image/format': tf.train.Feature(bytes_list=tf.train.BytesList(value=['jpeg'.encode('utf8')])),
61 |         'image/object/bbox/xmin': tf.train.Feature(float_list=tf.train.FloatList(value=xmin)),
62 |         'image/object/bbox/xmax':
tf.train.Feature(float_list=tf.train.FloatList(value=xmax)), 63 | 'image/object/bbox/ymin': tf.train.Feature(float_list=tf.train.FloatList(value=ymin)), 64 | 'image/object/bbox/ymax': tf.train.Feature(float_list=tf.train.FloatList(value=ymax)), 65 | 'image/object/class/text': tf.train.Feature(bytes_list=tf.train.BytesList(value=classes_text)), 66 | 'image/object/class/label': tf.train.Feature(int64_list=tf.train.Int64List(value=classes)), 67 | 'image/object/difficult': tf.train.Feature(int64_list=tf.train.Int64List(value=difficult_obj)), 68 | 'image/object/truncated': tf.train.Feature(int64_list=tf.train.Int64List(value=truncated)), 69 | 'image/object/view': tf.train.Feature(bytes_list=tf.train.BytesList(value=views)), 70 | })) 71 | return example 72 | 73 | 74 | def parse_xml(xml): 75 | if not len(xml): 76 | return {xml.tag: xml.text} 77 | result = {} 78 | for child in xml: 79 | child_result = parse_xml(child) 80 | if child.tag != 'object': 81 | result[child.tag] = child_result[child.tag] 82 | else: 83 | if child.tag not in result: 84 | result[child.tag] = [] 85 | result[child.tag].append(child_result[child.tag]) 86 | return {xml.tag: result} 87 | 88 | 89 | def main(_argv): 90 | class_map = {name: idx for idx, name in enumerate( 91 | open(FLAGS.classes).read().splitlines())} 92 | logging.info("Class mapping loaded: %s", class_map) 93 | 94 | writer = tf.io.TFRecordWriter(FLAGS.output_file) 95 | image_list = open(os.path.join( 96 | FLAGS.data_dir, 'ImageSets', 'Main', '%s.txt' % FLAGS.split)).read().splitlines() 97 | logging.info("Image list loaded: %d", len(image_list)) 98 | for name in tqdm.tqdm(image_list): 99 | annotation_xml = os.path.join( 100 | FLAGS.data_dir, 'Annotations', name + '.xml') 101 | annotation_xml = lxml.etree.fromstring(open(annotation_xml).read()) 102 | annotation = parse_xml(annotation_xml)['annotation'] 103 | tf_example = build_example(annotation, class_map) 104 | writer.write(tf_example.SerializeToString()) 105 | writer.close() 106 | logging.info("Done") 107 | 108 | 109 | if __name__ == '__main__': 110 | app.run(main) 111 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from absl import app, flags, logging 2 | from absl.flags import FLAGS 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import cv2 7 | import time 8 | from tensorflow.keras.callbacks import ( 9 | ReduceLROnPlateau, 10 | EarlyStopping, 11 | ModelCheckpoint, 12 | TensorBoard 13 | ) 14 | from yolov3_tf2.models import ( 15 | YoloV3, YoloV3Tiny, YoloLoss, 16 | yolo_anchors, yolo_anchor_masks, 17 | yolo_tiny_anchors, yolo_tiny_anchor_masks 18 | ) 19 | from yolov3_tf2.utils import freeze_all 20 | import yolov3_tf2.dataset as dataset 21 | 22 | flags.DEFINE_string('dataset', '', 'path to dataset') 23 | flags.DEFINE_string('val_dataset', '', 'path to validation dataset') 24 | flags.DEFINE_boolean('tiny', False, 'yolov3 or yolov3-tiny') 25 | flags.DEFINE_string('weights', './checkpoints/yolov3.tf', 26 | 'path to weights file') 27 | flags.DEFINE_string('classes', './data/coco.names', 'path to classes file') 28 | flags.DEFINE_enum('mode', 'fit', ['fit', 'eager_fit', 'eager_tf'], 29 | 'fit: model.fit, ' 30 | 'eager_fit: model.fit(run_eagerly=True), ' 31 | 'eager_tf: custom GradientTape') 32 | flags.DEFINE_enum('transfer', 'none', 33 | ['none', 'darknet', 'no_output', 'frozen', 'fine_tune'], 34 | 'none: Training from scratch, ' 35 | 'darknet: Transfer darknet, ' 36 | 'no_output: 
Transfer all but output, ' 37 | 'frozen: Transfer and freeze all, ' 38 | 'fine_tune: Transfer all and freeze darknet only') 39 | flags.DEFINE_integer('size', 416, 'image size') 40 | flags.DEFINE_integer('epochs', 2, 'number of epochs') 41 | flags.DEFINE_integer('batch_size', 8, 'batch size') 42 | flags.DEFINE_float('learning_rate', 1e-3, 'learning rate') 43 | flags.DEFINE_integer('num_classes', 80, 'number of classes in the model') 44 | flags.DEFINE_integer('weights_num_classes', None, 'specify num class for `weights` file if different, ' 45 | 'useful in transfer learning with different number of classes') 46 | flags.DEFINE_boolean('multi_gpu', False, 'Use if wishing to train with more than 1 GPU.') 47 | 48 | 49 | def setup_model(): 50 | if FLAGS.tiny: 51 | model = YoloV3Tiny(FLAGS.size, training=True, 52 | classes=FLAGS.num_classes) 53 | anchors = yolo_tiny_anchors 54 | anchor_masks = yolo_tiny_anchor_masks 55 | else: 56 | model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes) 57 | anchors = yolo_anchors 58 | anchor_masks = yolo_anchor_masks 59 | 60 | # Configure the model for transfer learning 61 | if FLAGS.transfer == 'none': 62 | pass # Nothing to do 63 | elif FLAGS.transfer in ['darknet', 'no_output']: 64 | # Darknet transfer is a special case that works 65 | # with incompatible number of classes 66 | # reset top layers 67 | if FLAGS.tiny: 68 | model_pretrained = YoloV3Tiny( 69 | FLAGS.size, training=True, classes=FLAGS.weights_num_classes or FLAGS.num_classes) 70 | else: 71 | model_pretrained = YoloV3( 72 | FLAGS.size, training=True, classes=FLAGS.weights_num_classes or FLAGS.num_classes) 73 | model_pretrained.load_weights(FLAGS.weights) 74 | 75 | if FLAGS.transfer == 'darknet': 76 | model.get_layer('yolo_darknet').set_weights( 77 | model_pretrained.get_layer('yolo_darknet').get_weights()) 78 | freeze_all(model.get_layer('yolo_darknet')) 79 | elif FLAGS.transfer == 'no_output': 80 | for l in model.layers: 81 | if not l.name.startswith('yolo_output'): 82 | l.set_weights(model_pretrained.get_layer( 83 | l.name).get_weights()) 84 | freeze_all(l) 85 | else: 86 | # All other transfer require matching classes 87 | model.load_weights(FLAGS.weights) 88 | if FLAGS.transfer == 'fine_tune': 89 | # freeze darknet and fine tune other layers 90 | darknet = model.get_layer('yolo_darknet') 91 | freeze_all(darknet) 92 | elif FLAGS.transfer == 'frozen': 93 | # freeze everything 94 | freeze_all(model) 95 | 96 | optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate) 97 | loss = [YoloLoss(anchors[mask], classes=FLAGS.num_classes) 98 | for mask in anchor_masks] 99 | 100 | model.compile(optimizer=optimizer, loss=loss, 101 | run_eagerly=(FLAGS.mode == 'eager_fit')) 102 | 103 | return model, optimizer, loss, anchors, anchor_masks 104 | 105 | 106 | def main(_argv): 107 | physical_devices = tf.config.experimental.list_physical_devices('GPU') 108 | 109 | # Setup 110 | if FLAGS.multi_gpu: 111 | for physical_device in physical_devices: 112 | tf.config.experimental.set_memory_growth(physical_device, True) 113 | 114 | strategy = tf.distribute.MirroredStrategy() 115 | print('Number of devices: {}'.format(strategy.num_replicas_in_sync)) 116 | BATCH_SIZE = FLAGS.batch_size * strategy.num_replicas_in_sync 117 | FLAGS.batch_size = BATCH_SIZE 118 | 119 | with strategy.scope(): 120 | model, optimizer, loss, anchors, anchor_masks = setup_model() 121 | else: 122 | model, optimizer, loss, anchors, anchor_masks = setup_model() 123 | 124 | if FLAGS.dataset: 125 | train_dataset = 
dataset.load_tfrecord_dataset( 126 | FLAGS.dataset, FLAGS.classes, FLAGS.size) 127 | else: 128 | train_dataset = dataset.load_fake_dataset() 129 | train_dataset = train_dataset.shuffle(buffer_size=512) 130 | train_dataset = train_dataset.batch(FLAGS.batch_size) 131 | train_dataset = train_dataset.map(lambda x, y: ( 132 | dataset.transform_images(x, FLAGS.size), 133 | dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size))) 134 | train_dataset = train_dataset.prefetch( 135 | buffer_size=tf.data.experimental.AUTOTUNE) 136 | 137 | if FLAGS.val_dataset: 138 | val_dataset = dataset.load_tfrecord_dataset( 139 | FLAGS.val_dataset, FLAGS.classes, FLAGS.size) 140 | else: 141 | val_dataset = dataset.load_fake_dataset() 142 | val_dataset = val_dataset.batch(FLAGS.batch_size) 143 | val_dataset = val_dataset.map(lambda x, y: ( 144 | dataset.transform_images(x, FLAGS.size), 145 | dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size))) 146 | 147 | if FLAGS.mode == 'eager_tf': 148 | # Eager mode is great for debugging 149 | # Non eager graph mode is recommended for real training 150 | avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32) 151 | avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32) 152 | 153 | for epoch in range(1, FLAGS.epochs + 1): 154 | for batch, (images, labels) in enumerate(train_dataset): 155 | with tf.GradientTape() as tape: 156 | outputs = model(images, training=True) 157 | regularization_loss = tf.reduce_sum(model.losses) 158 | pred_loss = [] 159 | for output, label, loss_fn in zip(outputs, labels, loss): 160 | pred_loss.append(loss_fn(label, output)) 161 | total_loss = tf.reduce_sum(pred_loss) + regularization_loss 162 | 163 | grads = tape.gradient(total_loss, model.trainable_variables) 164 | optimizer.apply_gradients( 165 | zip(grads, model.trainable_variables)) 166 | 167 | logging.info("{}_train_{}, {}, {}".format( 168 | epoch, batch, total_loss.numpy(), 169 | list(map(lambda x: np.sum(x.numpy()), pred_loss)))) 170 | avg_loss.update_state(total_loss) 171 | 172 | for batch, (images, labels) in enumerate(val_dataset): 173 | outputs = model(images) 174 | regularization_loss = tf.reduce_sum(model.losses) 175 | pred_loss = [] 176 | for output, label, loss_fn in zip(outputs, labels, loss): 177 | pred_loss.append(loss_fn(label, output)) 178 | total_loss = tf.reduce_sum(pred_loss) + regularization_loss 179 | 180 | logging.info("{}_val_{}, {}, {}".format( 181 | epoch, batch, total_loss.numpy(), 182 | list(map(lambda x: np.sum(x.numpy()), pred_loss)))) 183 | avg_val_loss.update_state(total_loss) 184 | 185 | logging.info("{}, train: {}, val: {}".format( 186 | epoch, 187 | avg_loss.result().numpy(), 188 | avg_val_loss.result().numpy())) 189 | 190 | avg_loss.reset_states() 191 | avg_val_loss.reset_states() 192 | model.save_weights( 193 | 'checkpoints/yolov3_train_{}.tf'.format(epoch)) 194 | else: 195 | 196 | callbacks = [ 197 | ReduceLROnPlateau(verbose=1), 198 | EarlyStopping(patience=3, verbose=1), 199 | ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf', 200 | verbose=1, save_weights_only=True), 201 | TensorBoard(log_dir='logs') 202 | ] 203 | 204 | start_time = time.time() 205 | history = model.fit(train_dataset, 206 | epochs=FLAGS.epochs, 207 | callbacks=callbacks, 208 | validation_data=val_dataset) 209 | end_time = time.time() - start_time 210 | print(f'Total Training Time: {end_time}') 211 | 212 | 213 | if __name__ == '__main__': 214 | try: 215 | app.run(main) 216 | except SystemExit: 217 | pass 218 | 
--------------------------------------------------------------------------------
/yolov3_tf2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zzh8829/yolov3-tf2/2784050f2fc1ff060f4c8b3ac2db231370569aa8/yolov3_tf2/__init__.py
--------------------------------------------------------------------------------
/yolov3_tf2/dataset.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from absl.flags import FLAGS
3 | 
4 | @tf.function
5 | def transform_targets_for_output(y_true, grid_size, anchor_idxs):
6 |     # y_true: (N, boxes, (x1, y1, x2, y2, class, best_anchor))
7 |     N = tf.shape(y_true)[0]
8 | 
9 |     # y_true_out: (N, grid, grid, anchors, [x1, y1, x2, y2, obj, class])
10 |     y_true_out = tf.zeros(
11 |         (N, grid_size, grid_size, tf.shape(anchor_idxs)[0], 6))
12 | 
13 |     anchor_idxs = tf.cast(anchor_idxs, tf.int32)
14 | 
15 |     indexes = tf.TensorArray(tf.int32, 1, dynamic_size=True)
16 |     updates = tf.TensorArray(tf.float32, 1, dynamic_size=True)
17 |     idx = 0
18 |     for i in tf.range(N):
19 |         for j in tf.range(tf.shape(y_true)[1]):
20 |             if tf.equal(y_true[i][j][2], 0):  # zero-padded (empty) box rows
21 |                 continue
22 |             anchor_eq = tf.equal(
23 |                 anchor_idxs, tf.cast(y_true[i][j][5], tf.int32))
24 | 
25 |             if tf.reduce_any(anchor_eq):
26 |                 box = y_true[i][j][0:4]
27 |                 box_xy = (y_true[i][j][0:2] + y_true[i][j][2:4]) / 2
28 | 
29 |                 anchor_idx = tf.cast(tf.where(anchor_eq), tf.int32)
30 |                 grid_xy = tf.cast(box_xy // (1 / grid_size), tf.int32)
31 | 
32 |                 # grid[y][x][anchor] = (tx, ty, bw, bh, obj, class)
33 |                 indexes = indexes.write(
34 |                     idx, [i, grid_xy[1], grid_xy[0], anchor_idx[0][0]])
35 |                 updates = updates.write(
36 |                     idx, [box[0], box[1], box[2], box[3], 1, y_true[i][j][4]])
37 |                 idx += 1
38 | 
39 |     # tf.print(indexes.stack())
40 |     # tf.print(updates.stack())
41 | 
42 |     return tf.tensor_scatter_nd_update(
43 |         y_true_out, indexes.stack(), updates.stack())
44 | 
45 | 
46 | def transform_targets(y_train, anchors, anchor_masks, size):
47 |     y_outs = []
48 |     grid_size = size // 32
49 | 
50 |     # calculate the best anchor index for each true box (IoU on w/h only)
51 |     anchors = tf.cast(anchors, tf.float32)
52 |     anchor_area = anchors[..., 0] * anchors[..., 1]
53 |     box_wh = y_train[..., 2:4] - y_train[..., 0:2]
54 |     box_wh = tf.tile(tf.expand_dims(box_wh, -2),
55 |                      (1, 1, tf.shape(anchors)[0], 1))
56 |     box_area = box_wh[..., 0] * box_wh[..., 1]
57 |     intersection = tf.minimum(box_wh[..., 0], anchors[..., 0]) * \
58 |         tf.minimum(box_wh[..., 1], anchors[..., 1])
59 |     iou = intersection / (box_area + anchor_area - intersection)
60 |     anchor_idx = tf.cast(tf.argmax(iou, axis=-1), tf.float32)
61 |     anchor_idx = tf.expand_dims(anchor_idx, axis=-1)
62 | 
63 |     y_train = tf.concat([y_train, anchor_idx], axis=-1)
64 | 
65 |     for anchor_idxs in anchor_masks:
66 |         y_outs.append(transform_targets_for_output(
67 |             y_train, grid_size, anchor_idxs))
68 |         grid_size *= 2
69 | 
70 |     return tuple(y_outs)
71 | 
72 | 
73 | def transform_images(x_train, size):
74 |     x_train = tf.image.resize(x_train, (size, size))
75 |     x_train = x_train / 255
76 |     return x_train
77 | 
78 | 
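transform_targets above matches each ground-truth box to the single anchor with the highest IoU computed on width/height alone (both are treated as if centred at the same point). A small numeric sketch of that matching, using a hypothetical box:

import numpy as np
from yolov3_tf2.models import yolo_anchors

box_wh = np.array([0.4, 0.4])  # hypothetical box: 40% of the image per side
intersection = np.minimum(box_wh[0], yolo_anchors[:, 0]) * \
    np.minimum(box_wh[1], yolo_anchors[:, 1])
union = box_wh.prod() + yolo_anchors.prod(axis=-1) - intersection
print(np.argmax(intersection / union))  # -> 7, the (156, 198)/416 anchor

Index 7 falls in the first mask [6, 7, 8], so this box would be written into the coarse 13x13 target grid.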
79 | # https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/using_your_own_dataset.md#conversion-script-outline-conversion-script-outline
80 | # Commented-out fields are not required in our project
81 | IMAGE_FEATURE_MAP = {
82 |     # 'image/width': tf.io.FixedLenFeature([], tf.int64),
83 |     # 'image/height': tf.io.FixedLenFeature([], tf.int64),
84 |     # 'image/filename': tf.io.FixedLenFeature([], tf.string),
85 |     # 'image/source_id': tf.io.FixedLenFeature([], tf.string),
86 |     # 'image/key/sha256': tf.io.FixedLenFeature([], tf.string),
87 |     'image/encoded': tf.io.FixedLenFeature([], tf.string),
88 |     # 'image/format': tf.io.FixedLenFeature([], tf.string),
89 |     'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
90 |     'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
91 |     'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
92 |     'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
93 |     'image/object/class/text': tf.io.VarLenFeature(tf.string),
94 |     # 'image/object/class/label': tf.io.VarLenFeature(tf.int64),
95 |     # 'image/object/difficult': tf.io.VarLenFeature(tf.int64),
96 |     # 'image/object/truncated': tf.io.VarLenFeature(tf.int64),
97 |     # 'image/object/view': tf.io.VarLenFeature(tf.string),
98 | }
99 | 
100 | 
101 | def parse_tfrecord(tfrecord, class_table, size):
102 |     x = tf.io.parse_single_example(tfrecord, IMAGE_FEATURE_MAP)
103 |     x_train = tf.image.decode_jpeg(x['image/encoded'], channels=3)
104 |     x_train = tf.image.resize(x_train, (size, size))
105 | 
106 |     class_text = tf.sparse.to_dense(
107 |         x['image/object/class/text'], default_value='')
108 |     labels = tf.cast(class_table.lookup(class_text), tf.float32)
109 |     y_train = tf.stack([tf.sparse.to_dense(x['image/object/bbox/xmin']),
110 |                         tf.sparse.to_dense(x['image/object/bbox/ymin']),
111 |                         tf.sparse.to_dense(x['image/object/bbox/xmax']),
112 |                         tf.sparse.to_dense(x['image/object/bbox/ymax']),
113 |                         labels], axis=1)
114 | 
115 |     paddings = [[0, FLAGS.yolo_max_boxes - tf.shape(y_train)[0]], [0, 0]]  # assumes <= yolo_max_boxes boxes per image
116 |     y_train = tf.pad(y_train, paddings)
117 | 
118 |     return x_train, y_train
119 | 
120 | 
121 | def load_tfrecord_dataset(file_pattern, class_file, size=416):
122 |     LINE_NUMBER = -1  # TODO: use tf.lookup.TextFileIndex.LINE_NUMBER
123 |     class_table = tf.lookup.StaticHashTable(tf.lookup.TextFileInitializer(
124 |         class_file, tf.string, 0, tf.int64, LINE_NUMBER, delimiter="\n"), -1)
125 | 
126 |     files = tf.data.Dataset.list_files(file_pattern)
127 |     dataset = files.flat_map(tf.data.TFRecordDataset)
128 |     return dataset.map(lambda x: parse_tfrecord(x, class_table, size))
129 | 
130 | 
131 | def load_fake_dataset():
132 |     x_train = tf.image.decode_jpeg(
133 |         open('./data/girl.png', 'rb').read(), channels=3)
134 |     x_train = tf.expand_dims(x_train, axis=0)
135 | 
136 |     labels = [
137 |         [0.18494931, 0.03049111, 0.9435849, 0.96302897, 0],
138 |         [0.01586703, 0.35938117, 0.17582396, 0.6069674, 56],
139 |         [0.09158827, 0.48252046, 0.26967454, 0.6403017, 67]
140 |     ] + [[0, 0, 0, 0, 0]] * 5
141 |     y_train = tf.convert_to_tensor(labels, tf.float32)
142 |     y_train = tf.expand_dims(y_train, axis=0)
143 | 
144 |     return tf.data.Dataset.from_tensor_slices((x_train, y_train))
145 | 
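A minimal usage sketch for this module (the tfrecord path is hypothetical). Note that parse_tfrecord reads FLAGS.yolo_max_boxes, which is defined in yolov3_tf2.models, so the sketch imports that module and runs under app.run the way tools/visualize_dataset.py does:

from absl import app
from yolov3_tf2 import dataset
import yolov3_tf2.models  # registers the yolo_max_boxes flag


def main(_argv):
    ds = dataset.load_tfrecord_dataset(
        './data/voc_train.tfrecord', './data/voc2012.names', 416)
    for x, y in ds.take(1):
        print(x.shape, y.shape)  # (416, 416, 3) and (yolo_max_boxes, 5)


if __name__ == '__main__':
    app.run(main)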
--------------------------------------------------------------------------------
/yolov3_tf2/models.py:
--------------------------------------------------------------------------------
1 | from absl import flags
2 | from absl.flags import FLAGS
3 | import numpy as np
4 | import tensorflow as tf
5 | from tensorflow.keras import Model
6 | from tensorflow.keras.layers import (
7 |     Add,
8 |     Concatenate,
9 |     Conv2D,
10 |     Input,
11 |     Lambda,
12 |     LeakyReLU,
13 |     MaxPool2D,
14 |     UpSampling2D,
15 |     ZeroPadding2D,
16 |     BatchNormalization,
17 | )
18 | from tensorflow.keras.regularizers import l2
19 | from tensorflow.keras.losses import (
20 |     binary_crossentropy,
21 |     sparse_categorical_crossentropy
22 | )
23 | from .utils import broadcast_iou
24 | 
25 | flags.DEFINE_integer('yolo_max_boxes', 100,
26 |                      'maximum number of boxes per image')
27 | flags.DEFINE_float('yolo_iou_threshold', 0.5, 'iou threshold')
28 | flags.DEFINE_float('yolo_score_threshold', 0.5, 'score threshold')
29 | 
30 | yolo_anchors = np.array([(10, 13), (16, 30), (33, 23), (30, 61), (62, 45),
31 |                          (59, 119), (116, 90), (156, 198), (373, 326)],
32 |                         np.float32) / 416
33 | yolo_anchor_masks = np.array([[6, 7, 8], [3, 4, 5], [0, 1, 2]])
34 | 
35 | yolo_tiny_anchors = np.array([(10, 14), (23, 27), (37, 58),
36 |                               (81, 82), (135, 169), (344, 319)],
37 |                              np.float32) / 416
38 | yolo_tiny_anchor_masks = np.array([[3, 4, 5], [0, 1, 2]])
39 | 
40 | 
41 | def DarknetConv(x, filters, size, strides=1, batch_norm=True):
42 |     if strides == 1:
43 |         padding = 'same'
44 |     else:
45 |         x = ZeroPadding2D(((1, 0), (1, 0)))(x)  # top-left half-padding
46 |         padding = 'valid'
47 |     x = Conv2D(filters=filters, kernel_size=size,
48 |                strides=strides, padding=padding,
49 |                use_bias=not batch_norm, kernel_regularizer=l2(0.0005))(x)
50 |     if batch_norm:
51 |         x = BatchNormalization()(x)
52 |         x = LeakyReLU(alpha=0.1)(x)
53 |     return x
54 | 
55 | 
56 | def DarknetResidual(x, filters):
57 |     prev = x
58 |     x = DarknetConv(x, filters // 2, 1)
59 |     x = DarknetConv(x, filters, 3)
60 |     x = Add()([prev, x])
61 |     return x
62 | 
63 | 
64 | def DarknetBlock(x, filters, blocks):
65 |     x = DarknetConv(x, filters, 3, strides=2)
66 |     for _ in range(blocks):
67 |         x = DarknetResidual(x, filters)
68 |     return x
69 | 
70 | 
71 | def Darknet(name=None):
72 |     x = inputs = Input([None, None, 3])
73 |     x = DarknetConv(x, 32, 3)
74 |     x = DarknetBlock(x, 64, 1)
75 |     x = DarknetBlock(x, 128, 2)
76 |     x = x_36 = DarknetBlock(x, 256, 8)  # skip connection
77 |     x = x_61 = DarknetBlock(x, 512, 8)  # skip connection
78 |     x = DarknetBlock(x, 1024, 4)
79 |     return tf.keras.Model(inputs, (x_36, x_61, x), name=name)
80 | 
81 | 
82 | def DarknetTiny(name=None):
83 |     x = inputs = Input([None, None, 3])
84 |     x = DarknetConv(x, 16, 3)
85 |     x = MaxPool2D(2, 2, 'same')(x)
86 |     x = DarknetConv(x, 32, 3)
87 |     x = MaxPool2D(2, 2, 'same')(x)
88 |     x = DarknetConv(x, 64, 3)
89 |     x = MaxPool2D(2, 2, 'same')(x)
90 |     x = DarknetConv(x, 128, 3)
91 |     x = MaxPool2D(2, 2, 'same')(x)
92 |     x = x_8 = DarknetConv(x, 256, 3)  # skip connection
93 |     x = MaxPool2D(2, 2, 'same')(x)
94 |     x = DarknetConv(x, 512, 3)
95 |     x = MaxPool2D(2, 1, 'same')(x)
96 |     x = DarknetConv(x, 1024, 3)
97 |     return tf.keras.Model(inputs, (x_8, x), name=name)
98 | 
99 | 
100 | def YoloConv(filters, name=None):
101 |     def yolo_conv(x_in):
102 |         if isinstance(x_in, tuple):
103 |             inputs = Input(x_in[0].shape[1:]), Input(x_in[1].shape[1:])
104 |             x, x_skip = inputs
105 | 
106 |             # concat with skip connection
107 |             x = DarknetConv(x, filters, 1)
108 |             x = UpSampling2D(2)(x)
109 |             x = Concatenate()([x, x_skip])
110 |         else:
111 |             x = inputs = Input(x_in.shape[1:])
112 | 
113 |         x = DarknetConv(x, filters, 1)
114 |         x = DarknetConv(x, filters * 2, 3)
115 |         x = DarknetConv(x, filters, 1)
116 |         x = DarknetConv(x, filters * 2, 3)
117 |         x = DarknetConv(x, filters, 1)
118 |         return Model(inputs, x, name=name)(x_in)
119 |     return yolo_conv
120 | 
121 | 
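The backbone above taps two intermediate feature maps (x_36 and x_61) that YoloConv later merges back in through its upsample-and-concatenate path. A shape sketch for a 416 input:

import tensorflow as tf
from yolov3_tf2.models import Darknet

x_36, x_61, x = Darknet(name='yolo_darknet')(tf.zeros((1, 416, 416, 3)))
print(x_36.shape, x_61.shape, x.shape)
# -> (1, 52, 52, 256) (1, 26, 26, 512) (1, 13, 13, 1024)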
122 | def YoloConvTiny(filters, name=None):
123 |     def yolo_conv(x_in):
124 |         if isinstance(x_in, tuple):
125 |             inputs = Input(x_in[0].shape[1:]), Input(x_in[1].shape[1:])
126 |             x, x_skip = inputs
127 | 
128 |             # concat with skip connection
129 |             x = DarknetConv(x, filters, 1)
130 |             x = UpSampling2D(2)(x)
131 |             x = Concatenate()([x, x_skip])
132 |         else:
133 |             x = inputs = Input(x_in.shape[1:])
134 |             x = DarknetConv(x, filters, 1)
135 | 
136 |         return Model(inputs, x, name=name)(x_in)
137 |     return yolo_conv
138 | 
139 | 
140 | def YoloOutput(filters, anchors, classes, name=None):
141 |     def yolo_output(x_in):
142 |         x = inputs = Input(x_in.shape[1:])
143 |         x = DarknetConv(x, filters * 2, 3)
144 |         x = DarknetConv(x, anchors * (classes + 5), 1, batch_norm=False)
145 |         x = Lambda(lambda x: tf.reshape(x, (-1, tf.shape(x)[1], tf.shape(x)[2],
146 |                                             anchors, classes + 5)))(x)
147 |         return tf.keras.Model(inputs, x, name=name)(x_in)
148 |     return yolo_output
149 | 
150 | 
151 | # As TensorFlow Lite doesn't support tf.size, which tf.meshgrid uses,
152 | # we reimplement a simple meshgrid function with basic TF ops only.
153 | def _meshgrid(n_a, n_b):
154 | 
155 |     return [
156 |         tf.reshape(tf.tile(tf.range(n_a), [n_b]), (n_b, n_a)),
157 |         tf.reshape(tf.repeat(tf.range(n_b), n_a), (n_b, n_a))
158 |     ]
159 | 
160 | 
161 | def yolo_boxes(pred, anchors, classes):
162 |     # pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...classes))
163 |     grid_size = tf.shape(pred)[1:3]
164 |     box_xy, box_wh, objectness, class_probs = tf.split(
165 |         pred, (2, 2, 1, classes), axis=-1)
166 | 
167 |     box_xy = tf.sigmoid(box_xy)
168 |     objectness = tf.sigmoid(objectness)
169 |     class_probs = tf.sigmoid(class_probs)
170 |     pred_box = tf.concat((box_xy, box_wh), axis=-1)  # original xywh for loss
171 | 
172 |     # !!! grid[x][y] == (y, x)
173 |     grid = _meshgrid(grid_size[1], grid_size[0])
174 |     grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2)  # [gx, gy, 1, 2]
175 | 
176 |     box_xy = (box_xy + tf.cast(grid, tf.float32)) / \
177 |         tf.cast(grid_size, tf.float32)
178 |     box_wh = tf.exp(box_wh) * anchors
179 | 
180 |     box_x1y1 = box_xy - box_wh / 2
181 |     box_x2y2 = box_xy + box_wh / 2
182 |     bbox = tf.concat([box_x1y1, box_x2y2], axis=-1)
183 | 
184 |     return bbox, objectness, class_probs, pred_box
185 | 
186 | 
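yolo_boxes implements the standard YOLOv3 decode: the centre is sigmoid(t_xy) plus the cell offset, normalised by the grid size, and the size is the anchor scaled by exp(t_wh). A worked one-cell example (plain NumPy, not part of models.py):

import numpy as np

tx = ty = tw = th = 0.0                # raw network outputs for one anchor
cx = cy = 6                            # grid cell on a 13x13 grid
anchor_wh = np.array([116, 90]) / 416  # one of yolo_anchors

sigmoid = lambda v: 1 / (1 + np.exp(-v))
bx = (sigmoid(tx) + cx) / 13           # -> 0.5, the image centre
by = (sigmoid(ty) + cy) / 13           # -> 0.5
bw, bh = np.exp([tw, th]) * anchor_wh  # -> exactly the anchor size
print(bx, by, bw, bh)

Step 3 of YoloLoss below applies the inverse of these equations to the ground truth.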
187 | def yolo_nms(outputs, anchors, masks, classes):
188 |     # boxes, conf, type
189 |     b, c, t = [], [], []
190 | 
191 |     for o in outputs:
192 |         b.append(tf.reshape(o[0], (tf.shape(o[0])[0], -1, tf.shape(o[0])[-1])))
193 |         c.append(tf.reshape(o[1], (tf.shape(o[1])[0], -1, tf.shape(o[1])[-1])))
194 |         t.append(tf.reshape(o[2], (tf.shape(o[2])[0], -1, tf.shape(o[2])[-1])))
195 | 
196 |     bbox = tf.concat(b, axis=1)
197 |     confidence = tf.concat(c, axis=1)
198 |     class_probs = tf.concat(t, axis=1)
199 | 
200 |     # If we only have one class, do not multiply by class_prob (always 0.5)
201 |     if classes == 1:
202 |         scores = confidence
203 |     else:
204 |         scores = confidence * class_probs
205 | 
206 |     dscores = tf.squeeze(scores, axis=0)  # NOTE: assumes batch size 1
207 |     scores = tf.reduce_max(dscores, axis=1)
208 |     bbox = tf.reshape(bbox, (-1, 4))
209 |     classes = tf.argmax(dscores, axis=1)
210 |     selected_indices, selected_scores = tf.image.non_max_suppression_with_scores(
211 |         boxes=bbox,
212 |         scores=scores,
213 |         max_output_size=FLAGS.yolo_max_boxes,
214 |         iou_threshold=FLAGS.yolo_iou_threshold,
215 |         score_threshold=FLAGS.yolo_score_threshold,
216 |         soft_nms_sigma=0.5
217 |     )
218 | 
219 |     num_valid_nms_boxes = tf.shape(selected_indices)[0]
220 | 
221 |     selected_indices = tf.concat([selected_indices, tf.zeros(FLAGS.yolo_max_boxes - num_valid_nms_boxes, tf.int32)], 0)  # pad to a fixed output size
222 |     selected_scores = tf.concat([selected_scores, tf.zeros(FLAGS.yolo_max_boxes - num_valid_nms_boxes, tf.float32)], -1)
223 | 
224 |     boxes = tf.gather(bbox, selected_indices)
225 |     boxes = tf.expand_dims(boxes, axis=0)
226 |     scores = selected_scores
227 |     scores = tf.expand_dims(scores, axis=0)
228 |     classes = tf.gather(classes, selected_indices)
229 |     classes = tf.expand_dims(classes, axis=0)
230 |     valid_detections = num_valid_nms_boxes
231 |     valid_detections = tf.expand_dims(valid_detections, axis=0)
232 | 
233 |     return boxes, scores, classes, valid_detections
234 | 
235 | 
236 | def YoloV3(size=None, channels=3, anchors=yolo_anchors,
237 |            masks=yolo_anchor_masks, classes=80, training=False):
238 |     x = inputs = Input([size, size, channels], name='input')
239 | 
240 |     x_36, x_61, x = Darknet(name='yolo_darknet')(x)
241 | 
242 |     x = YoloConv(512, name='yolo_conv_0')(x)
243 |     output_0 = YoloOutput(512, len(masks[0]), classes, name='yolo_output_0')(x)
244 | 
245 |     x = YoloConv(256, name='yolo_conv_1')((x, x_61))
246 |     output_1 = YoloOutput(256, len(masks[1]), classes, name='yolo_output_1')(x)
247 | 
248 |     x = YoloConv(128, name='yolo_conv_2')((x, x_36))
249 |     output_2 = YoloOutput(128, len(masks[2]), classes, name='yolo_output_2')(x)
250 | 
251 |     if training:
252 |         return Model(inputs, (output_0, output_1, output_2), name='yolov3')
253 | 
254 |     boxes_0 = Lambda(lambda x: yolo_boxes(x, anchors[masks[0]], classes),
255 |                      name='yolo_boxes_0')(output_0)
256 |     boxes_1 = Lambda(lambda x: yolo_boxes(x, anchors[masks[1]], classes),
257 |                      name='yolo_boxes_1')(output_1)
258 |     boxes_2 = Lambda(lambda x: yolo_boxes(x, anchors[masks[2]], classes),
259 |                      name='yolo_boxes_2')(output_2)
260 | 
261 |     outputs = Lambda(lambda x: yolo_nms(x, anchors, masks, classes),
262 |                      name='yolo_nms')((boxes_0[:3], boxes_1[:3], boxes_2[:3]))
263 | 
264 |     return Model(inputs, outputs, name='yolov3')
265 | 
266 | 
267 | def YoloV3Tiny(size=None, channels=3, anchors=yolo_tiny_anchors,
268 |                masks=yolo_tiny_anchor_masks, classes=80, training=False):
269 |     x = inputs = Input([size, size, channels], name='input')
270 | 
271 |     x_8, x = DarknetTiny(name='yolo_darknet')(x)
272 | 
273 |     x = YoloConvTiny(256, name='yolo_conv_0')(x)
274 |     output_0 = YoloOutput(256, len(masks[0]), classes, name='yolo_output_0')(x)
275 | 
276 |     x = YoloConvTiny(128, name='yolo_conv_1')((x, x_8))
277 |     output_1 = YoloOutput(128, len(masks[1]), classes, name='yolo_output_1')(x)
278 | 
279 |     if training:
280 |         return Model(inputs, (output_0, output_1), name='yolov3_tiny')
281 | 
282 |     boxes_0 = Lambda(lambda x: yolo_boxes(x, anchors[masks[0]], classes),
283 |                      name='yolo_boxes_0')(output_0)
284 |     boxes_1 = Lambda(lambda x: yolo_boxes(x, anchors[masks[1]], classes),
285 |                      name='yolo_boxes_1')(output_1)
286 |     outputs = Lambda(lambda x: yolo_nms(x, anchors, masks, classes),
287 |                      name='yolo_nms')((boxes_0[:3], boxes_1[:3]))
288 |     return Model(inputs, outputs, name='yolov3_tiny')
289 | 
290 | 
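An end-to-end inference sketch using the inference-mode graph above (the checkpoint path is an assumption; run under app.run, as detect.py does, so the yolo_* flags used by yolo_nms are parsed):

from absl import app
import tensorflow as tf
from yolov3_tf2.models import YoloV3
from yolov3_tf2.dataset import transform_images


def main(_argv):
    yolo = YoloV3(classes=80)
    yolo.load_weights('./checkpoints/yolov3.tf')  # assumed converted weights
    img = tf.image.decode_image(
        open('./data/street.jpg', 'rb').read(), channels=3)
    img = transform_images(tf.expand_dims(img, 0), 416)
    boxes, scores, classes, nums = yolo(img)
    print('detections:', int(nums[0]))


if __name__ == '__main__':
    app.run(main)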
291 | def YoloLoss(anchors, classes=80, ignore_thresh=0.5):
292 |     def yolo_loss(y_true, y_pred):
293 |         # 1. transform all pred outputs
294 |         # y_pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls))
295 |         pred_box, pred_obj, pred_class, pred_xywh = yolo_boxes(
296 |             y_pred, anchors, classes)
297 |         pred_xy = pred_xywh[..., 0:2]
298 |         pred_wh = pred_xywh[..., 2:4]
299 | 
300 |         # 2. transform all true outputs
301 |         # y_true: (batch_size, grid, grid, anchors, (x1, y1, x2, y2, obj, cls))
302 |         true_box, true_obj, true_class_idx = tf.split(
303 |             y_true, (4, 1, 1), axis=-1)
304 |         true_xy = (true_box[..., 0:2] + true_box[..., 2:4]) / 2
305 |         true_wh = true_box[..., 2:4] - true_box[..., 0:2]
306 | 
307 |         # give higher weights to small boxes
308 |         box_loss_scale = 2 - true_wh[..., 0] * true_wh[..., 1]
309 | 
310 |         # 3. invert the pred box equations
311 |         grid_size = tf.shape(y_true)[1]
312 |         grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size))
313 |         grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2)
314 |         true_xy = true_xy * tf.cast(grid_size, tf.float32) - \
315 |             tf.cast(grid, tf.float32)
316 |         true_wh = tf.math.log(true_wh / anchors)
317 |         true_wh = tf.where(tf.math.is_inf(true_wh),
318 |                            tf.zeros_like(true_wh), true_wh)
319 | 
320 |         # 4. calculate all masks
321 |         obj_mask = tf.squeeze(true_obj, -1)
322 |         # ignore false positives when the best IoU is over the threshold
323 |         best_iou = tf.map_fn(
324 |             lambda x: tf.reduce_max(broadcast_iou(x[0], tf.boolean_mask(
325 |                 x[1], tf.cast(x[2], tf.bool))), axis=-1),
326 |             (pred_box, true_box, obj_mask),
327 |             tf.float32)
328 |         ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32)
329 | 
330 |         # 5. calculate all losses
331 |         xy_loss = obj_mask * box_loss_scale * \
332 |             tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1)
333 |         wh_loss = obj_mask * box_loss_scale * \
334 |             tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1)
335 |         obj_loss = binary_crossentropy(true_obj, pred_obj)
336 |         obj_loss = obj_mask * obj_loss + \
337 |             (1 - obj_mask) * ignore_mask * obj_loss
338 |         # TODO: use binary_crossentropy instead
339 |         class_loss = obj_mask * sparse_categorical_crossentropy(
340 |             true_class_idx, pred_class)
341 | 
342 |         # 6. sum over (batch, gridx, gridy, anchors) => (batch, 1)
343 |         xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3))
344 |         wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3))
345 |         obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3))
346 |         class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3))
347 | 
348 |         return xy_loss + wh_loss + obj_loss + class_loss
349 |     return yolo_loss
350 | 
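A shape sketch for YoloLoss (one instance per output scale, as wired up in train.py's setup_model): y_true follows the layout produced by transform_targets, y_pred follows the layout produced by the training-mode model, and the result is one scalar loss per batch element.

import tensorflow as tf
from yolov3_tf2.models import YoloLoss, yolo_anchors, yolo_anchor_masks

loss_fn = YoloLoss(yolo_anchors[yolo_anchor_masks[0]], classes=80)
y_true = tf.zeros((2, 13, 13, 3, 6))   # (x1, y1, x2, y2, obj, class)
y_pred = tf.zeros((2, 13, 13, 3, 85))  # (x, y, w, h, obj, 80 class channels)
print(loss_fn(y_true, y_pred).shape)   # -> (2,)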
--------------------------------------------------------------------------------
/yolov3_tf2/utils.py:
--------------------------------------------------------------------------------
1 | from absl import logging
2 | import numpy as np
3 | import tensorflow as tf
4 | import cv2
5 | 
6 | YOLOV3_LAYER_LIST = [
7 |     'yolo_darknet',
8 |     'yolo_conv_0',
9 |     'yolo_output_0',
10 |     'yolo_conv_1',
11 |     'yolo_output_1',
12 |     'yolo_conv_2',
13 |     'yolo_output_2',
14 | ]
15 | 
16 | YOLOV3_TINY_LAYER_LIST = [
17 |     'yolo_darknet',
18 |     'yolo_conv_0',
19 |     'yolo_output_0',
20 |     'yolo_conv_1',
21 |     'yolo_output_1',
22 | ]
23 | 
24 | 
25 | def load_darknet_weights(model, weights_file, tiny=False):
26 |     wf = open(weights_file, 'rb')
27 |     major, minor, revision, seen, _ = np.fromfile(wf, dtype=np.int32, count=5)
28 | 
29 |     if tiny:
30 |         layers = YOLOV3_TINY_LAYER_LIST
31 |     else:
32 |         layers = YOLOV3_LAYER_LIST
33 | 
34 |     for layer_name in layers:
35 |         sub_model = model.get_layer(layer_name)
36 |         for i, layer in enumerate(sub_model.layers):
37 |             if not layer.name.startswith('conv2d'):
38 |                 continue
39 |             batch_norm = None
40 |             if i + 1 < len(sub_model.layers) and \
41 |                     sub_model.layers[i + 1].name.startswith('batch_norm'):
42 |                 batch_norm = sub_model.layers[i + 1]
43 | 
44 |             logging.info("{}/{} {}".format(
45 |                 sub_model.name, layer.name, 'bn' if batch_norm else 'bias'))
46 | 
47 |             filters = layer.filters
48 |             size = layer.kernel_size[0]
49 |             in_dim = layer.get_input_shape_at(0)[-1]
50 | 
51 |             if batch_norm is None:
52 |                 conv_bias = np.fromfile(wf, dtype=np.float32, count=filters)
53 |             else:
54 |                 # darknet stores [beta, gamma, mean, variance]
55 |                 bn_weights = np.fromfile(
56 |                     wf, dtype=np.float32, count=4 * filters)
57 |                 # tf expects [gamma, beta, mean, variance]
58 |                 bn_weights = bn_weights.reshape((4, filters))[[1, 0, 2, 3]]
59 | 
60 |             # darknet shape (out_dim, in_dim, height, width)
61 |             conv_shape = (filters, in_dim, size, size)
62 |             conv_weights = np.fromfile(
63 |                 wf, dtype=np.float32, count=np.prod(conv_shape))
64 |             # tf shape (height, width, in_dim, out_dim)
65 |             conv_weights = conv_weights.reshape(
66 |                 conv_shape).transpose([2, 3, 1, 0])
67 | 
68 |             if batch_norm is None:
69 |                 layer.set_weights([conv_weights, conv_bias])
70 |             else:
71 |                 layer.set_weights([conv_weights])
72 |                 batch_norm.set_weights(bn_weights)
73 | 
74 |     assert len(wf.read()) == 0, 'failed to read all data'
75 |     wf.close()
76 | 
77 | 
78 | def broadcast_iou(box_1, box_2):
79 |     # box_1: (..., (x1, y1, x2, y2))
80 |     # box_2: (N, (x1, y1, x2, y2))
81 | 
82 |     # broadcast boxes
83 |     box_1 = tf.expand_dims(box_1, -2)
84 |     box_2 = tf.expand_dims(box_2, 0)
85 |     # new_shape: (..., N, (x1, y1, x2, y2))
86 |     new_shape = tf.broadcast_dynamic_shape(tf.shape(box_1), tf.shape(box_2))
87 |     box_1 = tf.broadcast_to(box_1, new_shape)
88 |     box_2 = tf.broadcast_to(box_2, new_shape)
89 | 
90 |     int_w = tf.maximum(tf.minimum(box_1[..., 2], box_2[..., 2]) -
91 |                        tf.maximum(box_1[..., 0], box_2[..., 0]), 0)
92 |     int_h = tf.maximum(tf.minimum(box_1[..., 3], box_2[..., 3]) -
93 |                        tf.maximum(box_1[..., 1], box_2[..., 1]), 0)
94 |     int_area = int_w * int_h
95 |     box_1_area = (box_1[..., 2] - box_1[..., 0]) * \
96 |         (box_1[..., 3] - box_1[..., 1])
97 |     box_2_area = (box_2[..., 2] - box_2[..., 0]) * \
98 |         (box_2[..., 3] - box_2[..., 1])
99 |     return int_area / (box_1_area + box_2_area - int_area)
100 | 
101 | 
102 | def draw_outputs(img, outputs, class_names):
103 |     boxes, objectness, classes, nums = outputs
104 |     boxes, objectness, classes, nums = boxes[0], objectness[0], classes[0], nums[0]
105 |     wh = np.flip(img.shape[0:2])
106 |     for i in range(nums):
107 |         x1y1 = tuple((np.array(boxes[i][0:2]) * wh).astype(np.int32))
108 |         x2y2 = tuple((np.array(boxes[i][2:4]) * wh).astype(np.int32))
109 |         img = cv2.rectangle(img, x1y1, x2y2, (255, 0, 0), 2)
110 |         img = cv2.putText(img, '{} {:.4f}'.format(
111 |             class_names[int(classes[i])], objectness[i]),
112 |             x1y1, cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
113 |     return img
114 | 
115 | 
116 | def draw_labels(x, y, class_names):
117 |     img = x.numpy()
118 |     boxes, classes = tf.split(y, (4, 1), axis=-1)
119 |     classes = classes[..., 0]
120 |     wh = np.flip(img.shape[0:2])
121 |     for i in range(len(boxes)):
122 |         x1y1 = tuple((np.array(boxes[i][0:2]) * wh).astype(np.int32))
123 |         x2y2 = tuple((np.array(boxes[i][2:4]) * wh).astype(np.int32))
124 |         img = cv2.rectangle(img, x1y1, x2y2, (255, 0, 0), 2)
125 |         img = cv2.putText(img, class_names[int(classes[i])],
126 |                           x1y1, cv2.FONT_HERSHEY_COMPLEX_SMALL,
127 |                           1, (0, 0, 255), 2)
128 |     return img
129 | 
130 | 
131 | def freeze_all(model, frozen=True):
132 |     model.trainable = not frozen
133 |     if isinstance(model, tf.keras.Model):
134 |         for l in model.layers:
135 |             freeze_all(l, frozen)
136 | 
--------------------------------------------------------------------------------
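Finally, a conversion sketch tying utils.load_darknet_weights together with the models module, mirroring what convert.py does (the .weights path is an assumption; run under app.run so the yolo_* flags defined in models.py are parsed):

from absl import app
from yolov3_tf2.models import YoloV3
from yolov3_tf2.utils import load_darknet_weights


def main(_argv):
    yolo = YoloV3(classes=80)
    load_darknet_weights(yolo, './data/yolov3.weights')  # assumed weights file
    yolo.save_weights('./checkpoints/yolov3.tf')


if __name__ == '__main__':
    app.run(main)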