├── .gitignore ├── LICENSE ├── README.md ├── detr_tf ├── bbox.py ├── data │ ├── __init__.py │ ├── coco.py │ ├── processing.py │ ├── tfcsv.py │ ├── transformation.py │ └── voc.py ├── inference.py ├── logger │ ├── training_logging.py │ └── wandb_logging.py ├── loss │ ├── compute_map.py │ ├── hungarian_matching.py │ └── loss.py ├── networks │ ├── custom_layers.py │ ├── detr.py │ ├── position_embeddings.py │ ├── resnet_backbone.py │ ├── transformer.py │ └── weights.py ├── optimizers.py ├── training.py └── training_config.py ├── eval.py ├── finetune_coco.py ├── finetune_hardhat.py ├── finetune_voc.py ├── images ├── datasetsupport.png ├── detr-figure.png ├── hardhatdataset.jpg ├── tutorials │ ├── data-pipeline.png │ └── download_hardhat_dataset.png ├── wandb_logging.png └── webcam_detr.png ├── notebooks ├── DETR Tensorflow - Finetuning tutorial.ipynb ├── DETR Tensorflow - How to setup a custom dataset.ipynb └── How to load a dataset.ipynb ├── requirements.txt ├── train_coco.py └── webcam_inference.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | wandb 10 | weights 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Visual-Behavior 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DETR : End-to-End Object Detection with Transformers (Tensorflow) 2 | 3 | Tensorflow implementation of DETR : Object Detection with Transformers, including code for inference, training, and finetuning. DETR is a promising model that brings widely adopted transformers to vision models. We believe that models based on convolution and transformers will soon become the default choice for most practitioners because of the simplicity of the training procedure: NMS and anchors free! Therefore this repository is a step toward making this type of architecture widely available. 4 | 5 | * [1. Install](#install) 6 | * [2. Datasets](#datasets) 7 | * [3. Tutorials](#tutorials) 8 | * [4. Finetuning](#finetuning) 9 | * [5. Training](#training) 10 | * [5. inference](#inference) 11 | * [6. Acknowledgement](#acknowledgement) 12 | 13 | 14 | DETR paper: https://arxiv.org/pdf/2005.12872.pdf
15 | Torch implementation: https://github.com/facebookresearch/detr
16 |
17 |
18 |
19 | About this implementation: This repository includes code to run inference with the original model's weights (based on the PyTorch weights), to train the model from scratch (multi-GPU training support coming soon) as well as examples to finetune the model on your own dataset. Unlike the PyTorch implementation, the training uses fixed image sizes and a standard Adam optimizer with gradient norm clipping.
20 |
21 | Additionally, our logging system is based on https://www.wandb.com/ so that you can get a great visualization of your model performance!
22 |
23 | - Check out our logging board with the reports here: https://wandb.ai/thibault-neveu/detr-tensorflow-log
24 |
25 |
26 |
27 | ## Install
28 |
29 | The code is currently tested with Tensorflow 2.3.0 and Python 3.7. The following dependencies are required.
30 |
31 |
32 | ```
33 | wandb
34 | matplotlib
35 | numpy
36 | pycocotools
37 | scikit-image
38 | imageio
39 | pandas
40 | ```
41 |
42 | ```
43 | pip install -r requirements.txt
44 | ```
45 |
46 |
47 |
48 | ## Datasets
49 |
50 |
51 | This repository currently supports three dataset formats: **COCO**, **VOC**, and **Tensorflow Object detection csv**. The easiest way to get started is to set up your dataset based on one of these formats. Along with the datasets, we provide a code example to finetune your model.
52 | Finally, we provide Jupyter notebooks to help you understand how to load a dataset, set up a custom dataset, and finetune your model.
53 |
54 |
55 |
56 | ## Tutorials
57 |
58 | To get started with the repository, you can check the following Jupyter notebooks:
59 |
60 | - ✍ [DETR Tensorflow - How to load a dataset.ipynb](https://github.com/Visual-Behavior/detr-tensorflow/blob/main/notebooks/How%20to%20load%20a%20dataset.ipynb)
61 | - ✍ [DETR Tensorflow - Finetuning tutorial.ipynb](https://github.com/Visual-Behavior/detr-tensorflow/blob/main/notebooks/DETR%20Tensorflow%20-%20%20Finetuning%20tutorial.ipynb)
62 | - ✍ [DETR Tensorflow - How to setup a custom dataset.ipynb](https://github.com/Visual-Behavior/detr-tensorflow/blob/main/notebooks/DETR%20Tensorflow%20-%20%20How%20to%20setup%20a%20custom%20dataset.ipynb)
63 |
64 | As well as the logging board on wandb https://wandb.ai/thibault-neveu/detr-tensorflow-log and this report:
65 |
66 | - 🚀 [Finetuning DETR on Tensorflow - A step by step guide](https://wandb.ai/thibault-neveu/detr-tensorflow-log/reports/Finetuning-DETR-on-Tensorflow-A-step-by-step-tutorial--VmlldzozOTYyNzQ)
67 |
68 |
69 | ## Evaluation
70 |
71 | Run the following to evaluate the model using the pre-trained weights.
72 | - **data_dir** is your coco dataset folder
73 | - **img_dir** is the image folder relative to the data_dir
74 | - **ann_file** is the validation annotation file relative to the data_dir
75 |
76 | Check out ✍ [DETR Tensorflow - How to load a dataset.ipynb](https://github.com/Visual-Behavior/detr-tensorflow/blob/main/notebooks/How%20to%20load%20a%20dataset.ipynb) for more information about the supported datasets and their usage.
77 |
78 | ```
79 | python eval.py --data_dir /path/to/coco/dataset --img_dir val2017 --ann_file annotations/instances_val2017.json
80 | ```
81 |
82 | Outputs:
83 |
84 | ```
85 | | all | .50 | .55 | .60 | .65 | .70 | .75 | .80 | .85 | .90 | .95 |
86 | -------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
87 | box | 36.53 | 55.38 | 53.13 | 50.46 | 47.11 | 43.07 | 38.11 | 32.10 | 25.01 | 16.20 | 4.77 |
88 | mask | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
89 | -------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
90 |
91 | ```
92 |
93 | The result is not the same as the one reported in the paper because the evaluation is run on the original image size and not on the larger resized images. The official implementation resizes the image so that the shorter side is at least 800 pixels and the longer side at most 1333 pixels.
94 |
95 |
96 | ## Finetuning
97 |
98 | To fine-tune the model on a new dataset, we simply need to set the number of classes to detect in the new dataset (**nb_class**). The method will remove the last layers that predict the box class and position and add new layers to finetune.
99 |
100 | ```python
101 | # Load the pretrained model
102 | detr = get_detr_model(config, include_top=False, nb_class=3, weights="detr", num_decoder_layers=6, num_encoder_layers=6)
103 | detr.summary()
104 |
105 | # Load your dataset
106 | train_dt, class_names = load_tfcsv_dataset(config, config.batch_size, augmentation=True)
107 |
108 | # Setup the optimizers and the trainable variables
109 | optimizers = setup_optimizers(detr, config)
110 |
111 | # Train the model
112 | training.fit(detr, train_dt, optimizers, config, epoch_nb, class_names)
113 | ```
114 | The following commands give some examples of finetuning the model on new datasets (Pascal VOC and the Hard Hat dataset), with a real ```batch_size``` of 8 and a virtual ```target_batch``` size (gradient aggregation) of 32. ```--log``` is used to log the training to wandb.
115 |
116 | - **data_dir** is your voc dataset folder
117 | - **img_dir** is the image folder relative to the data_dir
118 | - **ann_dir** is the annotation folder relative to the data_dir
119 |
120 | ```
121 | python finetune_voc.py --data_dir /home/thibault/data/VOCdevkit/VOC2012 --img_dir JPEGImages --ann_dir Annotations --batch_size 8 --target_batch 32 --log
122 |
123 | ```
124 | - **data_dir** is the hardhatcsv dataset folder
125 | - **img_dir** and **ann_file** are set in the training file to load the training and validation splits differently
126 |
127 | Check out ✍ [DETR Tensorflow - How to load a dataset.ipynb](https://github.com/Visual-Behavior/detr-tensorflow/blob/main/notebooks/How%20to%20load%20a%20dataset.ipynb) for more information about the supported datasets and their usage.
128 |
129 | ```
130 | python finetune_hardhat.py --data_dir /home/thibault/data/hardhat --batch_size 8 --target_batch 32 --log
131 | ```
132 |
133 | ## Training
134 |
135 | (Multi-GPU training coming soon)
136 |
137 |
138 | - **data_dir** is the coco dataset folder
139 | - **img_dir** and **ann_file** are set in the training file to load the training and validation splits differently.
140 |
141 | ```
142 | python train_coco.py --data_dir /path/to/COCO --batch_size 8 --target_batch 32 --log
143 | ```
144 |
145 | ## Inference
146 |
147 | Here is an example of running inference with the model on your webcam:
148 |
149 | ```
150 | python webcam_inference.py
151 | ```
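You can also run the model on a single image programmatically. Below is a minimal sketch using the helpers defined in `detr_tf` (`processing.normalized_images`, `get_model_inference`, `numpy_bbox_to_image`); the config class name, the import path of `get_detr_model` and the image path are assumptions you should adapt to your setup (see `webcam_inference.py` for the reference implementation). It assumes the model returns the usual output dict with `"pred_logits"` and `"pred_boxes"`.

```python
import imageio
import numpy as np

from detr_tf.training_config import TrainingConfig  # assumed config class name
from detr_tf.networks.detr import get_detr_model    # assumed import path
from detr_tf.data import processing, COCO_CLASS_NAME
from detr_tf.inference import get_model_inference, numpy_bbox_to_image

config = TrainingConfig()
# Index of the "no object" class for the pretrained COCO weights (see detr_tf/data/coco.py)
config.background_class = 91

# Load the pretrained DETR with its original classification head
detr = get_detr_model(config, include_top=True, weights="detr")

# Load and normalize one image (example path);
# resizing to config.image_size may be needed depending on your setup
image = imageio.imread("images/test.jpg")
model_input = processing.normalized_images(image, config)

# Forward pass and filtering of the predicted queries
m_outputs = detr(np.expand_dims(model_input, axis=0), training=False)
p_bbox, p_labels, p_scores = get_model_inference(m_outputs, config.background_class, bbox_format="xy_center")

# Draw the predictions on the image
result = numpy_bbox_to_image(model_input, np.array(p_bbox), labels=np.array(p_labels),
                             scores=np.array(p_scores), class_name=COCO_CLASS_NAME, config=config)
```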
152 |
153 |
154 |
155 |
156 | ## Acknowledgement
157 |
158 | The pretrained weights of this model come from the original Facebook repository https://github.com/facebookresearch/detr and were made available in Tensorflow in this repository: https://github.com/Leonardo-Blanger/detr_tensorflow
159 |
--------------------------------------------------------------------------------
/detr_tf/bbox.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is used to define all the functions related to the manipulation
3 | and comparison of bbox
4 | """
5 |
6 | from typing import Union,Dict,Tuple
7 | import matplotlib.pyplot as plt
8 | import tensorflow as tf
9 | import numpy as np
10 | import random
11 | import cv2
12 |
13 |
14 | def bbox_xcycwh_to_x1y1x2y2(bbox_xcycwh: np.array):
15 |     """
16 |     Convert a list of bbox from [xc, yc, w, h] to [x1, y1, x2, y2]
17 |     @bbox_xcycwh: [[xc, yc, w, h], ...]
18 |     Returns: [[x1, y1, x2, y2], ...] cast to int32 (pixel coordinates)
19 |     """
20 |     bbox_x1y1x2y2 = np.zeros_like((bbox_xcycwh))
21 |     bbox_x1y1x2y2[:,0] = bbox_xcycwh[:,0] - (bbox_xcycwh[:,2] / 2)
22 |     bbox_x1y1x2y2[:,2] = bbox_xcycwh[:,0] + (bbox_xcycwh[:,2] / 2)
23 |     bbox_x1y1x2y2[:,1] = bbox_xcycwh[:,1] - (bbox_xcycwh[:,3] / 2)
24 |     bbox_x1y1x2y2[:,3] = bbox_xcycwh[:,1] + (bbox_xcycwh[:,3] / 2)
25 |     bbox_x1y1x2y2 = bbox_x1y1x2y2.astype(np.int32)
26 |     return bbox_x1y1x2y2
27 |
28 |
29 | def intersect(box_a: tf.Tensor, box_b: tf.Tensor) -> tf.Tensor:
30 |     """
31 |     Compute the intersection area between two sets of boxes.
32 |     Args:
33 |         box_a: A (tf.Tensor) list of bbox (a, 4) with a the number of bbox
34 |         box_b: A (tf.Tensor) list of bbox (b, 4) with b the number of bbox
35 |     Returns:
36 |         The intersection area [a, b] between each bbox. Zero if no intersection.
37 |     """
38 |     # resize both tensors to [A,B,2] with the tile function to compare
39 |     # each bbox with the anchors:
40 |     # [A,2] -> [A,1,2] -> [A,B,2]
41 |     # [B,2] -> [1,B,2] -> [A,B,2]
42 |     # Then we compute the area of intersect between box_a and box_b.
43 |     # box_a: (tensor) bounding boxes, Shape: [A, 4].
44 |     # box_b: (tensor) bounding boxes, Shape: [B, 4].
45 |     # Return: (tensor) intersection area, Shape: [A, B].
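    # For example, with box_a of shape (3, 4) and box_b of shape (5, 4), the two
    # tiled corner tensors below both have shape (3, 5, 2), so element [i, j]
    # compares box_a[i] against box_b[j].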
46 | 47 | A = tf.shape(box_a)[0] # Number of possible bbox 48 | B = tf.shape(box_b)[0] # Number of anchors 49 | 50 | #print(A, B, box_a.shape, box_b.shape) 51 | # Above Right Corner of Intersect Area 52 | # (b, A, 2) -> (b, A, B, 2) 53 | tiled_box_a_xymax = tf.tile(tf.expand_dims(box_a[:, 2:], axis=1), [1, B, 1]) 54 | # (b, B, 2) -> (b, A, B, 2) 55 | tiled_box_b_xymax = tf.tile(tf.expand_dims(box_b[:, 2:], axis=0), [A, 1, 1]) 56 | # Select the lower right corner of the intersect area 57 | above_right_corner = tf.math.minimum(tiled_box_a_xymax, tiled_box_b_xymax) 58 | 59 | 60 | # Upper Left Corner of Intersect Area 61 | # (b, A, 2) -> (b, A, B, 2) 62 | tiled_box_a_xymin = tf.tile(tf.expand_dims(box_a[:, :2], axis=1), [1, B, 1]) 63 | # (b, B, 2) -> (b, A, B, 2) 64 | tiled_box_b_xymin = tf.tile(tf.expand_dims(box_b[:, :2], axis=0), [A, 1, 1]) 65 | # Select the lower right corner of the intersect area 66 | upper_left_corner = tf.math.maximum(tiled_box_a_xymin, tiled_box_b_xymin) 67 | 68 | 69 | # If there is some intersection, both must be > 0 70 | inter = tf.nn.relu(above_right_corner - upper_left_corner) 71 | inter = inter[:, :, 0] * inter[:, :, 1] 72 | return inter 73 | 74 | 75 | def jaccard(box_a: tf.Tensor, box_b: tf.Tensor, return_union=False) -> tf.Tensor: 76 | """ 77 | Compute the IoU of two sets of boxes. 78 | Args: 79 | box_a: A (tf.Tensor) list a bbox (a, 4) with a the number of bbox 80 | box_b: A (tf.Tensor) list a bbox (b, 4) with b the number of bbox 81 | Returns: 82 | The Jaccard overlap [a, b] between each bbox 83 | """ 84 | # Get the intersectin area 85 | inter = intersect(box_a, box_b) 86 | 87 | # Compute the A area 88 | # (xmax - xmin) * (ymax - ymin) 89 | area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1]) 90 | # Tile the area to match the anchors area 91 | area_a = tf.tile(tf.expand_dims(area_a, axis=-1), [1, tf.shape(inter)[-1]]) 92 | 93 | # Compute the B area 94 | # (xmax - xmin) * (ymax - ymin) 95 | area_b = (box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1]) 96 | # Tile the area to match the gt areas 97 | area_b = tf.tile(tf.expand_dims(area_b, axis=-2), [tf.shape(inter)[-2], 1]) 98 | 99 | union = area_a + area_b - inter 100 | 101 | if return_union is False: 102 | # Return the intesect over union 103 | return inter / union 104 | else: 105 | return inter / union, union 106 | 107 | def merge(box_a: tf.Tensor, box_b: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: 108 | """ 109 | Merged two set of boxes so that operations ca be run to compare them 110 | Args: 111 | box_a: A (tf.Tensor) list a bbox (a, 4) with a the number of bbox 112 | box_b: A (tf.Tensor) list a bbox (b, 4) with b the number of bbox 113 | Returns: 114 | Return the two same tensor tiled: (a, b, 4) 115 | """ 116 | A = tf.shape(box_a)[0] # Number of bbox in box_a 117 | B = tf.shape(box_b)[0] # Number of bbox in box b 118 | # Above Right Corner of Intersect Area 119 | # (b, A, 2) -> (b, A, B, 2) 120 | tiled_box_a = tf.tile(tf.expand_dims(box_a, axis=1), [1, B, 1]) 121 | # (b, B, 2) -> (b, A, B, 2) 122 | tiled_box_b = tf.tile(tf.expand_dims(box_b, axis=0), [A, 1, 1]) 123 | 124 | return tiled_box_a, tiled_box_b 125 | 126 | def xy_min_xy_max_to_yx_min_yx_max(bbox: tf.Tensor) -> tf.Tensor: 127 | """ 128 | Convert bbox from shape [xmin, ymin, xmax, ymax] to [ymin, xmin, ymax, xmax] 129 | Args: 130 | bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert 131 | Returns: 132 | The converted bbox 133 | """ 134 | return tf.concat([ 135 | bbox[:,1:2], 136 | bbox[:,0:1], 137 | 
bbox[:,3:4], 138 | bbox[:,2:3] 139 | ], axis=-1) 140 | 141 | def yx_min_yx_max_to_xy_min_xy_max(bbox: tf.Tensor) -> tf.Tensor: 142 | """ 143 | Convert bbox from shape [ymin, xmin, ymax, xmax] to [xmin, ymin, xmax, ymax] 144 | Args: 145 | bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert 146 | Returns: 147 | The converted bbox 148 | """ 149 | return tf.concat([ 150 | bbox[:,1:2], 151 | bbox[:,0:1], 152 | bbox[:,3:4], 153 | bbox[:,2:3] 154 | ], axis=-1) 155 | 156 | 157 | def xy_min_xy_max_to_xcycwh(bbox: tf.Tensor) -> tf.Tensor: 158 | """ 159 | Convert bbox from shape [xmin, ymin, xmax, ymax] to [xc, yc, w, h] 160 | Args: 161 | bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert 162 | Returns: 163 | The converted bbox 164 | """ 165 | # convert the bbox from [xmin, ymin, xmax, ymax] to [x_center, y_center, w, h] 166 | bbox_xcycwh = tf.concat([bbox[:, :2] + ((bbox[:, 2:] - bbox[:, :2]) / 2), bbox[:, 2:] - bbox[:, :2]], axis=-1) 167 | return bbox_xcycwh 168 | 169 | 170 | 171 | def xcycwh_to_xy_min_xy_max(bbox: tf.Tensor) -> tf.Tensor: 172 | """ 173 | Convert bbox from shape [xc, yc, w, h] to [xmin, ymin, xmax, ymax] 174 | Args: 175 | bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert 176 | Returns: 177 | The converted bbox 178 | """ 179 | # convert the bbox from [xc, yc, w, h] to [xmin, ymin, xmax, ymax]. 180 | bbox_xyxy = tf.concat([bbox[:, :2] - (bbox[:, 2:] / 2), bbox[:, :2] + (bbox[:, 2:] / 2)], axis=-1) 181 | # Be sure to keep the values btw 0 and 1 182 | bbox_xyxy = tf.clip_by_value(bbox_xyxy, 0.0, 1.0) 183 | return bbox_xyxy 184 | 185 | 186 | def xcycwh_to_yx_min_yx_max(bbox: tf.Tensor) -> tf.Tensor: 187 | """ 188 | Convert bbox from shape [xc, yc, w, h] to [ymin, xmin, ymax, xmax] 189 | Args: 190 | bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert 191 | Returns: 192 | The converted bbox 193 | """ 194 | bbox = xcycwh_to_xy_min_xy_max(bbox) 195 | bbox = xy_min_xy_max_to_yx_min_yx_max(bbox) 196 | return bbox 197 | 198 | 199 | def yx_min_yx_max_to_xcycwh(bbox: tf.Tensor) -> tf.Tensor: 200 | """ 201 | Convert bbox from shape [ymin, xmin, ymax, xmax] to [xc, yc, w, h] 202 | Args: 203 | bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert 204 | Returns: 205 | The converted bbox 206 | """ 207 | bbox = yx_min_yx_max_to_xy_min_xy_max(bbox) 208 | bbox = xy_min_xy_max_to_xcycwh(bbox) 209 | return bbox 210 | 211 | 212 | 213 | """ 214 | Numpy Transformations 215 | """ 216 | 217 | def xy_min_xy_max_to_xcycwh(bbox: np.array) -> np.array: 218 | """ 219 | Convert bbox from shape [xmin, ymin, xmax, ymax] to [xc, yc, w, h] 220 | Args: 221 | bbox A (np.array) list a bbox (n, 4) with n the number of bbox to convert 222 | Returns: 223 | The converted bbox 224 | """ 225 | # convert the bbox from [xmin, ymin, xmax, ymax] to [x_center, y_center, w, h] 226 | bbox_xcycwh = np.concatenate([bbox[:, :2] + ((bbox[:, 2:] - bbox[:, :2]) / 2), bbox[:, 2:] - bbox[:, :2]], axis=-1) 227 | return bbox_xcycwh 228 | 229 | 230 | def np_xcycwh_to_xy_min_xy_max(bbox: np.array) -> np.array: 231 | """ 232 | Convert bbox from shape [xc, yc, w, h] to [xmin, ymin, xmax, ymax] 233 | Args: 234 | bbox A (tf.Tensor) list a bbox (n, 4) with n the number of bbox to convert 235 | Returns: 236 | The converted bbox 237 | """ 238 | # convert the bbox from [xc, yc, w, h] to [xmin, ymin, xmax, ymax]. 
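    # i.e. [x_min, y_min] = [xc, yc] - [w, h] / 2 and [x_max, y_max] = [xc, yc] + [w, h] / 2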
239 | bbox_xy = np.concatenate([bbox[:, :2] - (bbox[:, 2:] / 2), bbox[:, :2] + (bbox[:, 2:] / 2)], axis=-1) 240 | return bbox_xy 241 | 242 | 243 | 244 | def np_yx_min_yx_max_to_xy_min_xy_max(bbox: np.array) -> np.array: 245 | """ 246 | Convert bbox from shape [ymin, xmin, ymax, xmax] to [xmin, ymin, xmax, ymax] 247 | Args: 248 | bbox A (np.array) list a bbox (n, 4) with n the number of bbox to convert 249 | Returns: 250 | The converted bbox 251 | """ 252 | return np.concatenate([ 253 | bbox[:,1:2], 254 | bbox[:,0:1], 255 | bbox[:,3:4], 256 | bbox[:,2:3] 257 | ], axis=-1) 258 | 259 | 260 | 261 | def np_rescale_bbox_xcycwh(bbox_xcycwh: np.array, img_size: tuple): 262 | """ 263 | Rescale a list of bbox to the image size 264 | @bbox_xcycwh: [[xc, yc, w, h], ...] 265 | @img_size (height, width) 266 | """ 267 | bbox_xcycwh = np.array(bbox_xcycwh) # Be sure to work with a numpy array 268 | scale = np.array([img_size[1], img_size[0], img_size[1], img_size[0]]) 269 | bbox_xcycwh_rescaled = bbox_xcycwh * scale 270 | return bbox_xcycwh_rescaled 271 | 272 | 273 | def np_rescale_bbox_yx_min_yx_max(bbox_xcycwh: np.array, img_size: tuple): 274 | """ 275 | Rescale a list of bbox to the image size 276 | @bbox_xcycwh: [[y_min, x_min, y_max, x_max], ...] 277 | @img_size (height, width) 278 | """ 279 | bbox_xcycwh = np.array(bbox_xcycwh) # Be sure to work with a numpy array 280 | scale = np.array([img_size[0], img_size[1], img_size[0], img_size[1]]) 281 | bbox_xcycwh_rescaled = bbox_xcycwh * scale 282 | return bbox_xcycwh_rescaled 283 | 284 | 285 | def np_rescale_bbox_xy_min_xy_max(bbox: np.array, img_size: tuple): 286 | """ 287 | Rescale a list of bbox to the image size 288 | @bbox: [[x_min, y_min, x_max, y_max], ...] 289 | @img_size (height, width) 290 | """ 291 | bbox = np.array(bbox) # Be sure to work with a numpy array 292 | scale = np.array([img_size[1], img_size[0], img_size[1], img_size[0]]) 293 | bbox_rescaled = bbox * scale 294 | return bbox_rescaled 295 | 296 | -------------------------------------------------------------------------------- /detr_tf/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco import load_coco_dataset, COCO_CLASS_NAME 2 | from .voc import load_voc_dataset 3 | from .tfcsv import load_tfcsv_dataset 4 | #import processing 5 | #import transformation -------------------------------------------------------------------------------- /detr_tf/data/coco.py: -------------------------------------------------------------------------------- 1 | from pycocotools.coco import COCO 2 | import tensorflow as tf 3 | import numpy as np 4 | import imageio 5 | from skimage.color import gray2rgb 6 | from random import sample, shuffle 7 | import os 8 | 9 | from . import transformation 10 | from . 
import processing 11 | import matplotlib.pyplot as plt 12 | 13 | COCO_CLASS_NAME = [ 14 | 'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 15 | 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 16 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 17 | 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 18 | 'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 19 | 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 20 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 21 | 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 22 | 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 23 | 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 24 | 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 25 | 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 26 | 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 27 | 'toothbrush', "back" 28 | ] 29 | 30 | def get_coco_labels(coco, img_id, image_shape, augmentation): 31 | # Load the labels the instances 32 | ann_ids = coco.getAnnIds(imgIds=img_id) 33 | anns = coco.loadAnns(ann_ids) 34 | # Setup bbox 35 | bbox = [] 36 | t_class = [] 37 | crowd_bbox = 0 38 | for a, ann in enumerate(anns): 39 | bbox_x, bbox_y, bbox_w, bbox_h = ann['bbox'] 40 | # target class 41 | t_cls = ann["category_id"] 42 | if ann["iscrowd"]: 43 | crowd_bbox = 1 44 | # Convert bbox to xc, yc, w, h formast 45 | x_center = bbox_x + (bbox_w / 2) 46 | y_center = bbox_y + (bbox_h / 2) 47 | x_center = x_center / float(image_shape[1]) 48 | y_center = y_center / float(image_shape[0]) 49 | bbox_w = bbox_w / float(image_shape[1]) 50 | bbox_h = bbox_h / float(image_shape[0]) 51 | # Add bbox and class 52 | bbox.append([x_center, y_center, bbox_w, bbox_h]) 53 | t_class.append([t_cls]) 54 | # Set bbox header 55 | bbox = np.array(bbox) 56 | t_class = np.array(t_class) 57 | return bbox.astype(np.float32), t_class.astype(np.int32), crowd_bbox 58 | 59 | 60 | def get_coco_from_id(coco_id, coco, augmentation, config, img_dir): 61 | # Load imag 62 | img = coco.loadImgs([coco_id])[0] 63 | # Load image 64 | #data_type = "train2017" if train_val == "train" else "val2017" 65 | filne_name = img['file_name'] 66 | image_path = os.path.join(img_dir, filne_name) #f"{config.}/{data_type}/{filne_name}" 67 | image = imageio.imread(image_path) 68 | # Graycale to RGB if needed 69 | if len(image.shape) == 2: image = gray2rgb(image) 70 | # Retrieve the image label 71 | t_bbox, t_class, is_crowd = get_coco_labels(coco, img['id'], image.shape, augmentation) 72 | # Apply augmentations 73 | if len(t_bbox) > 0 and augmentation is not None: 74 | image, t_bbox, t_class = transformation.detr_transform(image, t_bbox, t_class, config, augmentation) 75 | # Normalized images 76 | image = processing.normalized_images(image, config) 77 | # Set type for tensorflow 78 | image = image.astype(np.float32) 79 | t_bbox = t_bbox.astype(np.float32) 80 | t_class = t_class.astype(np.int64) 81 | is_crowd = np.array(is_crowd, dtype=np.int64) 82 | return image, t_bbox, t_class, is_crowd 83 | 84 | 85 | def load_coco_dataset(config, batch_size, augmentation=False, ann_dir=None, ann_file=None, img_dir=None): 86 | """ Load a coco dataset 87 | """ 88 | ann_dir = config.data.ann_dir if ann_dir is None else ann_dir 89 | ann_file = config.data.ann_file if ann_file is None else ann_file 90 | img_dir = 
config.data.img_dir if img_dir is None else img_dir 91 | 92 | 93 | 94 | coco = COCO(ann_file) 95 | 96 | # Extract CLASS names 97 | cats = coco.loadCats(coco.getCatIds()) 98 | # Get the max class ID 99 | max_id = np.array([cat["id"] for cat in cats]).max() 100 | class_names = ["N/A"] * (max_id + 2) # + 2 for the background class 101 | # Add the backgrund class at the end 102 | class_names[-1] = "back" 103 | config.background_class = max_id + 1 104 | for cat in cats: 105 | class_names[cat["id"]] = cat["name"] 106 | 107 | # Setup the data pipeline 108 | img_ids = coco.getImgIds() 109 | shuffle(img_ids) 110 | dataset = tf.data.Dataset.from_tensor_slices(img_ids) 111 | # Shuffle the dataset 112 | dataset = dataset.shuffle(1000) 113 | # Retrieve img and labels 114 | outputs_types=(tf.float32, tf.float32, tf.int64, tf.int64) 115 | dataset = dataset.map(lambda idx: processing.numpy_fc( 116 | idx, get_coco_from_id, outputs_types=outputs_types, coco=coco, augmentation=augmentation, config=config, img_dir=img_dir) 117 | , num_parallel_calls=tf.data.experimental.AUTOTUNE) 118 | dataset = dataset.filter(lambda imgs, tbbox, tclass, iscrowd: tf.shape(tbbox)[0] > 0 and iscrowd != 1) 119 | dataset = dataset.map(lambda imgs, tbbox, tclass, iscrowd: (imgs, tbbox, tclass), num_parallel_calls=tf.data.experimental.AUTOTUNE) 120 | 121 | # Pad bbox and labels 122 | dataset = dataset.map(processing.pad_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE) 123 | 124 | dataset = dataset.batch(batch_size, drop_remainder=True) 125 | dataset = dataset.prefetch(32) 126 | 127 | return dataset, class_names -------------------------------------------------------------------------------- /detr_tf/data/processing.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | 6 | def normalized_images(image, config): 7 | """ Normalized images. torch_resnet is used on finetuning 8 | since the weights are based on the original paper training code 9 | from pytorch. tf_resnet is used when training from scratch with a 10 | resnet50 traine don tensorflow. 11 | """ 12 | if config.normalized_method == "torch_resnet": 13 | channel_avg = np.array([0.485, 0.456, 0.406]) 14 | channel_std = np.array([0.229, 0.224, 0.225]) 15 | image = (image / 255.0 - channel_avg) / channel_std 16 | return image.astype(np.float32) 17 | elif config.normalized_method == "tf_resnet": 18 | mean = [103.939, 116.779, 123.68] 19 | image = image[..., ::-1] 20 | image = image - mean 21 | return image.astype(np.float32) 22 | else: 23 | raise Exception("Can't handler thid normalized method") 24 | 25 | 26 | def numpy_fc(idx, fc, outputs_types=(tf.float32, tf.float32, tf.int64), **params): 27 | """ 28 | Call a numpy function on each given ID (`idx`) and load the associated image and labels (bbbox and cls) 29 | """ 30 | def _np_function(_idx): 31 | return fc(_idx, **params) 32 | return tf.numpy_function(_np_function, [idx], outputs_types) 33 | 34 | 35 | def pad_labels(images: tf.Tensor, t_bbox: tf.Tensor, t_class: tf.Tensor): 36 | """ Pad the bbox by adding [0, 0, 0, 0] at the end 37 | and one header to indicate how maby bbox are set. 38 | Do the same with the labels. 
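        For example, with 3 boxes the padded output (fixed size of 100 rows) is:
            t_bbox[0]   = [3, 0, 0, 0]   (header: number of real boxes)
            t_bbox[1:4] = the 3 boxes as [xc, yc, w, h]
            t_bbox[4:]  = zero padding
        t_class gets a [0] header followed by the 3 labels and zero padding.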
39 | """ 40 | nb_bbox = tf.shape(t_bbox)[0] 41 | 42 | bbox_header = tf.expand_dims(nb_bbox, axis=0) 43 | bbox_header = tf.expand_dims(bbox_header, axis=0) 44 | bbox_header = tf.pad(bbox_header, [[0, 0], [0, 3]]) 45 | bbox_header = tf.cast(bbox_header, tf.float32) 46 | cls_header = tf.constant([[0]], dtype=tf.int64) 47 | 48 | # Padd bbox and class 49 | t_bbox = tf.pad(t_bbox, [[0, 100 - 1 - nb_bbox], [0, 0]], mode='CONSTANT', constant_values=0) 50 | t_class = tf.pad(t_class, [[0, 100 - 1 - nb_bbox], [0, 0]], mode='CONSTANT', constant_values=0) 51 | 52 | t_bbox = tf.concat([bbox_header, t_bbox], axis=0) 53 | t_class = tf.concat([cls_header, t_class], axis=0) 54 | 55 | return images, t_bbox, t_class 56 | 57 | -------------------------------------------------------------------------------- /detr_tf/data/tfcsv.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from random import shuffle 3 | import pandas as pd 4 | import numpy as np 5 | import imageio 6 | import os 7 | 8 | from .import processing 9 | from .transformation import detr_transform 10 | from .. import bbox 11 | 12 | def load_data_from_index(index, class_names, filenames, anns, config, augmentation, img_dir): 13 | # Open the image 14 | image = imageio.imread(os.path.join(config.data.data_dir, img_dir, filenames[index])) 15 | # Select all the annotatiom (bbox and class) on this image 16 | image_anns = anns[anns["filename"] == filenames[index]] 17 | 18 | # Convert all string class to number (the target class) 19 | t_class = image_anns["class"].map(lambda x: class_names.index(x)).to_numpy() 20 | # Select the width&height of each image (should be the same since all the ann belongs to the same image) 21 | width = image_anns["width"].to_numpy() 22 | height = image_anns["height"].to_numpy() 23 | # Select the xmin, ymin, xmax and ymax of each bbox, Then, normalized the bbox to be between and 0 and 1 24 | # Finally, convert the bbox from xmin,ymin,xmax,ymax to x_center,y_center,width,height 25 | bbox_list = image_anns[["xmin", "ymin", "xmax", "ymax"]].to_numpy() 26 | bbox_list = bbox_list / [width[0], height[0], width[0], height[0]] 27 | t_bbox = bbox.xy_min_xy_max_to_xcycwh(bbox_list) 28 | 29 | # Transform and augment image with bbox and class if needed 30 | image, t_bbox, t_class = detr_transform(image, t_bbox, t_class, config, augmentation=augmentation) 31 | 32 | # Normalized image 33 | image = processing.normalized_images(image, config) 34 | 35 | return image.astype(np.float32), t_bbox.astype(np.float32), np.expand_dims(t_class, axis=-1).astype(np.int64) 36 | 37 | 38 | def load_tfcsv_dataset(config, batch_size, augmentation=False, exclude=[], ann_dir=None, ann_file=None, img_dir=None): 39 | """ Load the hardhat dataset 40 | """ 41 | ann_dir = config.data.ann_dir if ann_dir is None else ann_dir 42 | ann_file = config.data.ann_file if ann_file is None else ann_file 43 | img_dir = config.data.img_dir if img_dir is None else img_dir 44 | 45 | anns = pd.read_csv(os.path.join(config.data.data_dir, ann_file)) 46 | for name in exclude: 47 | anns = anns[anns["class"] != name] 48 | 49 | unique_class = anns["class"].unique() 50 | unique_class.sort() 51 | 52 | 53 | # Set the background class to 0 54 | config.background_class = 0 55 | class_names = ["background"] + unique_class.tolist() 56 | 57 | 58 | filenames = anns["filename"].unique().tolist() 59 | indexes = list(range(0, len(filenames))) 60 | shuffle(indexes) 61 | 62 | dataset = tf.data.Dataset.from_tensor_slices(indexes) 63 | 
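    # processing.numpy_fc wraps the pure-Python loader (pandas + imageio) in
    # tf.numpy_function so it can run inside the tf.data pipeline below.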
dataset = dataset.map(lambda idx: processing.numpy_fc( 64 | idx, load_data_from_index, 65 | class_names=class_names, filenames=filenames, anns=anns, config=config, augmentation=augmentation, img_dir=img_dir) 66 | ,num_parallel_calls=tf.data.experimental.AUTOTUNE) 67 | 68 | 69 | # Filter labels to be sure to keep only sample with at least one bbox 70 | dataset = dataset.filter(lambda imgs, tbbox, tclass: tf.shape(tbbox)[0] > 0) 71 | # Pad bbox and labels 72 | dataset = dataset.map(processing.pad_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE) 73 | # Batch images 74 | dataset = dataset.batch(batch_size, drop_remainder=True) 75 | 76 | return dataset, class_names 77 | 78 | -------------------------------------------------------------------------------- /detr_tf/data/transformation.py: -------------------------------------------------------------------------------- 1 | import imageio 2 | import imgaug as ia 3 | import imgaug.augmenters as iaa 4 | import numpy as np 5 | 6 | from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage 7 | from imgaug.augmentables.segmaps import SegmentationMapsOnImage 8 | 9 | import tensorflow as tf 10 | 11 | def bbox_xcyc_wh_to_imgaug_bbox(bbox, target_class, height, width): 12 | 13 | img_aug_bbox = [] 14 | 15 | for b in range(0, len(bbox)): 16 | bbox_xcyc_wh = bbox[b] 17 | # Convert size form 0.1 to height/width 18 | bbox_xcyc_wh = [ 19 | bbox_xcyc_wh[0] * width, 20 | bbox_xcyc_wh[1] * height, 21 | bbox_xcyc_wh[2] * width, 22 | bbox_xcyc_wh[3] * height 23 | ] 24 | x1 = bbox_xcyc_wh[0] - (bbox_xcyc_wh[2] / 2) 25 | x2 = bbox_xcyc_wh[0] + (bbox_xcyc_wh[2] / 2) 26 | y1 = bbox_xcyc_wh[1] - (bbox_xcyc_wh[3] / 2) 27 | y2 = bbox_xcyc_wh[1] + (bbox_xcyc_wh[3] / 2) 28 | 29 | n_bbox = BoundingBox(x1=x1, y1=y1, x2=x2, y2=y2, label=target_class[b]) 30 | 31 | img_aug_bbox.append(n_bbox) 32 | img_aug_bbox 33 | return img_aug_bbox 34 | 35 | 36 | def prepare_aug_inputs(image, bbox, t_class): 37 | 38 | images_batch = [] 39 | bbox_batch = [] 40 | 41 | images_batch.append(image) 42 | 43 | # Create the Imgaug bbox 44 | bbs_original = bbox_xcyc_wh_to_imgaug_bbox(bbox, t_class, image.shape[0], image.shape[1]) 45 | bbs_original = BoundingBoxesOnImage(bbs_original, shape=image.shape) 46 | bbox_batch.append(bbs_original) 47 | 48 | for i in range(len(images_batch)): 49 | images_batch[i] = images_batch[i].astype(np.uint8) 50 | 51 | return images_batch, bbox_batch 52 | 53 | 54 | def detr_aug_seq(image, config, augmenation): 55 | 56 | 57 | sometimes = lambda aug: iaa.Sometimes(0.5, aug) 58 | 59 | target_min_side_size = 480 60 | 61 | # According to the paper 62 | min_side_min = 480 63 | min_side_max = 800 64 | max_side_max = 1333 65 | 66 | image_size = config.image_size 67 | if augmenation: 68 | 69 | seq = iaa.Sequential([ 70 | iaa.Fliplr(0.5), # horizontal flips 71 | sometimes(iaa.OneOf([ 72 | # Resize complety the image 73 | iaa.Resize({"width": image_size[1], "height": image_size[0]}, interpolation=ia.ALL), 74 | # Crop into the image 75 | iaa.CropToFixedSize(image_size[1], image_size[0]), 76 | # Affine transform 77 | iaa.Affine( 78 | scale={"x": (0.5, 1.5), "y": (0.5, 1.5)}, 79 | ) 80 | ])), 81 | # Be sure to resize to the target image size 82 | iaa.Resize({"width": image_size[1], "height": image_size[0]}, interpolation=ia.ALL) 83 | ], random_order=False) # apply augmenters in random order 84 | 85 | return seq 86 | 87 | else: 88 | 89 | seq = iaa.Sequential([ 90 | # Be sure to resize to the target image size 91 | iaa.Resize({"width": image_size[1], "height": 
image_size[0]}) 92 | ], random_order=False) # apply augmenters in random order 93 | 94 | return seq 95 | 96 | """ Mode paper evaluation 97 | # Evaluation mode, we took the largest min side the model is trained on 98 | target_min_side_size = 480 99 | image_min_side = min(float(image.shape[0]), float(image.shape[1])) 100 | image_max_side = max(float(image.shape[0]), float(image.shape[1])) 101 | 102 | min_side_scaling = target_min_side_size / image_min_side 103 | max_side_scaling = max_side_max / image_max_side 104 | scaling = min(min_side_scaling, max_side_scaling) 105 | 106 | n_height = int(scaling * image.shape[0]) 107 | n_width = int(scaling * image.shape[1]) 108 | 109 | seq = iaa.Sequential([ 110 | iaa.Resize({"height": n_height, "width": n_width}), 111 | ]) 112 | """ 113 | 114 | return seq 115 | 116 | 117 | def imgaug_bbox_to_xcyc_wh(bbs_aug, height, width): 118 | 119 | bbox_xcyc_wh = [] 120 | t_class = [] 121 | 122 | nb_bbox = 0 123 | 124 | for b, bbox in enumerate(bbs_aug): 125 | 126 | h = bbox.y2 - bbox.y1 127 | w = bbox.x2 - bbox.x1 128 | xc = bbox.x1 + (w/2) 129 | yc = bbox.y1 + (h/2) 130 | 131 | assert bbox.label != None 132 | 133 | bbox_xcyc_wh.append([xc / width, yc / height, w / width, h / height]) 134 | t_class.append(bbox.label) 135 | 136 | nb_bbox += 1 137 | 138 | #bbox_xcyc_wh[0][0] = nb_bbox 139 | bbox_xcyc_wh = np.array(bbox_xcyc_wh) 140 | 141 | return bbox_xcyc_wh, t_class 142 | 143 | 144 | def retrieve_outputs(augmented_images, augmented_bbox): 145 | 146 | outputs_dict = {} 147 | image_shape = None 148 | 149 | 150 | # We expect only one image here for now 151 | image = augmented_images[0].astype(np.float32) 152 | augmented_bbox = augmented_bbox[0] 153 | 154 | bbox, t_class = imgaug_bbox_to_xcyc_wh(augmented_bbox, image.shape[0], image.shape[1]) 155 | 156 | bbox = np.array(bbox) 157 | t_class = np.array(t_class) 158 | 159 | return image, bbox, t_class 160 | 161 | 162 | 163 | def detr_transform(image, bbox, t_class, config, augmentation): 164 | 165 | 166 | images_batch, bbox_batch = prepare_aug_inputs(image, bbox, t_class) 167 | 168 | 169 | seq = detr_aug_seq(image, config, augmentation) 170 | 171 | # Run the pipeline in a deterministic manner 172 | seq_det = seq.to_deterministic() 173 | 174 | augmented_images = [] 175 | augmented_bbox = [] 176 | augmented_class = [] 177 | 178 | for img, bbox, t_cls in zip(images_batch, bbox_batch, t_class): 179 | 180 | img_aug = seq_det.augment_image(img) 181 | bbox_aug = seq_det.augment_bounding_boxes(bbox) 182 | 183 | 184 | for b, bbox_instance in enumerate(bbox_aug.items): 185 | setattr(bbox_instance, "instance_id", b+1) 186 | 187 | bbox_aug = bbox_aug.remove_out_of_image_fraction(0.7) 188 | segmap_aug = None 189 | bbox_aug = bbox_aug.clip_out_of_image() 190 | 191 | 192 | augmented_images.append(img_aug) 193 | augmented_bbox.append(bbox_aug) 194 | 195 | return retrieve_outputs(augmented_images, augmented_bbox) 196 | 197 | 198 | -------------------------------------------------------------------------------- /detr_tf/data/voc.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import xml.etree.ElementTree as ET 3 | from random import sample, shuffle 4 | import numpy as np 5 | import imageio 6 | import numpy as np 7 | import os 8 | 9 | 10 | from . import processing 11 | from . 
import transformation 12 | 13 | 14 | VOC_CLASS_NAME = [ 15 | "back", 'aeroplane', 'bicycle', 'bird', 'boat', 16 | 'bottle', 'bus', 'car', 'cat', 'chair', 17 | 'cow', 'diningtable', 'dog', 'horse', 18 | 'motorbike', 'person', 'pottedplant', 19 | 'sheep', 'sofa', 'train', 'tvmonitor' 20 | ] 21 | 22 | def load_voc_labels(img_id, class_names, voc_dir, augmentation, config): 23 | 24 | anno_path = os.path.join(voc_dir, config.data.ann_dir, img_id + '.xml') 25 | objects = ET.parse(anno_path).findall('object') 26 | size = ET.parse(anno_path).find('size') 27 | width = float(size.find("width").text) 28 | height = float(size.find("height").text) 29 | 30 | # Set up bbox with headers 31 | t_bbox = [] 32 | t_class = [] 33 | 34 | for obj in objects: 35 | # Open bbox and retrieve info 36 | name = obj.find('name').text.lower().strip() 37 | bndbox = obj.find('bndbox') 38 | xmin = (float(bndbox.find('xmin').text) - 1) / width 39 | ymin = (float(bndbox.find('ymin').text) - 1) / height 40 | xmax = (float(bndbox.find('xmax').text) - 1) / width 41 | ymax = (float(bndbox.find('ymax').text) - 1) / height 42 | # Convert bbox to xc, yc center 43 | xc = xmin + ((xmax - xmin) / 2) 44 | yc = ymin + ((ymax - ymin) / 2) 45 | w = xmax - xmin 46 | h = ymax - ymin 47 | # Add bbox 48 | t_bbox.append([xc, yc, w, h]) 49 | # Add target class 50 | t_class.append([class_names.index(name)]) 51 | 52 | t_bbox = np.array(t_bbox) 53 | t_class = np.array(t_class) 54 | 55 | return t_bbox, t_class 56 | 57 | 58 | def load_voc_from_id(img_id, class_names, voc_dir, augmentation, config, img_dir): 59 | img_id = str(img_id.decode()) 60 | # Load image 61 | img_path = os.path.join(voc_dir, config.data.img_dir, img_id + '.jpg') 62 | image = imageio.imread(img_path) 63 | # Load labels 64 | t_bbox, t_class = load_voc_labels(img_id, class_names, voc_dir, augmentation, config) 65 | # Apply augmentations 66 | if augmentation is not None: 67 | image, t_bbox, t_class = transformation.detr_transform(image, t_bbox, t_class, config, augmentation) 68 | # Normalized images 69 | image = processing.normalized_images(image, config) 70 | # Set type for tensorflow 71 | image = image.astype(np.float32) 72 | t_bbox = t_bbox.astype(np.float32) 73 | t_class = t_class.astype(np.int64) 74 | 75 | 76 | return (image, t_bbox, t_class) 77 | 78 | 79 | def load_voc_dataset(config, batch_size, augmentation=False, ann_dir=None, ann_file=None, img_dir=None): 80 | """ 81 | """ 82 | ann_dir = config.data.ann_dir if ann_dir is None else ann_dir 83 | ann_file = config.data.ann_file if ann_file is None else ann_file 84 | img_dir = config.data.img_dir if img_dir is None else img_dir 85 | 86 | # Set the background class to 0 87 | config.background_class = 0 88 | 89 | image_dir = os.path.join(config.data.data_dir, img_dir) 90 | anno_dir = os.path.join(config.data.data_dir, ann_dir) 91 | # ids lists 92 | ids = list(map(lambda x: x[:-4], os.listdir(image_dir))) 93 | 94 | # Retrieve the class names in the dataset 95 | class_names = ['back'] 96 | for img_id in ids: 97 | anno_path = os.path.join(config.data.data_dir, anno_dir, img_id + '.xml') 98 | for obj in ET.parse(anno_path).findall('object'): 99 | # Open bbox and retrieve info 100 | name = obj.find('name').text.lower().strip() 101 | if name not in class_names: 102 | try: # Faster than checking 103 | class_names[name] 104 | except: 105 | class_names.append(name) 106 | 107 | ids = list(map(lambda x: x[:-4], os.listdir(image_dir))) 108 | 109 | #ids = ids[:int(len(ids) * 0.75)] if train_val == "train" else ids[int(len(ids) * 0.75):] 110 | 
# Shuffle all the dataset 111 | shuffle(ids) 112 | 113 | # Setup data pipeline 114 | dataset = tf.data.Dataset.from_tensor_slices(ids) 115 | dataset = dataset.shuffle(1000) 116 | # Retrieve img and labels 117 | dataset = dataset.map(lambda idx: processing.numpy_fc(idx, load_voc_from_id, 118 | class_names=class_names, voc_dir=config.data.data_dir, augmentation=augmentation, config=config, img_dir=img_dir) 119 | , num_parallel_calls=tf.data.experimental.AUTOTUNE) 120 | # Filter labels to be sure to keep only sample with at least one bbox 121 | dataset = dataset.filter(lambda imgs, tbbox, tclass: tf.shape(tbbox)[0] > 0) 122 | # Pad bbox and labels 123 | dataset = dataset.map(processing.pad_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE) 124 | # Batch images 125 | dataset = dataset.batch(batch_size, drop_remainder=True) 126 | # Prefetch 127 | dataset = dataset.prefetch(32) 128 | return dataset, class_names -------------------------------------------------------------------------------- /detr_tf/inference.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | 6 | CLASS_COLOR_MAP = np.random.randint(0, 255, (100, 3)) 7 | 8 | from detr_tf import bbox 9 | 10 | def numpy_bbox_to_image(image, bbox_list, labels=None, scores=None, class_name=[], config=None): 11 | """ Numpy function used to display the bbox (target or prediction) 12 | """ 13 | assert(image.dtype == np.float32 and image.dtype == np.float32 and len(image.shape) == 3) 14 | 15 | if config is not None and config.normalized_method == "torch_resnet": 16 | channel_avg = np.array([0.485, 0.456, 0.406]) 17 | channel_std = np.array([0.229, 0.224, 0.225]) 18 | image = (image * channel_std) + channel_avg 19 | image = (image*255).astype(np.uint8) 20 | elif config is not None and config.normalized_method == "tf_resnet": 21 | image = image + mean 22 | image = image[..., ::-1] 23 | image = image / 255 24 | 25 | bbox_xcycwh = bbox.np_rescale_bbox_xcycwh(bbox_list, (image.shape[0], image.shape[1])) 26 | bbox_x1y1x2y2 = bbox.np_xcycwh_to_xy_min_xy_max(bbox_xcycwh) 27 | 28 | # Set the labels if not defined 29 | if labels is None: labels = np.zeros((bbox_x1y1x2y2.shape[0])) 30 | 31 | bbox_area = [] 32 | # Go through each bbox 33 | for b in range(0, bbox_x1y1x2y2.shape[0]): 34 | x1, y1, x2, y2 = bbox_x1y1x2y2[b] 35 | bbox_area.append((x2-x1)*(y2-y1)) 36 | 37 | # Go through each bbox 38 | for b in np.argsort(bbox_area)[::-1]: 39 | # Take a new color at reandon for this instance 40 | instance_color = np.random.randint(0, 255, (3)) 41 | 42 | 43 | x1, y1, x2, y2 = bbox_x1y1x2y2[b] 44 | x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 45 | x1, y1, x2, y2 = max(0, x1), max(0, y1), min(image.shape[1], x2), min(image.shape[0], y2) 46 | 47 | # Select the class associated with this bbox 48 | class_id = labels[int(b)] 49 | 50 | if scores is not None and len(scores) > 0: 51 | label_name = class_name[int(class_id)] 52 | label_name = "%s:%.2f" % (label_name, scores[b]) 53 | else: 54 | label_name = class_name[int(class_id)] 55 | 56 | class_color = CLASS_COLOR_MAP[int(class_id)] 57 | 58 | color = instance_color 59 | 60 | multiplier = image.shape[0] / 500 61 | cv2.rectangle(image, (x1, y1), (x1 + int(multiplier*15)*len(label_name), y1 + 20), class_color.tolist(), -10) 62 | cv2.putText(image, label_name, (x1+2, y1 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6 * multiplier, (0, 0, 0), 1) 63 | cv2.rectangle(image, (x1, y1), (x2, y2), tuple(class_color.tolist()), 2) 64 | 
65 | return image 66 | 67 | 68 | def get_model_inference(m_outputs: dict, background_class, bbox_format="xy_center"): 69 | 70 | predicted_bbox = m_outputs["pred_boxes"][0] 71 | predicted_labels = m_outputs["pred_logits"][0] 72 | 73 | softmax = tf.nn.softmax(predicted_labels) 74 | predicted_scores = tf.reduce_max(softmax, axis=-1) 75 | predicted_labels = tf.argmax(softmax, axis=-1) 76 | 77 | 78 | indices = tf.where(predicted_labels != background_class) 79 | indices = tf.squeeze(indices, axis=-1) 80 | 81 | predicted_scores = tf.gather(predicted_scores, indices) 82 | predicted_labels = tf.gather(predicted_labels, indices) 83 | predicted_bbox = tf.gather(predicted_bbox, indices) 84 | 85 | 86 | if bbox_format == "xy_center": 87 | predicted_bbox = predicted_bbox 88 | elif bbox_format == "xyxy": 89 | predicted_bbox = bbox.xcycwh_to_xy_min_xy_max(predicted_bbox) 90 | elif bbox_format == "yxyx": 91 | predicted_bbox = bbox.xcycwh_to_yx_min_yx_max(predicted_bbox) 92 | else: 93 | raise NotImplementedError() 94 | 95 | return predicted_bbox, predicted_labels, predicted_scores 96 | -------------------------------------------------------------------------------- /detr_tf/logger/training_logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is used to process the results of the training (loss, model outputs and targets) 3 | In order to send everything on wandb. 4 | """ 5 | from typing import Union,Dict,Tuple 6 | import tensorflow as tf 7 | 8 | from ..loss.compute_map import cal_map, calc_map, APDataObject 9 | from .wandb_logging import WandbSender 10 | from ..inference import get_model_inference 11 | 12 | 13 | import numpy as np 14 | import cv2 15 | 16 | from ..import bbox 17 | 18 | if int(tf.__version__.split('.')[1]) >= 4: 19 | RAGGED = True 20 | else: 21 | RAGGED = False 22 | 23 | 24 | def tf_send_batch_log_to_wandb(images, target_bbox, target_class, m_outputs: dict, config, class_name=[], step=None, prefix=""): 25 | 26 | # Warning: In graph mode, this class is init only once. In eager mode, this class is init at each step. 
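    # WandbSender buffers one wandb.Image per element of the batch (gather_inference)
    # and flushes the whole buffer at the end of this function (send).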
27 | img_sender = WandbSender() 28 | 29 | predicted_bbox = m_outputs["pred_boxes"] 30 | for b in range(predicted_bbox.shape[0]): 31 | # Select within the batch the elements at indice b 32 | image = images[b] 33 | 34 | elem_m_outputs = {key:m_outputs[key][b:b+1] if (m_outputs[key] is not None and not isinstance(m_outputs[key], list)) else m_outputs[key] for key in m_outputs} 35 | 36 | # Target 37 | t_bbox, t_class = target_bbox[b], target_class[b] 38 | 39 | if not RAGGED: 40 | size = tf.cast(t_bbox[0][0], tf.int32) 41 | t_bbox = tf.slice(t_bbox, [1, 0], [size, 4]) 42 | t_bbox = bbox.xcycwh_to_xy_min_xy_max(t_bbox) 43 | t_class = tf.slice(t_class, [1, 0], [size, -1]) 44 | t_class = tf.squeeze(t_class, axis=-1) 45 | 46 | # Predictions 47 | predicted_bbox, predicted_labels, predicted_scores = get_model_inference(elem_m_outputs, config.background_class, bbox_format="xyxy") 48 | 49 | np_func_params = { 50 | "image": image, "p_bbox": np.array(predicted_bbox), "p_scores": np.array(predicted_scores), "t_bbox": np.array(t_bbox), 51 | "p_labels": np.array(predicted_labels), "t_labels": np.array(t_class), "class_name": class_name 52 | } 53 | img_sender.gather_inference(**np_func_params) 54 | 55 | img_sender.send(step=step, prefix=prefix) 56 | 57 | 58 | 59 | def compute_map_on_batch(images, target_bbox, target_class, m_outputs: dict, config, class_name=[], step=None, send=True, prefix=""): 60 | predicted_bbox = m_outputs["pred_boxes"] 61 | batch_size = predicted_bbox.shape[0] 62 | for b in range(batch_size): 63 | 64 | image = images[b] 65 | elem_m_outputs = {key:m_outputs[key][b:b+1] if (m_outputs[key] is not None and not isinstance(m_outputs[key], list)) else m_outputs[key] for key in m_outputs} 66 | 67 | # Target 68 | t_bbox, t_class = target_bbox[b], target_class[b] 69 | 70 | if not RAGGED: 71 | size = tf.cast(t_bbox[0][0], tf.int32) 72 | t_bbox = tf.slice(t_bbox, [1, 0], [size, 4]) 73 | t_bbox = bbox.xcycwh_to_yx_min_yx_max(t_bbox) 74 | t_class = tf.slice(t_class, [1, 0], [size, -1]) 75 | t_class = tf.squeeze(t_class, axis=-1) 76 | 77 | # Inference ops 78 | predicted_bbox, predicted_labels, predicted_scores = get_model_inference(elem_m_outputs, config.background_class, bbox_format="yxyx") 79 | pred_mask = None 80 | 81 | pred_mask = np.zeros((138, 138, len(predicted_bbox))) 82 | target_mask = np.zeros((138, 138, len(t_bbox))) 83 | WandbSender.compute_map( 84 | np.array(predicted_bbox), 85 | np.array(predicted_labels), np.array(predicted_scores), 86 | np.array(t_bbox), 87 | np.array(t_class), 88 | b, batch_size, prefix, step, send, pred_mask, target_mask) 89 | 90 | 91 | 92 | def train_log(images, t_bbox, t_class, m_outputs: dict, config, step, class_name=[], prefix="train/"): 93 | # Every 1000 steps, log some progress of the training 94 | # (Images with bbox and images logs) 95 | if step % 100 == 0: 96 | tf_send_batch_log_to_wandb(images, t_bbox, t_class, m_outputs, config, class_name=class_name, step=step, prefix=prefix) 97 | 98 | 99 | def valid_log(images, t_bbox, t_class, m_outputs: dict, config, step, global_step, class_name=[], evaluation_step=200, prefix="train/"): 100 | 101 | # Set the number of class 102 | WandbSender.init_ap_data(nb_class=len(class_name)) 103 | map_list = compute_map_on_batch(images, t_bbox, t_class, m_outputs, config, class_name=class_name, step=global_step, send=(step+1==evaluation_step), prefix="val/") 104 | 105 | if step == 0: 106 | tf_send_batch_log_to_wandb(images, t_bbox, t_class, m_outputs, config, class_name=class_name, step=global_step, prefix="val/") 107 | 
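# Rough usage sketch: `train_log` and `valid_log` above are meant to be called from
# the training / validation loops (see training.py and train_coco.py), e.g.:
#
#   for step, (images, t_bbox, t_class) in enumerate(train_dt):
#       m_outputs = detr(images, training=True)
#       ...optimization step...
#       train_log(images, t_bbox, t_class, m_outputs, config, step, class_name=class_names)
#
# where `detr`, `train_dt`, `config` and `class_names` come from the training script.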
-------------------------------------------------------------------------------- /detr_tf/logger/wandb_logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | This scripts is used to send training logs to Wandb. 3 | """ 4 | from typing import Union,Dict,Tuple 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | try: 9 | # Should be optional 10 | import wandb 11 | except: 12 | wandb = None 13 | 14 | import cv2 15 | 16 | from ..loss.compute_map import cal_map, calc_map, APDataObject 17 | 18 | class WandbSender(object): 19 | """ 20 | Class used within the Yolact project to send data to Wandb to 21 | log experiments. 22 | """ 23 | 24 | IOU_THRESHOLDS = [x / 100. for x in range(50, 100, 5)] 25 | AP_DATA = None 26 | NB_CLASS = None 27 | 28 | def __init__(self): 29 | self.init_buffer() 30 | 31 | @staticmethod 32 | def init_ap_data(nb_class=None): 33 | """ Init the ap data used to compute the Map metrics. 34 | If nb_class is not provided, used the last provided nb_class. 35 | """ 36 | if nb_class is not None: 37 | WandbSender.NB_CLASS = nb_class 38 | 39 | if WandbSender.NB_CLASS is None: 40 | raise ValueError("NB_CLASS is not sed in WandbSender") 41 | 42 | if WandbSender.AP_DATA is None: 43 | WandbSender.AP_DATA = { 44 | 'box' : [[APDataObject() for _ in [f"class_{i}" for i in range(WandbSender.NB_CLASS)]] for _ in [x / 100. for x in range(50, 100, 5)]], 45 | 'mask': [[APDataObject() for _ in [f"class_{i}" for i in range(WandbSender.NB_CLASS)]] for _ in [x / 100. for x in range(50, 100, 5)]] 46 | } 47 | 48 | 49 | def init_buffer(self): 50 | """ Init list used to store the information from a batch of data. 51 | Onced the list is filled, the send method 52 | send all images online. 53 | """ 54 | self.images = [] 55 | self.queries = [] 56 | self.images_mask_ground_truth = [] 57 | self.images_mask_prediction = [] 58 | self.p_labels_batch = [] 59 | self.t_labels_batch = [] 60 | self.batch_mAP = [] 61 | 62 | 63 | @staticmethod 64 | @tf.autograph.experimental.do_not_convert() 65 | def compute_map(p_bbox: np.array, p_labels: np.array, p_scores: np.array, t_bbox: np.array, t_labels: np.array, 66 | b: int, batch: int, prefix: str, step: int, send: bool, 67 | p_mask: np.array, t_mask: np.array): 68 | """ 69 | For some reason, autograph is trying to understand what I'm doing here. With some failure. 70 | Thus, @tf.autograph.experimental.do_not_convert() is used to prevent autograph to scan this method. 71 | 72 | Args: 73 | p_bbox/t_bbox: List of bbox (n, 4) [y1, x2, y2, x2] 74 | p_labels/t_labels: List of labels index (n) 75 | p_mask/t_mask: predicted/target mask (n, h, w) with h and w the size of the mask 76 | p_scores: List of predicted scores (n) 77 | b: Batch indice 78 | batch: size of a batch 79 | prefix: Prefix to use to log something on wandb 80 | step: Step number 81 | send: Whether to send the result of all computed map to wandb. 82 | """ 83 | 84 | # Init Ap Data 85 | if WandbSender.AP_DATA is None: 86 | WandbSender.init_ap_data() 87 | 88 | # Set fake class name. (we do not really need the real name of each class at this point) 89 | class_name = [f"class_{i}" for i in range(WandbSender.NB_CLASS)] 90 | 91 | try: 92 | # Compyute 93 | cal_map(p_bbox, p_labels, p_scores, p_mask, t_bbox, t_labels, t_mask, WandbSender.AP_DATA, WandbSender.IOU_THRESHOLDS) 94 | 95 | # Last element of the validation set. 
96 | 97 | if send and b + 1 == batch: 98 | 99 | all_maps = calc_map(WandbSender.AP_DATA, WandbSender.IOU_THRESHOLDS, class_name, print_result=True) 100 | wandb.log({ 101 | f"val/map50_bbox": all_maps["box"][50], 102 | f"val/map50_mask": all_maps["mask"][50], 103 | f"val/map_bbox": all_maps["box"]["all"], 104 | f"val/map_mask": all_maps["mask"]["all"] 105 | }, step=step) 106 | wandb.run.summary.update({ 107 | f"val/map50_bbox": all_maps["box"][50], 108 | f"val/map50_mask": all_maps["mask"][50], 109 | f"val/map_bbox": all_maps["box"]["all"], 110 | f"val/map_mask": all_maps["mask"]["all"] 111 | }) 112 | 113 | 114 | WandbSender.AP_DATA = None 115 | WandbSender.init_ap_data() 116 | 117 | return np.array([0.0, 0.0], np.float64) 118 | 119 | except Exception as e: 120 | print("compute_map error. e=", e) 121 | #raise e 122 | return np.array([0.0, 0.0], np.float64) 123 | return np.array([0.0, 0.0], np.float64) 124 | 125 | 126 | def get_wandb_bbox_mask_image(self, image: np.array, bbox: np.array, labels : np.array, masks=None, scores=None, class_name=[]) -> Tuple[list, np.array]: 127 | """ 128 | Serialize the model inference into a dict and an image ready to be send to wandb. 129 | Args: 130 | image: (550, 550, 3) 131 | bbox: List of bbox (n, 4) [x1, y2, x2, y2] 132 | labels: List of labels index (n) 133 | masks: predicted/target mask (n, h, w) with h and w the size of the mask 134 | scores: List of predicted scores (n) (Optional) 135 | class_name; List of class name for each label 136 | Return: 137 | A dict with the box data for wandb 138 | and a copy of the original image with the instance masks 139 | """ 140 | height, width = image.shape[0], image.shape[1] 141 | image_mask = np.copy(image) 142 | instance_id = 1 143 | box_data = [] 144 | 145 | for b in range(len(bbox)): 146 | # Sample a new color for the mask instance 147 | instance_color = np.random.uniform(0, 1, (3)) 148 | # Retrive bbox coordinates 149 | x1, y1, x2, y2 = bbox[b] 150 | x1, y1, x2, y2 = float(x1), float(y1), float(x2), float(y2) 151 | 152 | # Fill the mask 153 | if masks is not None: 154 | mask = masks[:,:,b] 155 | mask = cv2.resize(mask, (width, height)) 156 | mask = mask[int(y1*height):int(y2*height),int(x1*width):int(x2*width)] 157 | image_mask[int(y1*height):int(y2*height),int(x1*width):int(x2*width)][mask > 0.5] = 0.5*image[int(y1*height):int(y2*height),int(x1*width):int(x2*width)][mask > 0.5] + 0.5*instance_color 158 | 159 | image_mask = cv2.rectangle(image_mask, (int(x1*width), int(y1*height)), (int(x2*width), int(y2*height)), (1, 1, 0), 3) 160 | 161 | 162 | #if scores is None: 163 | box_caption = "%s" % (class_name[int(labels[b])]) 164 | #else: 165 | # box_caption = "%s-{:.2f}" % (class_name[int(labels[b])], float(scores[b])) 166 | 167 | 168 | box_dict = { 169 | "position": {"minX": x1, "maxX": x2, "minY": y1, "maxY": y2}, 170 | "class_id" : int(labels[b]), 171 | "box_caption" : box_caption 172 | } 173 | # b < len(scores) for some reason sometime scores is not of the same length than the bbox 174 | if scores is not None and b < len(scores): 175 | box_dict["scores"] = {"conf": float(scores[b])} 176 | #print("append", box_dict) 177 | box_data.append(box_dict) 178 | instance_id += 1 179 | 180 | return box_data, image_mask 181 | 182 | def gather_inference(self, image: np.array, p_bbox: np.array, p_scores: np.array, t_bbox: np.array, 183 | p_labels: np.array, t_labels: np.array, p_masks=None, t_masks=None, class_name=[]): 184 | self.class_name = class_name 185 | 186 | # This is what wandb expext to get as input to display 
images with bbox. 187 | boxes = { 188 | "ground_truth": {"box_data": []}, 189 | "predictions": {"box_data": []} 190 | } 191 | 192 | # Ground Truth 193 | box_data, _ = self.get_wandb_bbox_mask_image(image, t_bbox, t_labels, t_masks, class_name=class_name, scores=p_scores) 194 | boxes["ground_truth"]["box_data"] = box_data 195 | boxes["ground_truth"]["class_labels"] = {_id:str(label) for _id, label in enumerate(class_name)} 196 | 197 | # Predictions 198 | box_data, _ = self.get_wandb_bbox_mask_image(image, p_bbox, p_labels, p_masks, class_name=class_name, scores=p_scores) 199 | boxes["predictions"]["box_data"] = box_data 200 | boxes["predictions"]["class_labels"] = {_id:str(label) for _id, label in enumerate(class_name)} 201 | 202 | 203 | # Append the target and the predictions to the buffer 204 | self.images.append(wandb.Image(image, boxes=boxes)) 205 | 206 | return np.array(0, dtype=np.int64) 207 | 208 | @tf.autograph.experimental.do_not_convert() 209 | def send(self, step: tf.Tensor, prefix=""): 210 | """ 211 | For some reason, autograph is trying to understand what I'm doing here. With some failure. 212 | Thus, @tf.autograph.experimental.do_not_convert() is used to prevent autograph to scan this method. 213 | 214 | Send the buffer to wandb 215 | Args: 216 | step: The global training step as eager tensor 217 | prefix: Prefix used before each log name. 218 | """ 219 | step = int(step) 220 | 221 | wandb.log({f"{prefix}Images bbox": self.images}, step=step) 222 | 223 | if len(self.batch_mAP) > 0: 224 | wandb.log({f"{prefix}mAp": np.mean(self.batch_mAP)}, step=step) 225 | 226 | self.init_buffer() 227 | 228 | return np.array(0, dtype=np.int64) 229 | 230 | @staticmethod 231 | @tf.autograph.experimental.do_not_convert() 232 | def send_depth(depth_map, step: np.array, prefix=""): 233 | """ 234 | For some reason, autograph is trying to understand what I'm doing here. With some failure. 235 | Thus, @tf.autograph.experimental.do_not_convert() is used to prevent autograph to scan this method. 236 | 237 | Send the depth map to wandb 238 | Args: 239 | depth_map: (8, h, w, 1) Depth images used to train the model 240 | step: The global training step as eager tensor 241 | prefix: Prefix used before each log name 242 | """ 243 | step = int(step) 244 | depth_map_images = [] 245 | for depth in depth_map: 246 | depth_map_images.append(wandb.Image(depth)) 247 | wandb.log({f"{prefix}Depth map": depth_map_images}, step=step) 248 | return np.array(0, dtype=np.int64) 249 | 250 | 251 | @staticmethod 252 | @tf.autograph.experimental.do_not_convert() 253 | def send_proto_sample(proto_map: np.array, proto_sample: np.array, proto_targets: np.array, step: np.array, prefix=""): 254 | """ 255 | For some reason, autograph is trying to understand what I'm doing here. With some failure. 256 | Thus, @tf.autograph.experimental.do_not_convert() is used to prevent autograph to scan this method. 257 | 258 | Send the proto images logs to wandb. 259 | Args: 260 | proto_map: The k (32 by default) proto map of the proto network (h, w, k) 261 | proto_sample: Some generated mask from the network for a batch (n, h, w) with n the number of mask 262 | proto_targets: The target mask for each generated mask. 
(n, h, w) with n the number of mask 263 | step: The global training step as eager tensor 264 | prefix: Prefix used before each log name 265 | """ 266 | step = int(step) 267 | 268 | proto_map_images = [] 269 | proto_sample_images = [] 270 | proto_targets_images = [] 271 | 272 | for p in range(proto_map.shape[-1]): 273 | proto_map_images.append(wandb.Image(np.clip(proto_map[:,:,p]*100, 0, 255))) 274 | for p in range(len(proto_sample)): 275 | proto_sample_images.append(wandb.Image(proto_sample[p])) 276 | proto_targets_images.append(wandb.Image(proto_targets[p])) 277 | 278 | wandb.log({f"{prefix}Proto Map": proto_map_images}, step=step) 279 | wandb.log({f"{prefix}Instance segmentation prediction": proto_sample_images}, step=step) 280 | wandb.log({f"{prefix}Instance segmentation target": proto_targets_images}, step=step) 281 | return np.array(0, dtype=np.int64) 282 | 283 | 284 | 285 | @staticmethod 286 | @tf.autograph.experimental.do_not_convert() 287 | def send_images(images, step: np.array, name: str, captions=None, masks_prediction=None, masks_target=None): 288 | """ 289 | For some reason, autograph is trying to understand what I'm doing here. With some failure. 290 | Thus, @tf.autograph.experimental.do_not_convert() is used to prevent autograph to scan this method. 291 | 292 | Send some images to wandb 293 | Args: 294 | images: (8, h, w, c) Images to log in wandb 295 | step: The global training step as eager tensor 296 | name: Image names 297 | """ 298 | class_labels = { 299 | 0: "background", 300 | 1: "0", 301 | 2: "1", 302 | 3: "2", 303 | 4: "3", 304 | 5: "4", 305 | 6: "5", 306 | 7: "6", 307 | 8: "7", 308 | 9: "8", 309 | 10: "9" 310 | } 311 | 312 | step = int(step) 313 | images_list = [] 314 | for i, img in enumerate(images): 315 | img_params = {} 316 | if captions is not None: 317 | img_params["caption"] = captions[i] 318 | 319 | if masks_prediction is not None: 320 | mask_pred = cv2.resize(masks_prediction[i], (img.shape[1], img.shape[0]), interpolation=cv2.INTER_NEAREST) 321 | mask_pred = mask_pred.astype(np.int32) 322 | if "masks" not in img_params: 323 | img_params["masks"] = {} 324 | 325 | #seg = np.expand_dims(masks[i].astype(np.int32), axis=-1) 326 | img_params["masks"]["predictions"] = { 327 | "mask_data": mask_pred, 328 | "class_labels": class_labels 329 | } 330 | 331 | 332 | if masks_target is not None: 333 | if "masks" not in img_params: 334 | img_params["masks"] = {} 335 | 336 | mask_target = masks_target[i].astype(np.int32) 337 | #seg = np.expand_dims(masks[i].astype(np.int32), axis=-1) 338 | print(mask_target.shape) 339 | img_params["masks"]["groud_truth"] = { 340 | "mask_data": mask_target, 341 | "class_labels": class_labels 342 | } 343 | 344 | images_list.append(wandb.Image(img, **img_params)) 345 | 346 | wandb.log({name: images_list}, step=step) 347 | return np.array(0, dtype=np.int64) 348 | 349 | -------------------------------------------------------------------------------- /detr_tf/loss/compute_map.py: -------------------------------------------------------------------------------- 1 | from scipy.special import softmax 2 | import matplotlib.pyplot as plt 3 | from itertools import product 4 | import tensorflow as tf 5 | import numpy as np 6 | import argparse 7 | import random 8 | import json 9 | import time 10 | import cv2 11 | import os 12 | 13 | from ..import bbox 14 | from collections import OrderedDict 15 | 16 | 17 | class APDataObject: 18 | """ Stores all the information necessary to calculate the AP for one IoU and one class. 
19 | """ 20 | 21 | def __init__(self): 22 | self.data_points = [] 23 | self.num_gt_positives = 0 24 | 25 | def push(self, score:float, is_true:bool): 26 | self.data_points.append((score, is_true)) 27 | 28 | def add_gt_positives(self, num_positives:int): 29 | """ Call this once per image. """ 30 | self.num_gt_positives += num_positives 31 | 32 | def is_empty(self) -> bool: 33 | return len(self.data_points) == 0 and self.num_gt_positives == 0 34 | 35 | def get_ap(self) -> float: 36 | """ Warning: result not cached. """ 37 | 38 | if self.num_gt_positives == 0: 39 | return 0 40 | 41 | # Sort descending by score 42 | self.data_points.sort(key=lambda x: -x[0]) 43 | 44 | precisions = [] 45 | recalls = [] 46 | num_true = 0 47 | num_false = 0 48 | 49 | # Compute the precision-recall curve. The x axis is recalls and the y axis precisions. 50 | for datum in self.data_points: 51 | # datum[1] is whether the detection a true or false positive 52 | if datum[1]: num_true += 1 53 | else: num_false += 1 54 | 55 | precision = num_true / (num_true + num_false) 56 | recall = num_true / self.num_gt_positives 57 | 58 | precisions.append(precision) 59 | recalls.append(recall) 60 | 61 | # Smooth the curve by computing [max(precisions[i:]) for i in range(len(precisions))] 62 | # Basically, remove any temporary dips from the curve. 63 | # At least that's what I think, idk. COCOEval did it so I do too. 64 | for i in range(len(precisions)-1, 0, -1): 65 | if precisions[i] > precisions[i-1]: 66 | precisions[i-1] = precisions[i] 67 | 68 | # Compute the integral of precision(recall) d_recall from recall=0->1 using fixed-length riemann summation with 101 bars. 69 | y_range = [0] * 101 # idx 0 is recall == 0.0 and idx 100 is recall == 1.00 70 | x_range = np.array([x / 100 for x in range(101)]) 71 | recalls = np.array(recalls) 72 | 73 | # I realize this is weird, but all it does is find the nearest precision(x) for a given x in x_range. 74 | # Basically, if the closest recall we have to 0.01 is 0.009 this sets precision(0.01) = precision(0.009). 75 | # I approximate the integral this way, because that's how COCOEval does it. 76 | indices = np.searchsorted(recalls, x_range, side='left') 77 | for bar_idx, precision_idx in enumerate(indices): 78 | if precision_idx < len(precisions): 79 | y_range[bar_idx] = precisions[precision_idx] 80 | 81 | # Finally compute the riemann sum to get our integral. 82 | # avg([precision(x) for x in 0:0.01:1]) 83 | return sum(y_range) / len(y_range) 84 | 85 | def compute_overlaps_masks(masks1, masks2): 86 | """Computes IoU overlaps between two sets of masks. 87 | masks1, masks2: [Height, Width, instances] 88 | """ 89 | # If either set of masks is empty return empty result 90 | if masks1.shape[-1] == 0 or masks2.shape[-1] == 0: 91 | return np.zeros((masks1.shape[-1], masks2.shape[-1])) 92 | # flatten masks and compute their areas 93 | masks1 = np.reshape(masks1 > .5, (-1, masks1.shape[-1])).astype(np.float32) 94 | masks2 = np.reshape(masks2 > .5, (-1, masks2.shape[-1])).astype(np.float32) 95 | area1 = np.sum(masks1, axis=0) 96 | area2 = np.sum(masks2, axis=0) 97 | 98 | # intersections and union 99 | intersections = np.dot(masks1.T, masks2) 100 | union = area1[:, None] + area2[None, :] - intersections 101 | overlaps = intersections / union 102 | 103 | return overlaps 104 | 105 | def compute_iou(box, boxes, box_area, boxes_area): 106 | """Calculates IoU of the given box with the array of the given boxes. 
107 | box: 1D vector [y1, x1, y2, x2] 108 | boxes: [boxes_count, (y1, x1, y2, x2)] 109 | box_area: float. the area of 'box' 110 | boxes_area: array of length boxes_count. 111 | Note: the areas are passed in rather than calculated here for 112 | efficiency. Calculate once in the caller to avoid duplicate work. 113 | """ 114 | # Calculate intersection areas 115 | y1 = np.maximum(box[0], boxes[:, 0]) 116 | y2 = np.minimum(box[2], boxes[:, 2]) 117 | x1 = np.maximum(box[1], boxes[:, 1]) 118 | x2 = np.minimum(box[3], boxes[:, 3]) 119 | intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0) 120 | union = box_area + boxes_area[:] - intersection[:] 121 | iou = intersection / union 122 | return iou 123 | 124 | def compute_overlaps(boxes1, boxes2): 125 | """Computes IoU overlaps between two sets of boxes. 126 | boxes1, boxes2: [N, (y1, x1, y2, x2)]. 127 | For better performance, pass the largest set first and the smaller second. 128 | """ 129 | # Areas of anchors and GT boxes 130 | area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1]) 131 | area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1]) 132 | 133 | # Compute overlaps to generate matrix [boxes1 count, boxes2 count] 134 | # Each cell contains the IoU value. 135 | overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0])) 136 | for i in range(overlaps.shape[1]): 137 | box2 = boxes2[i] 138 | overlaps[:, i] = compute_iou(box2, boxes1, area2[i], area1) 139 | return overlaps 140 | 141 | def calc_map(ap_data, iou_thresholds, class_name, print_result=False): 142 | #print('Calculating mAP...') 143 | aps = [{'box': [], 'mask': []} for _ in iou_thresholds] 144 | 145 | for _class in range(len(class_name)): 146 | for iou_idx in range(len(iou_thresholds)): 147 | for iou_type in ('box', 'mask'): 148 | ap_obj = ap_data[iou_type][iou_idx][_class] 149 | 150 | if not ap_obj.is_empty(): 151 | aps[iou_idx][iou_type].append(ap_obj.get_ap()) 152 | 153 | all_maps = {'box': OrderedDict(), 'mask': OrderedDict()} 154 | 155 | # Looking back at it, this code is really hard to read :/ 156 | for iou_type in ('box', 'mask'): 157 | all_maps[iou_type]['all'] = 0 # Make this first in the ordereddict 158 | for i, threshold in enumerate(iou_thresholds): 159 | mAP = sum(aps[i][iou_type]) / len(aps[i][iou_type]) * 100 if len(aps[i][iou_type]) > 0 else 0 160 | all_maps[iou_type][int(threshold*100)] = mAP 161 | all_maps[iou_type]['all'] = (sum(all_maps[iou_type].values()) / (len(all_maps[iou_type].values())-1)) 162 | 163 | if print_result: 164 | print_maps(all_maps) 165 | 166 | # Put in a prettier format so we can serialize it to json during training 167 | all_maps = {k: {j: round(u, 2) for j, u in v.items()} for k, v in all_maps.items()} 168 | return all_maps 169 | 170 | def print_maps(all_maps): 171 | # Warning: hacky 172 | make_row = lambda vals: (' %5s |' * len(vals)) % tuple(vals) 173 | make_sep = lambda n: ('-------+' * n) 174 | 175 | print() 176 | print(make_row([''] + [('.%d ' % x if isinstance(x, int) else x + ' ') for x in all_maps['box'].keys()])) 177 | print(make_sep(len(all_maps['box']) + 1)) 178 | for iou_type in ('box', 'mask'): 179 | print(make_row([iou_type] + ['%.2f' % x if x < 100 else '%.1f' % x for x in all_maps[iou_type].values()])) 180 | print(make_sep(len(all_maps['box']) + 1)) 181 | print() 182 | 183 | def cal_map(p_bbox, p_labels, p_scores, p_mask, t_bbox, gt_classes, t_mask, ap_data, iou_thresholds): 184 | 185 | #print("p_bbox", p_bbox.shape) 186 | #print("p_labels", p_labels.shape) 187 | #print("p_scores", 
p_scores.shape) 188 | #print("p_mask", p_mask.shape) 189 | #print("t_bbox", t_bbox.shape) 190 | #print("gt_classes", gt_classes) 191 | #print("t_mask", t_mask.shape) 192 | 193 | num_crowd = 0 194 | 195 | classes = list(np.array(p_labels).astype(int)) 196 | scores = list(np.array(p_scores).astype(float)) 197 | 198 | box_scores = scores 199 | mask_scores = scores 200 | 201 | masks = p_mask 202 | 203 | num_pred = len(classes) 204 | num_gt = len(gt_classes) 205 | 206 | mask_iou_cache = compute_overlaps_masks(masks, t_mask) 207 | bbox_iou_cache = compute_overlaps(p_bbox, t_bbox) 208 | 209 | crowd_mask_iou_cache = None 210 | crowd_bbox_iou_cache = None 211 | 212 | box_indices = sorted(range(num_pred), key=lambda i: -box_scores[i]) 213 | mask_indices = sorted(box_indices, key=lambda i: -mask_scores[i]) 214 | 215 | iou_types = [ 216 | ('box', lambda i,j: bbox_iou_cache[i, j].item(), 217 | lambda i,j: crowd_bbox_iou_cache[i,j].item(), 218 | lambda i: box_scores[i], box_indices), 219 | ('mask', lambda i,j: mask_iou_cache[i, j].item(), 220 | lambda i,j: crowd_mask_iou_cache[i,j].item(), 221 | lambda i: mask_scores[i], mask_indices) 222 | ] 223 | #print("run", list(classes), list(gt_classes)) 224 | #print(classes + gt_classes) 225 | for _class in set(list(classes) + list(gt_classes)): 226 | ap_per_iou = [] 227 | num_gt_for_class = sum([1 for x in gt_classes if x == _class]) 228 | 229 | for iouIdx in range(len(iou_thresholds)): 230 | iou_threshold = iou_thresholds[iouIdx] 231 | for iou_type, iou_func, crowd_func, score_func, indices in iou_types: 232 | gt_used = [False] * len(gt_classes) 233 | ap_obj = ap_data[iou_type][iouIdx][_class] 234 | ap_obj.add_gt_positives(num_gt_for_class) 235 | 236 | for i in indices: 237 | if classes[i] != _class: 238 | continue 239 | max_iou_found = iou_threshold 240 | max_match_idx = -1 241 | for j in range(num_gt): 242 | if gt_used[j] or gt_classes[j] != _class: 243 | continue 244 | iou = iou_func(i, j) 245 | 246 | if iou > max_iou_found: 247 | max_iou_found = iou 248 | max_match_idx = j 249 | 250 | if max_match_idx >= 0: 251 | gt_used[max_match_idx] = True 252 | ap_obj.push(score_func(i), True) 253 | else: 254 | # If the detection matches a crowd, we can just ignore it 255 | matched_crowd = False 256 | 257 | if num_crowd > 0: 258 | for j in range(len(crowd_classes)): 259 | if crowd_classes[j] != _class: 260 | continue 261 | 262 | iou = crowd_func(i, j) 263 | 264 | if iou > iou_threshold: 265 | matched_crowd = True 266 | break 267 | 268 | # All this crowd code so that we can make sure that our eval code gives the 269 | # same result as COCOEval. There aren't even that many crowd annotations to 270 | # begin with, but accuracy is of the utmost importance. 271 | if not matched_crowd: 272 | ap_obj.push(score_func(i), False) 273 | -------------------------------------------------------------------------------- /detr_tf/loss/hungarian_matching.py: -------------------------------------------------------------------------------- 1 | from typing import Union,Dict,Tuple 2 | from itertools import product 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from .. 
import bbox 7 | from scipy.optimize import linear_sum_assignment 8 | 9 | 10 | def get_offsets(anchors_xywh, target_bbox_xywh): 11 | # Return the offset between the boxes in anchors_xywh and the boxes 12 | # in anchors_xywh 13 | 14 | variances = [0.1, 0.2] 15 | 16 | tiled_a_bbox, tiled_t_bbox = bbox.merge(anchors_xywh, target_bbox_xywh) 17 | 18 | g_cxcy = (tiled_t_bbox[:,:,:2] - tiled_a_bbox[:,:,:2]) 19 | g_cxcy = g_cxcy / (variances[0] * tiled_a_bbox[:,:,2:]) 20 | 21 | g_wh = tiled_t_bbox[:,:,2:] / tiled_a_bbox[:,:,2:] 22 | g_wh = tf.math.log(g_wh) / variances[1] 23 | 24 | return tf.concat([g_cxcy, g_wh], axis=-1) 25 | 26 | 27 | def np_tf_linear_sum_assignment(matrix): 28 | 29 | indices = linear_sum_assignment(matrix) 30 | target_indices = indices[0] 31 | pred_indices = indices[1] 32 | 33 | #print(matrix.shape, target_indices, pred_indices) 34 | 35 | target_selector = np.zeros(matrix.shape[0]) 36 | target_selector[target_indices] = 1 37 | target_selector = target_selector.astype(np.bool) 38 | 39 | pred_selector = np.zeros(matrix.shape[1]) 40 | pred_selector[pred_indices] = 1 41 | pred_selector = pred_selector.astype(np.bool) 42 | 43 | #print('target_indices', target_indices) 44 | #print("pred_indices", pred_indices) 45 | 46 | return [target_indices, pred_indices, target_selector, pred_selector] 47 | 48 | 49 | def box_cxcywh_to_xyxy(x): 50 | x_c, y_c, w, h = x.unbind(-1) 51 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 52 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 53 | return torch.stack(b, dim=-1) 54 | 55 | 56 | def generalized_box_iou(boxes1, boxes2): 57 | """ 58 | Generalized IoU from https://giou.stanford.edu/ 59 | The boxes should be in [x0, y0, x1, y1] format 60 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 61 | and M = len(boxes2) 62 | """ 63 | # degenerate boxes gives inf / nan results 64 | # so do an early check 65 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 66 | 67 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 68 | iou, union = box_iou(boxes1, boxes2) 69 | 70 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 71 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 72 | 73 | wh = (rb - lt).clamp(min=0) # [N,M,2] 74 | area = wh[:, :, 0] * wh[:, :, 1] 75 | 76 | return iou - (area - union) / area 77 | 78 | def box_iou(boxes1, boxes2): 79 | area1 = box_area(boxes1) 80 | area2 = box_area(boxes2) 81 | 82 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 83 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 84 | 85 | wh = (rb - lt).clamp(min=0) # [N,M,2] 86 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 87 | 88 | union = area1[:, None] + area2 - inter 89 | 90 | iou = inter / union 91 | return iou, union 92 | 93 | 94 | 95 | def _get_src_permutation_idx(indices): 96 | # permute predictions following indices 97 | batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) 98 | src_idx = torch.cat([src for (src, _) in indices]) 99 | return batch_idx, src_idx 100 | 101 | 102 | def loss_labels(outputs, targets, indices, num_boxes, log=True): 103 | """Classification loss (NLL) 104 | targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] 105 | """ 106 | assert 'pred_logits' in outputs 107 | src_logits = outputs['pred_logits'] 108 | 109 | idx = _get_src_permutation_idx(indices) 110 | target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) 111 | target_classes = torch.full(src_logits.shape[:2], 0, 112 | dtype=torch.int64, device=src_logits.device) 113 | target_classes[idx] = 
target_classes_o 114 | 115 | empty_weight = torch.ones(81) 116 | empty_weight[0] = 0.1 117 | 118 | #print("log_softmax(input, 1)", F.softmax(src_logits, 1).mean()) 119 | #print("src_logits", src_logits.shape) 120 | #print("target_classes", target_classes, target_classes.shape) 121 | 122 | #print("target_classes", target_classes) 123 | 124 | loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, empty_weight) 125 | #print('>loss_ce', loss_ce) 126 | losses = {'loss_ce': loss_ce} 127 | 128 | #if log: 129 | # # TODO this should probably be a separate loss, not hacked in this one here 130 | # losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] 131 | return losses 132 | 133 | 134 | 135 | def loss_boxes(outputs, targets, indices, num_boxes): 136 | """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss 137 | targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] 138 | The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 139 | """ 140 | #print("------") 141 | #assert 'pred_boxes' in outputs 142 | idx = _get_src_permutation_idx(indices) 143 | src_boxes = outputs['pred_boxes'][idx] 144 | target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) 145 | 146 | #print("target_boxes", target_boxes) 147 | #print("src_boxes", src_boxes) 148 | 149 | loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') 150 | #print("loss_bbox", loss_bbox) 151 | losses = {} 152 | losses['loss_bbox'] = loss_bbox.sum() / target_boxes.shape[0] 153 | #print(">loss_bbox", losses['loss_bbox']) 154 | 155 | loss_giou = 1 - torch.diag(generalized_box_iou( 156 | box_cxcywh_to_xyxy(src_boxes), 157 | box_cxcywh_to_xyxy(target_boxes))) 158 | #print('>loss_giou', loss_giou) 159 | losses['loss_giou'] = loss_giou.sum() / target_boxes.shape[0] 160 | #print(">loss_giou", losses['loss_giou']) 161 | return losses 162 | 163 | def hungarian_matching(t_bbox, t_class, p_bbox, p_class, fcost_class=1, fcost_bbox=5, fcost_giou=2, slice_preds=True) -> tuple: 164 | 165 | if slice_preds: 166 | size = tf.cast(t_bbox[0][0], tf.int32) 167 | t_bbox = tf.slice(t_bbox, [1, 0], [size, 4]) 168 | t_class = tf.slice(t_class, [1, 0], [size, -1]) 169 | t_class = tf.squeeze(t_class, axis=-1) 170 | 171 | # Convert frpm [xc, yc, w, h] to [xmin, ymin, xmax, ymax] 172 | p_bbox_xy = bbox.xcycwh_to_xy_min_xy_max(p_bbox) 173 | t_bbox_xy = bbox.xcycwh_to_xy_min_xy_max(t_bbox) 174 | 175 | softmax = tf.nn.softmax(p_class) 176 | 177 | # Classification cost for the Hungarian algorithom 178 | # On each prediction. 
We select the prob of the expected class 179 | cost_class = -tf.gather(softmax, t_class, axis=1) 180 | 181 | # L1 cost for the hungarian algorithm 182 | _p_bbox, _t_bbox = bbox.merge(p_bbox, t_bbox) 183 | cost_bbox = tf.norm(_p_bbox - _t_bbox, ord=1, axis=-1) 184 | 185 | # Generalized IOU 186 | iou, union = bbox.jaccard(p_bbox_xy, t_bbox_xy, return_union=True) 187 | _p_bbox_xy, _t_bbox_xy = bbox.merge(p_bbox_xy, t_bbox_xy) 188 | top_left = tf.math.minimum(_p_bbox_xy[:,:,:2], _t_bbox_xy[:,:,:2]) 189 | bottom_right = tf.math.maximum(_p_bbox_xy[:,:,2:], _t_bbox_xy[:,:,2:]) 190 | size = tf.nn.relu(bottom_right - top_left) 191 | area = size[:,:,0] * size[:,:,1] 192 | cost_giou = -(iou - (area - union) / area) 193 | 194 | # Final hungarian cost matrix 195 | cost_matrix = fcost_bbox * cost_bbox + fcost_class * cost_class + fcost_giou * cost_giou 196 | 197 | selectors = tf.numpy_function(np_tf_linear_sum_assignment, [cost_matrix], [tf.int64, tf.int64, tf.bool, tf.bool] ) 198 | target_indices = selectors[0] 199 | pred_indices = selectors[1] 200 | target_selector = selectors[2] 201 | pred_selector = selectors[3] 202 | 203 | return pred_indices, target_indices, pred_selector, target_selector, t_bbox, t_class 204 | -------------------------------------------------------------------------------- /detr_tf/loss/loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .. import bbox 3 | from .hungarian_matching import hungarian_matching 4 | 5 | 6 | def get_total_losss(losses): 7 | """ 8 | Get model total losss including auxiliary loss 9 | """ 10 | train_loss = ["label_cost", "giou_loss", "l1_loss"] 11 | loss_weights = [1, 2, 5] 12 | 13 | total_loss = 0 14 | for key in losses: 15 | selector = [l for l, loss_name in enumerate(train_loss) if loss_name in key] 16 | if len(selector) == 1: 17 | #print("Add to the total loss", key, losses[key], loss_weights[selector[0]]) 18 | total_loss += losses[key]*loss_weights[selector[0]] 19 | return total_loss 20 | 21 | 22 | def get_losses(m_outputs, t_bbox, t_class, config): 23 | losses = get_detr_losses(m_outputs, t_bbox, t_class, config) 24 | 25 | # Get auxiliary loss for each auxiliary output 26 | if "aux" in m_outputs: 27 | for a, aux_m_outputs in enumerate(m_outputs["aux"]): 28 | aux_losses = get_detr_losses(aux_m_outputs, t_bbox, t_class, config, suffix="_{}".format(a)) 29 | losses.update(aux_losses) 30 | 31 | # Compute the total loss 32 | total_loss = get_total_losss(losses) 33 | 34 | return total_loss, losses 35 | 36 | 37 | def loss_labels(p_bbox, p_class, t_bbox, t_class, t_indices, p_indices, t_selector, p_selector, background_class=0): 38 | 39 | neg_indices = tf.squeeze(tf.where(p_selector == False), axis=-1) 40 | neg_p_class = tf.gather(p_class, neg_indices) 41 | neg_t_class = tf.zeros((tf.shape(neg_p_class)[0],), tf.int64) + background_class 42 | 43 | neg_weights = tf.zeros((tf.shape(neg_indices)[0],)) + 0.1 44 | pos_weights = tf.zeros((tf.shape(t_indices)[0],)) + 1.0 45 | weights = tf.concat([neg_weights, pos_weights], axis=0) 46 | 47 | pos_p_class = tf.gather(p_class, p_indices) 48 | pos_t_class = tf.gather(t_class, t_indices) 49 | 50 | ############# 51 | # Metrics 52 | ############# 53 | # True negative 54 | cls_neg_p_class = tf.argmax(neg_p_class, axis=-1) 55 | true_neg = tf.reduce_mean(tf.cast(cls_neg_p_class == background_class, tf.float32)) 56 | # True positive 57 | cls_pos_p_class = tf.argmax(pos_p_class, axis=-1) 58 | true_pos = tf.reduce_mean(tf.cast(cls_pos_p_class != 
background_class, tf.float32)) 59 | # True accuracy 60 | cls_pos_p_class = tf.argmax(pos_p_class, axis=-1) 61 | pos_accuracy = tf.reduce_mean(tf.cast(cls_pos_p_class == pos_t_class, tf.float32)) 62 | 63 | targets = tf.concat([neg_t_class, pos_t_class], axis=0) 64 | preds = tf.concat([neg_p_class, pos_p_class], axis=0) 65 | 66 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, preds) 67 | loss = tf.reduce_sum(loss * weights) / tf.reduce_sum(weights) 68 | 69 | return loss, true_neg, true_pos, pos_accuracy 70 | 71 | 72 | def loss_boxes(p_bbox, p_class, t_bbox, t_class, t_indices, p_indices, t_selector, p_selector): 73 | #print("------") 74 | p_bbox = tf.gather(p_bbox, p_indices) 75 | t_bbox = tf.gather(t_bbox, t_indices) 76 | 77 | 78 | p_bbox_xy = bbox.xcycwh_to_xy_min_xy_max(p_bbox) 79 | t_bbox_xy = bbox.xcycwh_to_xy_min_xy_max(t_bbox) 80 | 81 | l1_loss = tf.abs(p_bbox-t_bbox) 82 | l1_loss = tf.reduce_sum(l1_loss) / tf.cast(tf.shape(p_bbox)[0], tf.float32) 83 | 84 | iou, union = bbox.jaccard(p_bbox_xy, t_bbox_xy, return_union=True) 85 | 86 | _p_bbox_xy, _t_bbox_xy = bbox.merge(p_bbox_xy, t_bbox_xy) 87 | top_left = tf.math.minimum(_p_bbox_xy[:,:,:2], _t_bbox_xy[:,:,:2]) 88 | bottom_right = tf.math.maximum(_p_bbox_xy[:,:,2:], _t_bbox_xy[:,:,2:]) 89 | size = tf.nn.relu(bottom_right - top_left) 90 | area = size[:,:,0] * size[:,:,1] 91 | giou = (iou - (area - union) / area) 92 | loss_giou = 1 - tf.linalg.diag_part(giou) 93 | 94 | loss_giou = tf.reduce_sum(loss_giou) / tf.cast(tf.shape(p_bbox)[0], tf.float32) 95 | 96 | return loss_giou, l1_loss 97 | 98 | def get_detr_losses(m_outputs, target_bbox, target_label, config, suffix=""): 99 | 100 | predicted_bbox = m_outputs["pred_boxes"] 101 | predicted_label = m_outputs["pred_logits"] 102 | 103 | all_target_bbox = [] 104 | all_target_class = [] 105 | all_predicted_bbox = [] 106 | all_predicted_class = [] 107 | all_target_indices = [] 108 | all_predcted_indices = [] 109 | all_target_selector = [] 110 | all_predcted_selector = [] 111 | 112 | t_offset = 0 113 | p_offset = 0 114 | 115 | for b in range(predicted_bbox.shape[0]): 116 | 117 | p_bbox, p_class, t_bbox, t_class = predicted_bbox[b], predicted_label[b], target_bbox[b], target_label[b] 118 | t_indices, p_indices, t_selector, p_selector, t_bbox, t_class = hungarian_matching(t_bbox, t_class, p_bbox, p_class, slice_preds=True) 119 | 120 | t_indices = t_indices + tf.cast(t_offset, tf.int64) 121 | p_indices = p_indices + tf.cast(p_offset, tf.int64) 122 | 123 | all_target_bbox.append(t_bbox) 124 | all_target_class.append(t_class) 125 | all_predicted_bbox.append(p_bbox) 126 | all_predicted_class.append(p_class) 127 | all_target_indices.append(t_indices) 128 | all_predcted_indices.append(p_indices) 129 | all_target_selector.append(t_selector) 130 | all_predcted_selector.append(p_selector) 131 | 132 | t_offset += tf.shape(t_bbox)[0] 133 | p_offset += tf.shape(p_bbox)[0] 134 | 135 | all_target_bbox = tf.concat(all_target_bbox, axis=0) 136 | all_target_class = tf.concat(all_target_class, axis=0) 137 | all_predicted_bbox = tf.concat(all_predicted_bbox, axis=0) 138 | all_predicted_class = tf.concat(all_predicted_class, axis=0) 139 | all_target_indices = tf.concat(all_target_indices, axis=0) 140 | all_predcted_indices = tf.concat(all_predcted_indices, axis=0) 141 | all_target_selector = tf.concat(all_target_selector, axis=0) 142 | all_predcted_selector = tf.concat(all_predcted_selector, axis=0) 143 | 144 | 145 | label_cost, true_neg, true_pos, pos_accuracy = loss_labels( 146 | all_predicted_bbox, 
147 | all_predicted_class, 148 | all_target_bbox, 149 | all_target_class, 150 | all_target_indices, 151 | all_predcted_indices, 152 | all_target_selector, 153 | all_predcted_selector, 154 | background_class=config.background_class, 155 | ) 156 | 157 | giou_loss, l1_loss = loss_boxes( 158 | all_predicted_bbox, 159 | all_predicted_class, 160 | all_target_bbox, 161 | all_target_class, 162 | all_target_indices, 163 | all_predcted_indices, 164 | all_target_selector, 165 | all_predcted_selector 166 | ) 167 | 168 | label_cost = label_cost 169 | giou_loss = giou_loss 170 | l1_loss = l1_loss 171 | 172 | return { 173 | "label_cost{}".format(suffix): label_cost, 174 | "true_neg{}".format(suffix): true_neg, 175 | "true_pos{}".format(suffix): true_pos, 176 | "pos_accuracy{}".format(suffix): pos_accuracy, 177 | "giou_loss{}".format(suffix): giou_loss, 178 | "l1_loss{}".format(suffix): l1_loss 179 | } 180 | -------------------------------------------------------------------------------- /detr_tf/networks/custom_layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class FrozenBatchNorm2D(tf.keras.layers.Layer): 5 | def __init__(self, eps=1e-5, **kwargs): 6 | super().__init__(**kwargs) 7 | self.eps = eps 8 | 9 | 10 | def build(self, input_shape): 11 | self.weight = self.add_weight(name='weight', shape=[input_shape[-1]], 12 | initializer=tf.keras.initializers.GlorotUniform(), trainable=False) 13 | self.bias = self.add_weight(name='bias', shape=[input_shape[-1]], 14 | initializer=tf.keras.initializers.GlorotUniform(), trainable=False) 15 | self.running_mean = self.add_weight(name='running_mean', shape=[input_shape[-1]], 16 | initializer='zeros', trainable=False) 17 | self.running_var = self.add_weight(name='running_var', shape=[input_shape[-1]], 18 | initializer='ones', trainable=False) 19 | 20 | 21 | def call(self, x): 22 | scale = self.weight * tf.math.rsqrt(self.running_var + self.eps) 23 | shift = self.bias - self.running_mean * scale 24 | return x * scale + shift 25 | 26 | 27 | def compute_output_shape(self, input_shape): 28 | return input_shape 29 | 30 | 31 | class Linear(tf.keras.layers.Layer): 32 | ''' 33 | Use this custom layer instead of tf.keras.layers.Dense to allow 34 | loading converted PyTorch Dense weights that have shape (output_dim, input_dim) 35 | ''' 36 | def __init__(self, output_dim, **kwargs): 37 | super().__init__(**kwargs) 38 | self.output_dim = output_dim 39 | 40 | 41 | def build(self, input_shape): 42 | self.kernel = self.add_weight(name='kernel', 43 | shape=[self.output_dim, input_shape[-1]], 44 | initializer=tf.keras.initializers.GlorotUniform(), trainable=True) 45 | self.bias = self.add_weight(name='bias', 46 | shape=[self.output_dim], 47 | initializer=tf.keras.initializers.GlorotUniform(), trainable=True) 48 | 49 | def call(self, x): 50 | return tf.matmul(x, self.kernel, transpose_b=True) + self.bias 51 | 52 | 53 | def compute_output_shape(self, input_shape): 54 | return input_shape.as_list()[:-1] + [self.output_dim] 55 | 56 | 57 | class FixedEmbedding(tf.keras.layers.Layer): 58 | def __init__(self, embed_shape, **kwargs): 59 | super().__init__(**kwargs) 60 | self.embed_shape = embed_shape 61 | 62 | def build(self, input_shape): 63 | self.w = self.add_weight(name='kernel', shape=self.embed_shape, 64 | initializer=tf.keras.initializers.GlorotUniform(), trainable=True) 65 | 66 | def call(self, x=None): 67 | return self.w 68 | 
-------------------------------------------------------------------------------- /detr_tf/networks/detr.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import tensorflow as tf 3 | import numpy as np 4 | import time 5 | import cv2 6 | import matplotlib.pyplot as plt 7 | import os 8 | from pathlib import Path 9 | 10 | 11 | from .resnet_backbone import ResNet50Backbone 12 | from .custom_layers import Linear, FixedEmbedding 13 | from .position_embeddings import PositionEmbeddingSine 14 | from .transformer import Transformer 15 | from .. bbox import xcycwh_to_xy_min_xy_max 16 | from .weights import load_weights 17 | 18 | 19 | class DETR(tf.keras.Model): 20 | def __init__(self, num_classes=92, num_queries=100, 21 | backbone=None, 22 | pos_encoder=None, 23 | transformer=None, 24 | num_encoder_layers=6, 25 | num_decoder_layers=6, 26 | return_intermediate_dec=True, 27 | **kwargs): 28 | super().__init__(**kwargs) 29 | self.num_queries = num_queries 30 | 31 | self.backbone = ResNet50Backbone(name='backbone') 32 | self.transformer = transformer or Transformer( 33 | num_encoder_layers=num_encoder_layers, 34 | num_decoder_layers=num_decoder_layers, 35 | return_intermediate_dec=return_intermediate_dec, 36 | name='transformer' 37 | ) 38 | 39 | self.model_dim = self.transformer.model_dim 40 | 41 | self.pos_encoder = pos_encoder or PositionEmbeddingSine( 42 | num_pos_features=self.model_dim // 2, normalize=True, name="position_embedding_sine") 43 | 44 | self.input_proj = tf.keras.layers.Conv2D(self.model_dim, kernel_size=1, name='input_proj') 45 | 46 | self.query_embed = FixedEmbedding((num_queries, self.model_dim), 47 | name='query_embed') 48 | 49 | self.class_embed = Linear(num_classes, name='class_embed') 50 | 51 | self.bbox_embed_linear1 = Linear(self.model_dim, name='bbox_embed_0') 52 | self.bbox_embed_linear2 = Linear(self.model_dim, name='bbox_embed_1') 53 | self.bbox_embed_linear3 = Linear(4, name='bbox_embed_2') 54 | self.activation = tf.keras.layers.ReLU(name='re_lu') 55 | 56 | 57 | def downsample_masks(self, masks, x): 58 | masks = tf.cast(masks, tf.int32) 59 | masks = tf.expand_dims(masks, -1) 60 | masks = tf.compat.v1.image.resize_nearest_neighbor(masks, tf.shape(x)[1:3], align_corners=False, half_pixel_centers=False) 61 | masks = tf.squeeze(masks, -1) 62 | masks = tf.cast(masks, tf.bool) 63 | return masks 64 | 65 | def call(self, inp, training=False, post_process=False): 66 | x, masks = inp 67 | x = self.backbone(x, training=training) 68 | masks = self.downsample_masks(masks, x) 69 | 70 | pos_encoding = self.pos_encoder(masks) 71 | 72 | hs = self.transformer(self.input_proj(x), masks, self.query_embed(None), 73 | pos_encoding, training=training)[0] 74 | 75 | outputs_class = self.class_embed(hs) 76 | 77 | box_ftmps = self.activation(self.bbox_embed_linear1(hs)) 78 | box_ftmps = self.activation(self.bbox_embed_linear2(box_ftmps)) 79 | outputs_coord = tf.sigmoid(self.bbox_embed_linear3(box_ftmps)) 80 | 81 | output = {'pred_logits': outputs_class[-1], 82 | 'pred_boxes': outputs_coord[-1]} 83 | 84 | if post_process: 85 | output = self.post_process(output) 86 | return output 87 | 88 | 89 | def build(self, input_shape=None, **kwargs): 90 | if input_shape is None: 91 | input_shape = [(None, None, None, 3), (None, None, None)] 92 | super().build(input_shape, **kwargs) 93 | 94 | def add_heads_nlayers(config, detr, nb_class): 95 | image_input = tf.keras.Input((None, None, 3)) 96 | # Setup the new layers 97 | cls_layer = 
tf.keras.layers.Dense(nb_class, name="cls_layer") 98 | pos_layer = tf.keras.models.Sequential([ 99 | tf.keras.layers.Dense(256, activation="relu"), 100 | tf.keras.layers.Dense(256, activation="relu"), 101 | tf.keras.layers.Dense(4, activation="sigmoid"), 102 | ], name="pos_layer") 103 | config.add_nlayers([cls_layer, pos_layer]) 104 | 105 | transformer_output = detr(image_input) 106 | cls_preds = cls_layer(transformer_output) 107 | pos_preds = pos_layer(transformer_output) 108 | 109 | # Define the main outputs along with the auxialiary loss 110 | outputs = {'pred_logits': cls_preds[-1], 'pred_boxes': pos_preds[-1]} 111 | outputs["aux"] = [ {"pred_logits": cls_preds[i], "pred_boxes": pos_preds[i]} for i in range(0, 5)] 112 | 113 | n_detr = tf.keras.Model(image_input, outputs, name="detr_finetuning") 114 | return n_detr 115 | 116 | def get_detr_model(config, include_top=False, nb_class=None, weights=None, tf_backbone=False, num_decoder_layers=6, num_encoder_layers=6): 117 | """ Get the DETR model 118 | 119 | Parameters 120 | ---------- 121 | include_top: bool 122 | If false, the last layers of the transformers used to predict the bbox position 123 | and cls will not be include. And therefore could be replace for finetuning if the `weight` parameter 124 | is set. 125 | nb_class: int 126 | If include_top is False and nb_class is set, then, this method will automaticly add two new heads to predict 127 | the bbox pos and the bbox class on the decoder. 128 | weights: str 129 | Name of the weights to load. Only "detr" is avaiable to get started for now. 130 | More weight as detr-r101 will be added soon. 131 | tf_backbone: 132 | Using the pretrained weight from pytorch, the resnet backbone does not used 133 | tf.keras.application to load the weight. If you do want to load the tf backbone, and not 134 | laod the weights from pytorch, set this variable to True. 
135 | """ 136 | detr = DETR(num_decoder_layers=num_decoder_layers, num_encoder_layers=num_encoder_layers) 137 | 138 | if weights is not None: 139 | load_weights(detr, weights) 140 | 141 | image_input = tf.keras.Input((None, None, 3)) 142 | 143 | # Backbone 144 | if not tf_backbone: 145 | backbone = detr.get_layer("backbone") 146 | else: 147 | config.normalized_method = "tf_resnet" 148 | backbone = tf.keras.applications.ResNet50(include_top=False, weights="imagenet", input_shape=(None, None, 3)) 149 | 150 | # Transformer 151 | transformer = detr.get_layer("transformer") 152 | # Positional embedding of the feature map 153 | position_embedding_sine = detr.get_layer("position_embedding_sine") 154 | # Used to project the feature map before to fit in into the encoder 155 | input_proj = detr.get_layer('input_proj') 156 | # Decoder objects query embedding 157 | query_embed = detr.get_layer('query_embed') 158 | 159 | 160 | # Used to project the output of the decoder into a class prediction 161 | # This layer will be replace for finetuning 162 | class_embed = detr.get_layer('class_embed') 163 | 164 | # Predict the bbox pos 165 | bbox_embed_linear1 = detr.get_layer('bbox_embed_0') 166 | bbox_embed_linear2 = detr.get_layer('bbox_embed_1') 167 | bbox_embed_linear3 = detr.get_layer('bbox_embed_2') 168 | activation = detr.get_layer("re_lu") 169 | 170 | x = backbone(image_input) 171 | 172 | masks = tf.zeros((tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]), tf.bool) 173 | pos_encoding = position_embedding_sine(masks) 174 | 175 | hs = transformer(input_proj(x), masks, query_embed(None), pos_encoding)[0] 176 | 177 | detr = tf.keras.Model(image_input, hs, name="detr") 178 | if include_top is False and nb_class is None: 179 | return detr 180 | elif include_top is False and nb_class is not None: 181 | return add_heads_nlayers(config, detr, nb_class) 182 | 183 | transformer_output = detr(image_input) 184 | 185 | outputs_class = class_embed(transformer_output) 186 | box_ftmps = activation(bbox_embed_linear1(transformer_output)) 187 | box_ftmps = activation(bbox_embed_linear2(box_ftmps)) 188 | outputs_coord = tf.sigmoid(bbox_embed_linear3(box_ftmps)) 189 | 190 | outputs = {} 191 | 192 | output = {'pred_logits': outputs_class[-1], 193 | 'pred_boxes': outputs_coord[-1]} 194 | 195 | output["aux"] = [] 196 | for i in range(0, num_decoder_layers - 1): 197 | out_class = outputs_class[i] 198 | pred_boxes = outputs_coord[i] 199 | output["aux"].append({ 200 | "pred_logits": out_class, 201 | "pred_boxes": pred_boxes 202 | }) 203 | 204 | return tf.keras.Model(image_input, output, name="detr_finetuning") 205 | 206 | -------------------------------------------------------------------------------- /detr_tf/networks/position_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | class PositionEmbeddingSine(tf.keras.Model): 6 | 7 | 8 | def __init__(self, num_pos_features=64, temperature=10000, 9 | normalize=False, scale=None, eps=1e-6, **kwargs): 10 | super().__init__(**kwargs) 11 | 12 | self.num_pos_features = num_pos_features 13 | self.temperature = temperature 14 | self.normalize = normalize 15 | if scale is not None and normalize is False: 16 | raise ValueError('normalize should be True if scale is passed') 17 | if scale is None: 18 | scale = 2 * np.pi 19 | self.scale = scale 20 | self.eps = eps 21 | 22 | 23 | def call(self, mask): 24 | not_mask = tf.cast(~mask, tf.float32) 25 | y_embed = tf.math.cumsum(not_mask, axis=1) 26 
| x_embed = tf.math.cumsum(not_mask, axis=2) 27 | 28 | if self.normalize: 29 | y_embed = y_embed / (y_embed[:, -1:, :] + self.eps) * self.scale 30 | x_embed = x_embed / (x_embed[:, :, -1:] + self.eps) * self.scale 31 | 32 | dim_t = tf.range(self.num_pos_features, dtype=tf.float32) 33 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_features) 34 | 35 | pos_x = x_embed[..., tf.newaxis] / dim_t 36 | pos_y = y_embed[..., tf.newaxis] / dim_t 37 | 38 | pos_x = tf.stack([tf.math.sin(pos_x[..., 0::2]), 39 | tf.math.cos(pos_x[..., 1::2])], axis=4) 40 | 41 | pos_y = tf.stack([tf.math.sin(pos_y[..., 0::2]), 42 | tf.math.cos(pos_y[..., 1::2])], axis=4) 43 | 44 | 45 | shape = [tf.shape(pos_x)[i] for i in range(3)] + [-1] 46 | pos_x = tf.reshape(pos_x, shape) 47 | pos_y = tf.reshape(pos_y, shape) 48 | 49 | pos_emb = tf.concat([pos_y, pos_x], axis=3) 50 | return pos_emb 51 | -------------------------------------------------------------------------------- /detr_tf/networks/resnet_backbone.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import ZeroPadding2D, Conv2D, ReLU, MaxPool2D 3 | 4 | from .custom_layers import FrozenBatchNorm2D 5 | 6 | 7 | class ResNetBase(tf.keras.Model): 8 | def __init__(self, **kwargs): 9 | super().__init__(**kwargs) 10 | 11 | self.pad1 = ZeroPadding2D(3, name='pad1') 12 | self.conv1 = Conv2D(64, kernel_size=7, strides=2, padding='valid', 13 | use_bias=False, name='conv1') 14 | self.bn1 = FrozenBatchNorm2D(name='bn1') 15 | self.relu = ReLU(name='relu') 16 | self.pad2 = ZeroPadding2D(1, name='pad2') 17 | self.maxpool = MaxPool2D(pool_size=3, strides=2, padding='valid') 18 | 19 | 20 | def call(self, x): 21 | x = self.pad1(x) 22 | x = self.conv1(x) 23 | x = self.bn1(x) 24 | x = self.relu(x) 25 | x = self.pad2(x) 26 | x = self.maxpool(x) 27 | 28 | x = self.layer1(x) 29 | x = self.layer2(x) 30 | x = self.layer3(x) 31 | x = self.layer4(x) 32 | return x 33 | 34 | 35 | class ResNet50Backbone(ResNetBase): 36 | def __init__(self, replace_stride_with_dilation=[False, False, False], **kwargs): 37 | super().__init__(**kwargs) 38 | 39 | self.layer1 = ResidualBlock(num_bottlenecks=3, dim1=64, dim2=256, strides=1, 40 | replace_stride_with_dilation=False, name='layer1') 41 | self.layer2 = ResidualBlock(num_bottlenecks=4, dim1=128, dim2=512, strides=2, 42 | replace_stride_with_dilation=replace_stride_with_dilation[0], 43 | name='layer2') 44 | self.layer3 = ResidualBlock(num_bottlenecks=6, dim1=256, dim2=1024, strides=2, 45 | replace_stride_with_dilation=replace_stride_with_dilation[1], 46 | name='layer3') 47 | self.layer4 = ResidualBlock(num_bottlenecks=3, dim1=512, dim2=2048, strides=2, 48 | replace_stride_with_dilation=replace_stride_with_dilation[2], 49 | name='layer4') 50 | 51 | 52 | class ResNet101Backbone(ResNetBase): 53 | def __init__(self, replace_stride_with_dilation=[False, False, False], **kwargs): 54 | super().__init__(**kwargs) 55 | 56 | self.layer1 = ResidualBlock(num_bottlenecks=3, dim1=64, dim2=256, strides=1, 57 | replace_stride_with_dilation=False, name='layer1') 58 | self.layer2 = ResidualBlock(num_bottlenecks=4, dim1=128, dim2=512, strides=2, 59 | replace_stride_with_dilation=replace_stride_with_dilation[0], 60 | name='layer2') 61 | self.layer3 = ResidualBlock(num_bottlenecks=23, dim1=256, dim2=1024, strides=2, 62 | replace_stride_with_dilation=replace_stride_with_dilation[1], 63 | name='layer3') 64 | self.layer4 = ResidualBlock(num_bottlenecks=3, dim1=512, dim2=2048, 
strides=2, 65 | replace_stride_with_dilation=replace_stride_with_dilation[2], 66 | name='layer4') 67 | 68 | 69 | class ResidualBlock(tf.keras.Model): 70 | def __init__(self, num_bottlenecks, dim1, dim2, strides=1, 71 | replace_stride_with_dilation=False, **kwargs): 72 | super().__init__(**kwargs) 73 | 74 | if replace_stride_with_dilation: 75 | strides = 1 76 | dilation = 2 77 | else: 78 | dilation = 1 79 | 80 | self.bottlenecks = [BottleNeck(dim1, dim2, strides=strides, 81 | downsample=True, name='0')] 82 | 83 | for idx in range(1, num_bottlenecks): 84 | self.bottlenecks.append(BottleNeck(dim1, dim2, name=str(idx), 85 | dilation=dilation)) 86 | 87 | 88 | def call(self, x): 89 | for btn in self.bottlenecks: 90 | x = btn(x) 91 | return x 92 | 93 | 94 | class BottleNeck(tf.keras.Model): 95 | def __init__(self, dim1, dim2, strides=1, dilation=1, downsample=False, **kwargs): 96 | super().__init__(**kwargs) 97 | self.downsample = downsample 98 | self.pad = ZeroPadding2D(dilation) 99 | self.relu = ReLU(name='relu') 100 | 101 | self.conv1 = Conv2D(dim1, kernel_size=1, use_bias=False, name='conv1') 102 | self.bn1 = FrozenBatchNorm2D(name='bn1') 103 | 104 | self.conv2 = Conv2D(dim1, kernel_size=3, strides=strides, dilation_rate=dilation, 105 | use_bias=False, name='conv2') 106 | self.bn2 = FrozenBatchNorm2D(name='bn2') 107 | 108 | self.conv3 = Conv2D(dim2, kernel_size=1, use_bias=False, name='conv3') 109 | self.bn3 = FrozenBatchNorm2D(name='bn3') 110 | 111 | self.downsample_conv = Conv2D(dim2, kernel_size=1, strides=strides, 112 | use_bias=False, name='downsample_0') 113 | self.downsample_bn = FrozenBatchNorm2D(name='downsample_1') 114 | 115 | 116 | def call(self, x): 117 | identity = x 118 | 119 | out = self.conv1(x) 120 | out = self.bn1(out) 121 | out = self.relu(out) 122 | 123 | out = self.pad(out) 124 | out = self.conv2(out) 125 | out = self.bn2(out) 126 | out = self.relu(out) 127 | 128 | out = self.conv3(out) 129 | out = self.bn3(out) 130 | 131 | if self.downsample: 132 | identity = self.downsample_bn(self.downsample_conv(x)) 133 | 134 | out += identity 135 | out = self.relu(out) 136 | 137 | return out -------------------------------------------------------------------------------- /detr_tf/networks/transformer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Dropout, Activation, LayerNormalization 3 | 4 | from .custom_layers import Linear 5 | 6 | 7 | class Transformer(tf.keras.Model): 8 | def __init__(self, model_dim=256, num_heads=8, num_encoder_layers=6, 9 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 10 | activation='relu', normalize_before=False, 11 | return_intermediate_dec=False, **kwargs): 12 | super().__init__(**kwargs) 13 | 14 | self.model_dim = model_dim 15 | self.num_heads = num_heads 16 | 17 | enc_norm = LayerNormalization(epsilon=1e-5, name='norm_pre') if normalize_before else None 18 | self.encoder = TransformerEncoder(model_dim, num_heads, dim_feedforward, 19 | dropout, activation, normalize_before, enc_norm, 20 | num_encoder_layers, name='encoder') 21 | 22 | dec_norm = LayerNormalization(epsilon=1e-5, name='norm') 23 | self.decoder = TransformerDecoder(model_dim, num_heads, dim_feedforward, 24 | dropout, activation, normalize_before, dec_norm, 25 | num_decoder_layers, name='decoder', 26 | return_intermediate=return_intermediate_dec) 27 | 28 | 29 | def call(self, source, mask, query_encoding, pos_encoding, training=False): 30 | 31 | batch_size, rows, cols = 
[tf.shape(source)[i] for i in range(3)] 32 | source = tf.reshape(source, [batch_size, -1, self.model_dim]) 33 | source = tf.transpose(source, [1, 0, 2]) 34 | 35 | 36 | 37 | pos_encoding = tf.reshape(pos_encoding, [batch_size, -1, self.model_dim]) 38 | pos_encoding = tf.transpose(pos_encoding, [1, 0, 2]) 39 | 40 | query_encoding = tf.expand_dims(query_encoding, axis=1) 41 | query_encoding = tf.tile(query_encoding, [1, batch_size, 1]) 42 | 43 | mask = tf.reshape(mask, [batch_size, -1]) 44 | 45 | target = tf.zeros_like(query_encoding) 46 | 47 | memory = self.encoder(source, source_key_padding_mask=mask, 48 | pos_encoding=pos_encoding, training=training) 49 | hs = self.decoder(target, memory, memory_key_padding_mask=mask, 50 | pos_encoding=pos_encoding, query_encoding=query_encoding, 51 | training=training) 52 | 53 | hs = tf.transpose(hs, [0, 2, 1, 3]) 54 | memory = tf.transpose(memory, [1, 0, 2]) 55 | memory = tf.reshape(memory, [batch_size, rows, cols, self.model_dim]) 56 | 57 | return hs, memory 58 | 59 | 60 | class TransformerEncoder(tf.keras.Model): 61 | def __init__(self, model_dim=256, num_heads=8, dim_feedforward=2048, 62 | dropout=0.1, activation='relu', normalize_before=False, norm=None, 63 | num_encoder_layers=6, **kwargs): 64 | super().__init__(**kwargs) 65 | 66 | self.enc_layers = [EncoderLayer(model_dim, num_heads, dim_feedforward, 67 | dropout, activation, normalize_before, 68 | name='layer_%d'%i) 69 | for i in range(num_encoder_layers)] 70 | 71 | self.norm = norm 72 | 73 | 74 | def call(self, source, mask=None, source_key_padding_mask=None, 75 | pos_encoding=None, training=False): 76 | x = source 77 | 78 | 79 | for layer in self.enc_layers: 80 | x = layer(x, source_mask=mask, source_key_padding_mask=source_key_padding_mask, 81 | pos_encoding=pos_encoding, training=training) 82 | 83 | if self.norm: 84 | x = self.norm(x) 85 | 86 | return x 87 | 88 | 89 | class TransformerDecoder(tf.keras.Model): 90 | def __init__(self, model_dim=256, num_heads=8, dim_feedforward=2048, 91 | dropout=0.1, activation='relu', normalize_before=False, norm=None, 92 | num_decoder_layers=6, return_intermediate=False, **kwargs): 93 | super().__init__(**kwargs) 94 | 95 | self.dec_layers = [DecoderLayer(model_dim, num_heads, dim_feedforward, 96 | dropout, activation, normalize_before, 97 | name='layer_%d'%i) 98 | for i in range(num_decoder_layers)] 99 | 100 | self.norm = norm 101 | self.return_intermediate = return_intermediate 102 | 103 | 104 | def call(self, target, memory, target_mask=None, memory_mask=None, 105 | target_key_padding_mask=None, memory_key_padding_mask=None, 106 | pos_encoding=None, query_encoding=None, training=False): 107 | 108 | x = target 109 | intermediate = [] 110 | 111 | 112 | for layer in self.dec_layers: 113 | x = layer(x, memory, 114 | target_mask=target_mask, 115 | memory_mask=memory_mask, 116 | target_key_padding_mask=target_key_padding_mask, 117 | memory_key_padding_mask=memory_key_padding_mask, 118 | pos_encoding=pos_encoding, 119 | query_encoding=query_encoding) 120 | 121 | if self.return_intermediate: 122 | if self.norm: 123 | intermediate.append(self.norm(x)) 124 | else: 125 | intermediate.append(x) 126 | 127 | if self.return_intermediate: 128 | return tf.stack(intermediate, axis=0) 129 | 130 | if self.norm: 131 | x = self.norm(x) 132 | 133 | return x 134 | 135 | 136 | class EncoderLayer(tf.keras.layers.Layer): 137 | def __init__(self, model_dim=256, num_heads=8, dim_feedforward=2048, 138 | dropout=0.1, activation='relu', normalize_before=False, 139 | **kwargs): 140 | 
super().__init__(**kwargs) 141 | 142 | self.self_attn = MultiHeadAttention(model_dim, num_heads, dropout=dropout, 143 | name='self_attn') 144 | 145 | self.dropout = Dropout(dropout) 146 | self.activation = Activation(activation) 147 | 148 | self.linear1 = Linear(dim_feedforward, name='linear1') 149 | self.linear2 = Linear(model_dim, name='linear2') 150 | 151 | self.norm1 = LayerNormalization(epsilon=1e-5, name='norm1') 152 | self.norm2 = LayerNormalization(epsilon=1e-5, name='norm2') 153 | 154 | self.normalize_before = normalize_before 155 | 156 | 157 | def call(self, source, source_mask=None, source_key_padding_mask=None, 158 | pos_encoding=None, training=False): 159 | 160 | 161 | if pos_encoding is None: 162 | query = key = source 163 | else: 164 | query = key = source + pos_encoding 165 | 166 | attn_source = self.self_attn((query, key, source), attn_mask=source_mask, 167 | key_padding_mask=source_key_padding_mask, 168 | need_weights=False) 169 | source += self.dropout(attn_source, training=training) 170 | source = self.norm1(source) 171 | 172 | x = self.linear1(source) 173 | x = self.activation(x) 174 | x = self.dropout(x, training=training) 175 | x = self.linear2(x) 176 | source += self.dropout(x, training=training) 177 | source = self.norm2(source) 178 | 179 | return source 180 | 181 | 182 | 183 | class DecoderLayer(tf.keras.layers.Layer): 184 | def __init__(self, model_dim=256, num_heads=8, dim_feedforward=2048, 185 | dropout=0.1, activation='relu', normalize_before=False, 186 | **kwargs): 187 | super().__init__(**kwargs) 188 | 189 | self.self_attn = MultiHeadAttention(model_dim, num_heads, dropout=dropout, 190 | name='self_attn') 191 | self.multihead_attn = MultiHeadAttention(model_dim, num_heads, dropout=dropout, 192 | name='multihead_attn') 193 | 194 | self.dropout = Dropout(dropout) 195 | self.activation = Activation(activation) 196 | 197 | self.linear1 = Linear(dim_feedforward, name='linear1') 198 | self.linear2 = Linear(model_dim, name='linear2') 199 | 200 | self.norm1 = LayerNormalization(epsilon=1e-5, name='norm1') 201 | self.norm2 = LayerNormalization(epsilon=1e-5, name='norm2') 202 | self.norm3 = LayerNormalization(epsilon=1e-5, name='norm3') 203 | 204 | self.normalize_before = normalize_before 205 | 206 | 207 | def call(self, target, memory, target_mask=None, memory_mask=None, 208 | target_key_padding_mask=None, memory_key_padding_mask=None, 209 | pos_encoding=None, query_encoding=None, training=False): 210 | 211 | query_tgt = key_tgt = target + query_encoding 212 | attn_target = self.self_attn((query_tgt, key_tgt, target), attn_mask=target_mask, 213 | key_padding_mask=target_key_padding_mask, 214 | need_weights=False) 215 | target += self.dropout(attn_target, training=training) 216 | target = self.norm1(target) 217 | 218 | query_tgt = target + query_encoding 219 | key_mem = memory + pos_encoding 220 | 221 | attn_target2 = self.multihead_attn((query_tgt, key_mem, memory), attn_mask=memory_mask, 222 | key_padding_mask=memory_key_padding_mask, 223 | need_weights=False) 224 | target += self.dropout(attn_target2, training=training) 225 | target = self.norm2(target) 226 | 227 | x = self.linear1(target) 228 | x = self.activation(x) 229 | x = self.dropout(x, training=training) 230 | x = self.linear2(x) 231 | target += self.dropout(x, training=training) 232 | target = self.norm3(target) 233 | 234 | return target 235 | 236 | 237 | class MultiHeadAttention(tf.keras.layers.Layer): 238 | def __init__(self, model_dim, num_heads, dropout=0.0, **kwargs): 239 | super().__init__(**kwargs) 
240 | 241 | self.model_dim = model_dim 242 | self.num_heads = num_heads 243 | 244 | assert model_dim % num_heads == 0 245 | self.head_dim = model_dim // num_heads 246 | 247 | self.dropout = Dropout(rate=dropout) 248 | 249 | 250 | def build(self, input_shapes): 251 | in_dim = sum([shape[-1] for shape in input_shapes[:3]]) 252 | 253 | self.in_proj_weight = self.add_weight( 254 | name='in_proj_kernel', shape=(in_dim, self.model_dim), 255 | initializer=tf.keras.initializers.GlorotUniform(), dtype=tf.float32, trainable=True 256 | ) 257 | self.in_proj_bias = self.add_weight( 258 | name='in_proj_bias', shape=(in_dim,), 259 | initializer=tf.keras.initializers.GlorotUniform(), dtype=tf.float32, trainable=True 260 | ) 261 | self.out_proj_weight = self.add_weight( 262 | name='out_proj_kernel', shape=(self.model_dim, self.model_dim), 263 | initializer=tf.keras.initializers.GlorotUniform(), dtype=tf.float32, trainable=True 264 | ) 265 | self.out_proj_bias = self.add_weight( 266 | name='out_proj_bias', shape=(self.model_dim,), 267 | initializer=tf.keras.initializers.GlorotUniform(), dtype=tf.float32, trainable=True 268 | ) 269 | 270 | 271 | 272 | 273 | #self.in_proj_weight = tf.Variable( 274 | # tf.zeros((in_dim, self.model_dim), dtype=tf.float32), name='in_proj_kernel') 275 | #self.in_proj_bias = tf.Variable(tf.zeros((in_dim,), dtype=tf.float32), 276 | # name='in_proj_bias') 277 | 278 | #self.out_proj_weight = tf.Variable( 279 | # tf.zeros((self.model_dim, self.model_dim), dtype=tf.float32), name='out_proj_kernel') 280 | #self.out_proj_bias = tf.Variable( 281 | # tf.zeros((self.model_dim,), dtype=tf.float32), name='out_proj_bias') 282 | 283 | 284 | 285 | def call(self, inputs, attn_mask=None, key_padding_mask=None, 286 | need_weights=True, training=False): 287 | 288 | query, key, value = inputs 289 | 290 | batch_size = tf.shape(query)[1] 291 | target_len = tf.shape(query)[0] 292 | source_len = tf.shape(key)[0] 293 | 294 | W = self.in_proj_weight[:self.model_dim, :] 295 | b = self.in_proj_bias[:self.model_dim] 296 | 297 | WQ = tf.matmul(query, W, transpose_b=True) + b 298 | 299 | W = self.in_proj_weight[self.model_dim:2*self.model_dim, :] 300 | b = self.in_proj_bias[self.model_dim:2*self.model_dim] 301 | WK = tf.matmul(key, W, transpose_b=True) + b 302 | 303 | W = self.in_proj_weight[2*self.model_dim:, :] 304 | b = self.in_proj_bias[2*self.model_dim:] 305 | WV = tf.matmul(value, W, transpose_b=True) + b 306 | 307 | WQ *= float(self.head_dim) ** -0.5 308 | WQ = tf.reshape(WQ, [target_len, batch_size * self.num_heads, self.head_dim]) 309 | WQ = tf.transpose(WQ, [1, 0, 2]) 310 | 311 | WK = tf.reshape(WK, [source_len, batch_size * self.num_heads, self.head_dim]) 312 | WK = tf.transpose(WK, [1, 0, 2]) 313 | 314 | WV = tf.reshape(WV, [source_len, batch_size * self.num_heads, self.head_dim]) 315 | WV = tf.transpose(WV, [1, 0, 2]) 316 | 317 | attn_output_weights = tf.matmul(WQ, WK, transpose_b=True) 318 | 319 | if attn_mask is not None: 320 | attn_output_weights += attn_mask 321 | 322 | """ 323 | if key_padding_mask is not None: 324 | attn_output_weights = tf.reshape(attn_output_weights, 325 | [batch_size, self.num_heads, target_len, source_len]) 326 | 327 | key_padding_mask = tf.expand_dims(key_padding_mask, 1) 328 | key_padding_mask = tf.expand_dims(key_padding_mask, 2) 329 | key_padding_mask = tf.tile(key_padding_mask, [1, self.num_heads, target_len, 1]) 330 | 331 | #print("before attn_output_weights", attn_output_weights.shape) 332 | attn_output_weights = tf.where(key_padding_mask, 333 | 
tf.zeros_like(attn_output_weights) + float('-inf'), 334 | attn_output_weights) 335 | attn_output_weights = tf.reshape(attn_output_weights, 336 | [batch_size * self.num_heads, target_len, source_len]) 337 | """ 338 | 339 | 340 | attn_output_weights = tf.nn.softmax(attn_output_weights, axis=-1) 341 | attn_output_weights = self.dropout(attn_output_weights, training=training) 342 | 343 | attn_output = tf.matmul(attn_output_weights, WV) 344 | attn_output = tf.transpose(attn_output, [1, 0, 2]) 345 | attn_output = tf.reshape(attn_output, [target_len, batch_size, self.model_dim]) 346 | attn_output = tf.matmul(attn_output, self.out_proj_weight, 347 | transpose_b=True) + self.out_proj_bias 348 | 349 | if need_weights: 350 | attn_output_weights = tf.reshape(attn_output_weights, 351 | [batch_size, self.num_heads, target_len, source_len]) 352 | # Return the average weight over the heads 353 | avg_weights = tf.reduce_mean(attn_output_weights, axis=1) 354 | return attn_output, avg_weights 355 | 356 | return attn_output 357 | -------------------------------------------------------------------------------- /detr_tf/networks/weights.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | 5 | WEIGHT_NAME_TO_CKPT = { 6 | "detr": [ 7 | "https://storage.googleapis.com/visualbehavior-publicweights/detr/checkpoint", 8 | "https://storage.googleapis.com/visualbehavior-publicweights/detr/detr.ckpt.data-00000-of-00001", 9 | "https://storage.googleapis.com/visualbehavior-publicweights/detr/detr.ckpt.index" 10 | ] 11 | } 12 | 13 | def load_weights(model, weights: str): 14 | """ Load weights on a given model 15 | Weights are supposed to be stored in the `weights` folder at the root of the repository. If the weights 16 | do not exist locally but are publicly known, they will be downloaded from gcloud.
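    Illustrative usage ("detr" is the only public checkpoint declared above in WEIGHT_NAME_TO_CKPT):
        load_weights(model, "detr")  # downloads the checkpoint files into weights/detr/ when missing, then restores them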
17 | """ 18 | if not os.path.exists('weights'): 19 | os.makedirs('weights') 20 | 21 | if "ckpt" in "weights": 22 | model.load(weights) 23 | elif weights in WEIGHT_NAME_TO_CKPT: 24 | wdir = f"weights/{weights}" 25 | if not os.path.exists(wdir): 26 | os.makedirs(wdir) 27 | for f in WEIGHT_NAME_TO_CKPT[weights]: 28 | fname = f.split("/")[-1] 29 | if not os.path.exists(os.path.join(wdir, fname)): 30 | print("Download....", f) 31 | r = requests.get(f, allow_redirects=True) 32 | open(os.path.join(wdir, fname), 'wb').write(r.content) 33 | print("Load weights from", os.path.join(wdir, f"{weights}.ckpt")) 34 | l = model.load_weights(os.path.join(wdir, f"{weights}.ckpt")) 35 | l.expect_partial() 36 | else: 37 | raise Exception(f'Cant load the weights: {weights}') -------------------------------------------------------------------------------- /detr_tf/optimizers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def disable_batchnorm_training(model): 4 | for l in model.layers: 5 | if hasattr(l, "layers"): 6 | disable_batchnorm_training(l) 7 | elif isinstance(l, tf.keras.layers.BatchNormalization): 8 | l.trainable = False 9 | 10 | def get_transformers_trainable_variables(model, exclude=[]): 11 | transformers_variables = [] 12 | 13 | # Transformers variables 14 | transformers_variables = model.get_layer("detr").get_layer("transformer").trainable_variables 15 | 16 | for layer in model.layers[2:]: 17 | if layer.name not in exclude: 18 | transformers_variables += layer.trainable_variables 19 | else: 20 | pass 21 | 22 | return transformers_variables 23 | 24 | 25 | def get_backbone_trainable_variables(model): 26 | backbone_variables = [] 27 | # layer [1] is the detr model including the backbone and the transformers 28 | 29 | detr = model.get_layer("detr") 30 | tr_index = [l.name for l in detr.layers].index('transformer') 31 | 32 | for l, layer in enumerate(detr.layers): 33 | if l != tr_index: 34 | backbone_variables += layer.trainable_variables 35 | 36 | return backbone_variables 37 | 38 | 39 | def get_nlayers_trainables_variables(model, nlayers_names): 40 | nlayers_variables = [] 41 | for nlayer_name in nlayers_names: 42 | nlayers_variables += model.get_layer(nlayer_name).trainable_variables 43 | return nlayers_variables 44 | 45 | 46 | def get_trainable_variables(model, config): 47 | 48 | disable_batchnorm_training(model) 49 | 50 | backbone_variables = [] 51 | transformers_variables = [] 52 | nlayers_variables = [] 53 | 54 | 55 | # Retrieve the gradient ofr each trainable variables 56 | #if config.train_backbone: 57 | backbone_variables = get_backbone_trainable_variables(model) 58 | #if config.train_transformers: 59 | transformers_variables = get_transformers_trainable_variables(model, exclude=config.nlayers) 60 | #if config.train_nlayers: 61 | nlayers_variables = get_nlayers_trainables_variables(model, config.nlayers) 62 | 63 | 64 | return backbone_variables, transformers_variables, nlayers_variables 65 | 66 | 67 | def setup_optimizers(model, config): 68 | """ Method call by the Scheduler to init user data 69 | """ 70 | @tf.function 71 | def get_backbone_learning_rate(): 72 | return config.backbone_lr 73 | 74 | @tf.function 75 | def get_transformers_learning_rate(): 76 | return config.transformers_lr 77 | 78 | @tf.function 79 | def get_nlayers_learning_rate(): 80 | return config.nlayers_lr 81 | 82 | # Disable batch norm on the backbone 83 | disable_batchnorm_training(model) 84 | 85 | # Optimizers 86 | backbone_optimizer = 
tf.keras.optimizers.Adam(learning_rate=get_backbone_learning_rate, clipnorm=config.gradient_norm_clipping) 87 | transformers_optimizer = tf.keras.optimizers.Adam(learning_rate=get_transformers_learning_rate, clipnorm=config.gradient_norm_clipping) 88 | nlayers_optimizer = tf.keras.optimizers.Adam(learning_rate=get_nlayers_learning_rate, clipnorm=config.gradient_norm_clipping) 89 | 90 | # Set trainable variables 91 | 92 | backbone_variables, transformers_variables, nlayers_variables = [], [], [] 93 | 94 | backbone_variables = get_backbone_trainable_variables(model) 95 | transformers_variables = get_transformers_trainable_variables(model, exclude=config.nlayers) 96 | nlayers_variables = get_nlayers_trainables_variables(model, config.nlayers) 97 | 98 | 99 | return { 100 | "backbone_optimizer": backbone_optimizer, 101 | "transformers_optimizer": transformers_optimizer, 102 | "nlayers_optimizer": nlayers_optimizer, 103 | 104 | "backbone_variables": backbone_variables, 105 | "transformers_variables": transformers_variables, 106 | "nlayers_variables": nlayers_variables, 107 | } 108 | 109 | 110 | def gather_gradient(model, optimizers, total_loss, tape, config, log): 111 | 112 | backbone_variables, transformers_variables, nlayers_variables = get_trainable_variables(model, config) 113 | trainables_variables = backbone_variables + transformers_variables + nlayers_variables 114 | 115 | gradients = tape.gradient(total_loss, trainables_variables) 116 | 117 | # Retrieve the gradients from the tap 118 | backbone_gradients = gradients[:len(optimizers["backbone_variables"])] 119 | transformers_gradients = gradients[len(optimizers["backbone_variables"]):len(optimizers["backbone_variables"])+len(optimizers["transformers_variables"])] 120 | nlayers_gradients = gradients[len(optimizers["backbone_variables"])+len(optimizers["transformers_variables"]):] 121 | 122 | gradient_steps = {} 123 | 124 | gradient_steps["backbone"] = {"gradients": backbone_gradients} 125 | gradient_steps["transformers"] = {"gradients": transformers_gradients} 126 | gradient_steps["nlayers"] = {"gradients": nlayers_gradients} 127 | 128 | 129 | log.update({"backbone_lr": optimizers["backbone_optimizer"]._serialize_hyperparameter("learning_rate")}) 130 | log.update({"transformers_lr": optimizers["transformers_optimizer"]._serialize_hyperparameter("learning_rate")}) 131 | log.update({"nlayers_lr": optimizers["nlayers_optimizer"]._serialize_hyperparameter("learning_rate")}) 132 | 133 | return gradient_steps 134 | 135 | 136 | 137 | def aggregate_grad_and_apply(name, optimizers, gradients, step, config): 138 | 139 | gradient_aggregate = None 140 | if config.target_batch is not None: 141 | gradient_aggregate = int(config.target_batch // config.batch_size) 142 | 143 | gradient_name = "{}_gradients".format(name) 144 | optimizer_name = "{}_optimizer".format(name) 145 | variables_name = "{}_variables".format(name) 146 | train_part_name = "train_{}".format(name) 147 | 148 | if getattr(config, train_part_name): 149 | 150 | # Init the aggregate gradient 151 | if gradient_aggregate is not None and step % gradient_aggregate == 0: 152 | optimizers[gradient_name] = [tf.zeros_like(tv) for tv in optimizers[variables_name]] 153 | 154 | 155 | if gradient_aggregate is not None: 156 | # Aggregate the gradient 157 | optimizers[gradient_name] = [(gradient+n_gradient) if n_gradient is not None else None for gradient, n_gradient in zip(optimizers[gradient_name], gradients) ] 158 | else: 159 | optimizers[gradient_name] = gradients 160 | 161 | # Apply gradient if no 
gradient aggregate or if we finished gathering gradient oversteps 162 | if gradient_aggregate is None or (step+1) % gradient_aggregate == 0: 163 | optimizers[optimizer_name].apply_gradients(zip(optimizers[gradient_name], optimizers[variables_name])) -------------------------------------------------------------------------------- /detr_tf/training.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from .optimizers import gather_gradient, aggregate_grad_and_apply 4 | from .logger.training_logging import valid_log, train_log 5 | from .loss.loss import get_losses 6 | import time 7 | import wandb 8 | 9 | @tf.function 10 | def run_train_step(model, images, t_bbox, t_class, optimizers, config): 11 | 12 | if config.target_batch is not None: 13 | gradient_aggregate = int(config.target_batch // config.batch_size) 14 | else: 15 | gradient_aggregate = 1 16 | 17 | with tf.GradientTape() as tape: 18 | m_outputs = model(images, training=True) 19 | total_loss, log = get_losses(m_outputs, t_bbox, t_class, config) 20 | total_loss = total_loss / gradient_aggregate 21 | 22 | # Compute gradient for each part of the network 23 | gradient_steps = gather_gradient(model, optimizers, total_loss, tape, config, log) 24 | 25 | return m_outputs, total_loss, log, gradient_steps 26 | 27 | 28 | @tf.function 29 | def run_val_step(model, images, t_bbox, t_class, config): 30 | m_outputs = model(images, training=False) 31 | total_loss, log = get_losses(m_outputs, t_bbox, t_class, config) 32 | return m_outputs, total_loss, log 33 | 34 | 35 | def fit(model, train_dt, optimizers, config, epoch_nb, class_names): 36 | """ Train the model for one epoch 37 | """ 38 | # Aggregate the gradient for bigger batch and better convergence 39 | gradient_aggregate = None 40 | if config.target_batch is not None: 41 | gradient_aggregate = int(config.target_batch // config.batch_size) 42 | t = None 43 | for epoch_step , (images, t_bbox, t_class) in enumerate(train_dt): 44 | 45 | # Run the prediction and retrieve the gradient step for each part of the network 46 | m_outputs, total_loss, log, gradient_steps = run_train_step(model, images, t_bbox, t_class, optimizers, config) 47 | 48 | # Load the predictions 49 | if config.log: 50 | train_log(images, t_bbox, t_class, m_outputs, config, config.global_step, class_names, prefix="train/") 51 | 52 | # Aggregate and apply the gradient 53 | for name in gradient_steps: 54 | aggregate_grad_and_apply(name, optimizers, gradient_steps[name]["gradients"], epoch_step, config) 55 | 56 | # Log every 100 steps 57 | if epoch_step % 100 == 0: 58 | t = t if t is not None else time.time() 59 | elapsed = time.time() - t 60 | print(f"Epoch: [{epoch_nb}], \t Step: [{epoch_step}], \t ce: [{log['label_cost']:.2f}] \t giou : [{log['giou_loss']:.2f}] \t l1 : [{log['l1_loss']:.2f}] \t time : [{elapsed:.2f}]") 61 | if config.log: 62 | wandb.log({f"train/{k}":log[k] for k in log}, step=config.global_step) 63 | t = time.time() 64 | 65 | config.global_step += 1 66 | 67 | 68 | def eval(model, valid_dt, config, class_name, evaluation_step=200): 69 | """ Evaluate the model on the validation set 70 | """ 71 | t = None 72 | for val_step, (images, t_bbox, t_class) in enumerate(valid_dt): 73 | # Run prediction 74 | m_outputs, total_loss, log = run_val_step(model, images, t_bbox, t_class, config) 75 | # Log the predictions 76 | if config.log: 77 | valid_log(images, t_bbox, t_class, m_outputs, config, val_step, config.global_step, class_name, 
evaluation_step=evaluation_step, prefix="train/") 78 | # Log the metrics 79 | if config.log and val_step == 0: 80 | wandb.log({f"val/{k}":log[k] for k in log}, step=config.global_step) 81 | # Log the progress 82 | if val_step % 10 == 0: 83 | t = t if t is not None else time.time() 84 | elapsed = time.time() - t 85 | print(f"Validation step: [{val_step}], \t ce: [{log['label_cost']:.2f}] \t giou : [{log['giou_loss']:.2f}] \t l1 : [{log['l1_loss']:.2f}] \t time : [{elapsed:.2f}]") 86 | if val_step+1 >= evaluation_step: 87 | break 88 | -------------------------------------------------------------------------------- /detr_tf/training_config.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import argparse 3 | import os 4 | 5 | 6 | def training_config_parser(): 7 | """ Training config class can be overide using the script arguments 8 | """ 9 | parser = argparse.ArgumentParser() 10 | 11 | # Dataset info 12 | parser.add_argument("--data_dir", type=str, required=False, help="Path to the dataset directory") 13 | parser.add_argument("--img_dir", type=str, required=False, help="Image directory relative to data_dir") 14 | parser.add_argument("--ann_file", type=str, required=False, help="Annotation file relative to data_dir") 15 | parser.add_argument("--ann_dir", type=str, required=False, help="Annotation directory relative to data_dir") 16 | 17 | parser.add_argument("--background_class", type=int, required=False, default=0, help="Default background class") 18 | 19 | # What to train 20 | parser.add_argument("--train_backbone", action='store_true', required=False, default=False, help="Train backbone") 21 | parser.add_argument("--train_transformers", action='store_true', required=False, default=False, help="Train transformers") 22 | parser.add_argument("--train_nlayers", action='store_true', required=False, default=False, help="Train new layers") 23 | 24 | # How to train 25 | parser.add_argument("--finetuning", default=False, required=False, action='store_true', help="Load the model weight before to train") 26 | parser.add_argument("--batch_size", type=int, required=False, default=1, help="Batch size to use to train the model") 27 | parser.add_argument("--gradient_norm_clipping", type=float, required=False, default=0.1, help="Gradient norm clipping") 28 | parser.add_argument("--target_batch", type=int, required=False, default=None, help="When running on a single GPU, aggretate the gradient before to apply.") 29 | 30 | # Learning rate 31 | parser.add_argument("--backbone_lr", type=bool, required=False, default=1e-5, help="Train backbone") 32 | parser.add_argument("--transformers_lr", type=bool, required=False, default=1e-4, help="Train transformers") 33 | parser.add_argument("--nlayers_lr", type=bool, required=False, default=1e-4, help="Train new layers") 34 | 35 | # Logging 36 | parser.add_argument("--log", required=False, action="store_true", default=False, help="Log into wandb") 37 | 38 | return parser 39 | 40 | 41 | class TrainingConfig(): 42 | 43 | def __init__(self): 44 | 45 | # Dataset info 46 | self.data_dir, self.img_dir, self.ann_dir, self.ann_file = None, None, None, None 47 | self.data = DataConfig(data_dir=None, img_dir=None, ann_file=None, ann_dir=None) 48 | self.background_class = 0 49 | self.image_size = 376, 672 50 | 51 | # What to train 52 | self.train_backbone = False 53 | self.train_transformers = False 54 | self.train_nlayers = False 55 | 56 | # How to train 57 | self.finetuning = False 58 | self.batch_size = 1 59 | 
self.gradient_norm_clipping = 0.1 60 | # Batch aggregate before to backprop 61 | self.target_batch = 1 62 | 63 | # Learning rate 64 | # Set as tf.Variable so that the variable can be update during the training while 65 | # keeping the same graph 66 | self.backbone_lr = tf.Variable(1e-5) 67 | self.transformers_lr = tf.Variable(1e-4) 68 | self.nlayers_lr = tf.Variable(1e-4) 69 | self.nlayers = [] 70 | 71 | # Training progress 72 | self.global_step = 0 73 | self.log = False 74 | 75 | # Pipeline variables 76 | self.normalized_method = "torch_resnet" 77 | 78 | 79 | def add_nlayers(self, layers): 80 | """ Set the new layers to train on the training config 81 | """ 82 | self.nlayers = [l.name for l in layers] 83 | 84 | 85 | def update_from_args(self, args): 86 | """ Update the training config from args 87 | """ 88 | args = vars(args) 89 | for key in args: 90 | if isinstance(getattr(self, key), tf.Variable): 91 | getattr(self, key).assign(args[key]) 92 | else: 93 | setattr(self, key, args[key]) 94 | 95 | # Set the config on the data class 96 | 97 | 98 | self.data = DataConfig( 99 | data_dir=self.data_dir, 100 | img_dir=self.img_dir, 101 | ann_file=self.ann_file, 102 | ann_dir=self.ann_dir 103 | ) 104 | 105 | 106 | class DataConfig(): 107 | 108 | def __init__(self, data_dir=None, img_dir=None, ann_file=None, ann_dir=None): 109 | self.data_dir = data_dir 110 | self.img_dir = os.path.join(data_dir, img_dir) if data_dir is not None and img_dir is not None else None 111 | self.ann_file = os.path.join(self.data_dir, ann_file) if ann_file is not None else None 112 | self.ann_dir = os.path.join(self.data_dir, ann_dir) if ann_dir is not None else None 113 | 114 | 115 | if __name__ == "__main__": 116 | args = training_config_parser() 117 | config = TrainingConfig() 118 | config.update_from_args(args) -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | """ Eval a model on the coco dataset 2 | """ 3 | 4 | import argparse 5 | import tensorflow as tf 6 | import os 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | 10 | from detr_tf.inference import get_model_inference 11 | from detr_tf.data.coco import load_coco_dataset 12 | from detr_tf.loss.compute_map import cal_map, calc_map, APDataObject 13 | from detr_tf.networks.detr import get_detr_model 14 | from detr_tf.bbox import xcycwh_to_xy_min_xy_max, xcycwh_to_yx_min_yx_max 15 | from detr_tf.inference import numpy_bbox_to_image 16 | from detr_tf.training_config import TrainingConfig, training_config_parser 17 | 18 | 19 | def build_model(config): 20 | """ Build the model with the pretrained weights. In this example 21 | we do not add new layers since the pretrained model is already trained on coco. 22 | See examples/finetuning_voc.py to add new layers. 23 | """ 24 | # Load the pretrained model 25 | detr = get_detr_model(config, include_top=True, weights="detr") 26 | detr.summary() 27 | return detr 28 | 29 | 30 | def eval_model(model, config, class_names, valid_dt): 31 | """ Run evaluation 32 | """ 33 | 34 | iou_thresholds = [x / 100. 
for x in range(50, 100, 5)] 35 | ap_data = { 36 | 'box' : [[APDataObject() for _ in class_names] for _ in iou_thresholds], 37 | 'mask': [[APDataObject() for _ in class_names] for _ in iou_thresholds] 38 | } 39 | it = 0 40 | 41 | for images, target_bbox, target_class in valid_dt: 42 | # Forward pass 43 | m_outputs = model(images) 44 | # Run predictions 45 | p_bbox, p_labels, p_scores = get_model_inference(m_outputs, config.background_class, bbox_format="yxyx") 46 | # Remove padding 47 | t_bbox, t_class = target_bbox[0], target_class[0] 48 | size = tf.cast(t_bbox[0][0], tf.int32) 49 | t_bbox = tf.slice(t_bbox, [1, 0], [size, 4]) 50 | t_bbox = xcycwh_to_yx_min_yx_max(t_bbox) 51 | t_class = tf.slice(t_class, [1, 0], [size, -1]) 52 | t_class = tf.squeeze(t_class, axis=-1) 53 | # Compute map 54 | cal_map(p_bbox, p_labels, p_scores, np.zeros((138, 138, len(p_bbox))), np.array(t_bbox), np.array(t_class), np.zeros((138, 138, len(t_bbox))), ap_data, iou_thresholds) 55 | print(f"Computing map.....{it}", end="\r") 56 | it += 1 57 | #if it > 10: 58 | # break 59 | 60 | # Compute the mAp over all thresholds 61 | calc_map(ap_data, iou_thresholds, class_names, print_result=True) 62 | 63 | if __name__ == "__main__": 64 | 65 | physical_devices = tf.config.list_physical_devices('GPU') 66 | if len(physical_devices) == 1: 67 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 68 | 69 | config = TrainingConfig() 70 | args = training_config_parser().parse_args() 71 | config.update_from_args(args) 72 | 73 | # Load the model with the new layers to finetune 74 | detr = build_model(config) 75 | 76 | valid_dt, class_names = load_coco_dataset(config, 1, augmentation=None) 77 | 78 | # Run training 79 | eval_model(detr, config, class_names, valid_dt) 80 | 81 | 82 | -------------------------------------------------------------------------------- /finetune_coco.py: -------------------------------------------------------------------------------- 1 | """ Example on how to finetune on COCO dataset 2 | """ 3 | 4 | import argparse 5 | import matplotlib.pyplot as plt 6 | import tensorflow as tf 7 | import numpy as np 8 | import time 9 | import os 10 | 11 | from detr_tf.data.coco import load_coco_dataset 12 | from detr_tf.networks.detr import get_detr_model 13 | from detr_tf.optimizers import setup_optimizers 14 | from detr_tf.optimizers import gather_gradient, aggregate_grad_and_apply 15 | from detr_tf.logger.training_logging import train_log, valid_log 16 | from detr_tf.loss.loss import get_losses 17 | from detr_tf.training_config import TrainingConfig, training_config_parser 18 | from detr_tf import training 19 | 20 | try: 21 | # Should be optional if --log is not set 22 | import wandb 23 | except: 24 | wandb = None 25 | 26 | 27 | import time 28 | 29 | 30 | def build_model(config): 31 | """ Build the model with the pretrained weights. In this example 32 | we do not add new layers since the pretrained model is already trained on coco. 33 | See examples/finetuning_voc.py to add new layers. 
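    The weights="detr" argument passed below refers to the public checkpoint listed in
    detr_tf/networks/weights.py; when it is not already present locally, it is downloaded
    into the `weights/` folder.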
34 | """ 35 | # Load the pretrained model 36 | detr = get_detr_model(config, include_top=True, weights="detr") 37 | detr.summary() 38 | return detr 39 | 40 | 41 | def run_finetuning(config): 42 | 43 | # Load the model with the new layers to finetune 44 | detr = build_model(config) 45 | 46 | # Load the training and validation dataset 47 | train_dt, coco_class_names = load_coco_dataset("train", config.batch_size, config, augmentation=True) 48 | valid_dt, _ = load_coco_dataset("val", 1, config, augmentation=False) 49 | 50 | # Train/finetune the transformers only 51 | config.train_backbone = False 52 | config.train_transformers = True 53 | 54 | # Setup the optimziers and the trainable variables 55 | optimzers = setup_optimizers(detr, config) 56 | 57 | # Run the training for 5 epochs 58 | for epoch_nb in range(100): 59 | training.eval(detr, valid_dt, config, coco_class_names, evaluation_step=200) 60 | training.fit(detr, train_dt, optimzers, config, epoch_nb, coco_class_names) 61 | 62 | 63 | if __name__ == "__main__": 64 | 65 | physical_devices = tf.config.list_physical_devices('GPU') 66 | if len(physical_devices) == 1: 67 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 68 | 69 | config = TrainingConfig() 70 | args = training_config_parser().parse_args() 71 | config.update_from_args(args) 72 | 73 | if config.log: 74 | wandb.init(project="detr-tensorflow", reinit=True) 75 | 76 | # Run training 77 | run_finetuning(config) 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /finetune_hardhat.py: -------------------------------------------------------------------------------- 1 | """ Example on how to finetune on the HardHat dataset 2 | using custom layers. This script assume the dataset is already download 3 | on your computer in raw and Tensorflow Object detection csv format. 
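(concretely: a `train/` and a `test/` folder, each holding the images together with an
`_annotations.csv` file, which is how the dataset is loaded further down in this script)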
4 | 5 | Please, for more information, checkout the following notebooks: 6 | - DETR : How to setup a custom dataset 7 | """ 8 | 9 | import argparse 10 | import matplotlib.pyplot as plt 11 | import tensorflow as tf 12 | import numpy as np 13 | import time 14 | import os 15 | 16 | from detr_tf.data import load_tfcsv_dataset 17 | 18 | from detr_tf.networks.detr import get_detr_model 19 | from detr_tf.optimizers import setup_optimizers 20 | from detr_tf.logger.training_logging import train_log, valid_log 21 | from detr_tf.loss.loss import get_losses 22 | from detr_tf.inference import numpy_bbox_to_image 23 | from detr_tf.training_config import TrainingConfig, training_config_parser 24 | from detr_tf import training 25 | 26 | try: 27 | # Should be optional if --log is not set 28 | import wandb 29 | except: 30 | wandb = None 31 | 32 | import time 33 | 34 | 35 | def build_model(config): 36 | """ Build the model with the pretrained weights 37 | and add new layers to finetune 38 | """ 39 | # Load the pretrained model with new heads at the top 40 | # 3 class : background head and helmet (we exclude here person from the dataset) 41 | detr = get_detr_model(config, include_top=False, nb_class=3, weights="detr", num_decoder_layers=6, num_encoder_layers=6) 42 | detr.summary() 43 | return detr 44 | 45 | 46 | def run_finetuning(config): 47 | 48 | # Load the model with the new layers to finetune 49 | detr = build_model(config) 50 | 51 | # Load the training and validation dataset and exclude the person class 52 | train_dt, class_names = load_tfcsv_dataset( 53 | config, config.batch_size, augmentation=True, exclude=["person"], ann_file="train/_annotations.csv", img_dir="train") 54 | valid_dt, _ = load_tfcsv_dataset( 55 | config, 4, augmentation=False, exclude=["person"], ann_file="test/_annotations.csv", img_dir="test") 56 | 57 | # Train/finetune the transformers only 58 | config.train_backbone = tf.Variable(False) 59 | config.train_transformers = tf.Variable(False) 60 | config.train_nlayers = tf.Variable(True) 61 | # Learning rate (NOTE: The transformers and the backbone are NOT trained with) 62 | # a 0 learning rate. 
They're not trained, but we set the LR to 0 just so that it is clear 63 | # in the log that both are not trained at the begining 64 | config.backbone_lr = tf.Variable(0.0) 65 | config.transformers_lr = tf.Variable(0.0) 66 | config.nlayers_lr = tf.Variable(1e-3) 67 | 68 | # Setup the optimziers and the trainable variables 69 | optimzers = setup_optimizers(detr, config) 70 | 71 | # Run the training for 180 epochs 72 | for epoch_nb in range(180): 73 | 74 | if epoch_nb > 0: 75 | # After the first epoch, we finetune the transformers and the new layers 76 | config.train_transformers.assign(True) 77 | config.transformers_lr.assign(1e-4) 78 | config.nlayers_lr.assign(1e-3) 79 | 80 | training.eval(detr, valid_dt, config, class_names, evaluation_step=100) 81 | training.fit(detr, train_dt, optimzers, config, epoch_nb, class_names) 82 | 83 | 84 | if __name__ == "__main__": 85 | 86 | physical_devices = tf.config.list_physical_devices('GPU') 87 | if len(physical_devices) == 1: 88 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 89 | 90 | config = TrainingConfig() 91 | args = training_config_parser().parse_args() 92 | config.update_from_args(args) 93 | 94 | if config.log: 95 | wandb.init(project="detr-tensorflow", reinit=True) 96 | 97 | # Run training 98 | run_finetuning(config) 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /finetune_voc.py: -------------------------------------------------------------------------------- 1 | """ Example on how to finetune on the VOC dataset 2 | using custom layers. 3 | """ 4 | 5 | import argparse 6 | import matplotlib.pyplot as plt 7 | import tensorflow as tf 8 | import numpy as np 9 | import time 10 | 11 | try: 12 | # Should be optional if --log is not set 13 | import wandb 14 | except: 15 | wandb = None 16 | 17 | import os 18 | 19 | from detr_tf.data import load_voc_dataset 20 | from detr_tf.networks.detr import get_detr_model 21 | from detr_tf.optimizers import setup_optimizers 22 | from detr_tf.training_config import TrainingConfig, training_config_parser 23 | from detr_tf import training 24 | 25 | VOC_CLASS_NAME = [ 26 | 'aeroplane', 'bicycle', 'bird', 'boat', 27 | 'bottle', 'bus', 'car', 'cat', 'chair', 28 | 'cow', 'diningtable', 'dog', 'horse', 29 | 'motorbike', 'person', 'pottedplant', 30 | 'sheep', 'sofa', 'train', 'tvmonitor' 31 | ] 32 | 33 | def build_model(config): 34 | """ Build the model with the pretrained weights 35 | and add new layers to finetune 36 | """ 37 | # Input 38 | image_input = tf.keras.Input((None, None, 3)) 39 | 40 | # Load the pretrained model 41 | detr = get_detr_model(config, include_top=False, weights="detr", num_decoder_layers=6, num_encoder_layers=6) 42 | 43 | # Setup the new layers 44 | cls_layer = tf.keras.layers.Dense(len(VOC_CLASS_NAME) + 1, name="cls_layer") 45 | pos_layer = tf.keras.models.Sequential([ 46 | tf.keras.layers.Dense(256, activation="relu"), 47 | tf.keras.layers.Dense(256, activation="relu"), 48 | tf.keras.layers.Dense(4, activation="sigmoid"), 49 | ], name="pos_layer") 50 | config.add_nlayers([cls_layer, pos_layer]) 51 | 52 | transformer_output = detr(image_input) 53 | cls_preds = cls_layer(transformer_output) 54 | pos_preds = pos_layer(transformer_output) 55 | 56 | # Define the main outputs along with the auxialiary loss 57 | outputs = {'pred_logits': cls_preds[-1], 'pred_boxes': pos_preds[-1]} 58 | outputs["aux"] = [ {"pred_logits": cls_preds[i], "pred_boxes": pos_preds[i]} for i in range(0, 5)] 59 | 60 | detr = 
tf.keras.Model(image_input, outputs, name="detr_finetuning") 61 | detr.summary() 62 | return detr 63 | 64 | 65 | def run_finetuning(config): 66 | 67 | # Load the model with the new layers to finetune 68 | detr = build_model(config) 69 | 70 | # Load the training and validation dataset (for the purpose of this example we're gonna load the training 71 | # as the validation, but in practise you should have different folder loader for the training and the validation) 72 | train_dt, class_names = load_voc_dataset(config, config.batch_size, augmentation=True) 73 | valid_dt, _ = load_voc_dataset(config, 1, augmentation=False) 74 | 75 | # Train/finetune the transformers only 76 | config.train_backbone = tf.Variable(False) 77 | config.train_transformers = tf.Variable(False) 78 | config.train_nlayers = tf.Variable(True) 79 | # Learning rate (NOTE: The transformers and the backbone are NOT trained with) 80 | # a 0 learning rate. They're not trained, but we set the LR to 0 just so that it is clear 81 | # in the log that both are not trained at the begining 82 | config.backbone_lr = tf.Variable(0.0) 83 | config.transformers_lr = tf.Variable(0.0) 84 | config.nlayers_lr = tf.Variable(1e-3) 85 | 86 | # Setup the optimziers and the trainable variables 87 | optimzers = setup_optimizers(detr, config) 88 | 89 | # Run the training for 5 epochs 90 | for epoch_nb in range(10): 91 | 92 | if epoch_nb > 0: 93 | # After the first epoch, we finetune the transformers and the new layers 94 | config.train_transformers.assign(True) 95 | config.transformers_lr.assign(1e-4) 96 | config.nlayers_lr.assign(1e-3) 97 | 98 | training.eval(detr, valid_dt, config, class_names, evaluation_step=200) 99 | training.fit(detr, train_dt, optimzers, config, epoch_nb, class_names) 100 | 101 | 102 | if __name__ == "__main__": 103 | 104 | physical_devices = tf.config.list_physical_devices('GPU') 105 | if len(physical_devices) == 1: 106 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 107 | 108 | config = TrainingConfig() 109 | args = training_config_parser().parse_args() 110 | config.update_from_args(args) 111 | 112 | if config.log: 113 | wandb.init(project="detr-tensorflow", reinit=True) 114 | 115 | # Run training 116 | run_finetuning(config) 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /images/datasetsupport.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Visual-Behavior/detr-tensorflow/78fb71bca7b2ebf90e73151a51d29e8dbb46cb38/images/datasetsupport.png -------------------------------------------------------------------------------- /images/detr-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Visual-Behavior/detr-tensorflow/78fb71bca7b2ebf90e73151a51d29e8dbb46cb38/images/detr-figure.png -------------------------------------------------------------------------------- /images/hardhatdataset.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Visual-Behavior/detr-tensorflow/78fb71bca7b2ebf90e73151a51d29e8dbb46cb38/images/hardhatdataset.jpg -------------------------------------------------------------------------------- /images/tutorials/data-pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Visual-Behavior/detr-tensorflow/78fb71bca7b2ebf90e73151a51d29e8dbb46cb38/images/tutorials/data-pipeline.png -------------------------------------------------------------------------------- /images/tutorials/download_hardhat_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Visual-Behavior/detr-tensorflow/78fb71bca7b2ebf90e73151a51d29e8dbb46cb38/images/tutorials/download_hardhat_dataset.png -------------------------------------------------------------------------------- /images/wandb_logging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Visual-Behavior/detr-tensorflow/78fb71bca7b2ebf90e73151a51d29e8dbb46cb38/images/wandb_logging.png -------------------------------------------------------------------------------- /images/webcam_detr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Visual-Behavior/detr-tensorflow/78fb71bca7b2ebf90e73151a51d29e8dbb46cb38/images/webcam_detr.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wandb 2 | matplotlib 3 | numpy 4 | pycocotools 5 | scikit-image 6 | imageio 7 | pandas -------------------------------------------------------------------------------- /train_coco.py: -------------------------------------------------------------------------------- 1 | """ Example on how to train on COCO from scratch 2 | """ 3 | 4 | 5 | import argparse 6 | import matplotlib.pyplot as plt 7 | import tensorflow as tf 8 | import numpy as np 9 | import time 10 | import os 11 | 12 | from detr_tf.data.coco import load_coco_dataset 13 | from detr_tf.networks.detr import get_detr_model 14 | from detr_tf.optimizers import setup_optimizers 15 | from detr_tf.optimizers import gather_gradient, aggregate_grad_and_apply 16 | from detr_tf.logger.training_logging import train_log, valid_log 17 | from detr_tf.loss.loss import get_losses 18 | from detr_tf.inference import numpy_bbox_to_image 19 | from detr_tf.training_config import TrainingConfig, training_config_parser 20 | from detr_tf import training 21 | 22 | try: 23 | # Should be optional if --log is not set 24 | import wandb 25 | except: 26 | wandb = None 27 | 28 | 29 | import time 30 | 31 | 32 | def build_model(config): 33 | """ Build the model with the pretrained weights. In this example 34 | we do not add new layers since the pretrained model is already trained on coco. 35 | See examples/finetuning_voc.py to add new layers. 36 | """ 37 | # Load detr model without weight. 
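    # (weights=None: nothing is restored from the published DETR checkpoint, so the
    #  transformer and the detection heads start from a random initialisation)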
38 | # Use the tensorflow backbone with the imagenet weights 39 | detr = get_detr_model(config, include_top=True, weights=None, tf_backbone=True) 40 | detr.summary() 41 | return detr 42 | 43 | 44 | def run_finetuning(config): 45 | 46 | # Load the model with the new layers to finetune 47 | detr = build_model(config) 48 | 49 | # Load the training and validation dataset 50 | train_dt, coco_class_names = load_coco_dataset( 51 | config, config.batch_size, augmentation=True, img_dir="train2017", ann_fil="annotations/instances_train2017.json") 52 | valid_dt, _ = load_coco_dataset( 53 | config, 1, augmentation=False, img_dir="val2017", ann_fil="annotations/instances_val2017.json") 54 | 55 | # Train the backbone and the transformers 56 | # Check the training_config file for the other hyperparameters 57 | config.train_backbone = True 58 | config.train_transformers = True 59 | 60 | # Setup the optimziers and the trainable variables 61 | optimzers = setup_optimizers(detr, config) 62 | 63 | # Run the training for 100 epochs 64 | for epoch_nb in range(100): 65 | training.eval(detr, valid_dt, config, coco_class_names, evaluation_step=200) 66 | training.fit(detr, train_dt, optimzers, config, epoch_nb, coco_class_names) 67 | 68 | 69 | if __name__ == "__main__": 70 | 71 | physical_devices = tf.config.list_physical_devices('GPU') 72 | if len(physical_devices) == 1: 73 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 74 | 75 | config = TrainingConfig() 76 | args = training_config_parser().parse_args() 77 | config.update_from_args(args) 78 | 79 | if config.log: 80 | wandb.init(project="detr-tensorflow", reinit=True) 81 | 82 | # Run training 83 | run_finetuning(config) 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /webcam_inference.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | from detr_tf.training_config import TrainingConfig, training_config_parser 6 | from detr_tf.networks.detr import get_detr_model 7 | from detr_tf.data import processing 8 | from detr_tf.data.coco import COCO_CLASS_NAME 9 | from detr_tf.inference import get_model_inference, numpy_bbox_to_image 10 | 11 | @tf.function 12 | def run_inference(model, images, config): 13 | m_outputs = model(images, training=False) 14 | predicted_bbox, predicted_labels, predicted_scores = get_model_inference(m_outputs, config.background_class, bbox_format="xy_center") 15 | return predicted_bbox, predicted_labels, predicted_scores 16 | 17 | 18 | def run_webcam_inference(detr): 19 | 20 | cap = cv2.VideoCapture(0) 21 | 22 | while(True): 23 | ret, frame = cap.read() 24 | 25 | # Convert to RGB and process the input image 26 | model_input = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 27 | model_input = processing.normalized_images(model_input, config) 28 | 29 | # Run inference 30 | predicted_bbox, predicted_labels, predicted_scores = run_inference(detr, np.expand_dims(model_input, axis=0), config) 31 | 32 | frame = frame.astype(np.float32) 33 | frame = frame / 255 34 | frame = numpy_bbox_to_image(frame, predicted_bbox, labels=predicted_labels, scores=predicted_scores, class_name=COCO_CLASS_NAME) 35 | 36 | cv2.imshow('frame', frame) 37 | if cv2.waitKey(1) & 0xFF == ord('q'): 38 | break 39 | 40 | # When everything done, release the capture 41 | cap.release() 42 | cv2.destroyAllWindows() 43 | 44 | if __name__ == "__main__": 45 | 46 | physical_devices = 
tf.config.list_physical_devices('GPU') 47 | if len(physical_devices) == 1: 48 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 49 | 50 | config = TrainingConfig() 51 | args = training_config_parser().parse_args() 52 | config.update_from_args(args) 53 | 54 | # Load the pretrained DETR model (no new layers are added for inference) 55 | detr = get_detr_model(config, include_top=True, weights="detr") 56 | config.background_class = 91 57 | 58 | # Run webcam inference 59 | run_webcam_inference(detr) 60 | --------------------------------------------------------------------------------