├── .DS_Store
├── .gitignore
├── __init__.py
├── callbacks
    ├── __init__.py
    └── model_checkpoint.py
├── configs
    ├── ssd300_mobilenetv1.json
    ├── ssd300_mobilenetv1_coco2017-train.json
    ├── ssd300_mobilenetv2.json
    ├── ssd300_mobilenetv2_coco2017-train.json
    ├── ssd300_vgg16.json
    ├── ssd300_vgg16_pascal-voc-07-12.json
    ├── ssd300_vgg16_pascal-voc-2007.json
    ├── ssd320_mobilenetv2_coco2017-train.json
    ├── tbpp384_vgg16.json
    └── tbpp768_vgg16.json
├── convert.py
├── custom_layers
    ├── __init__.py
    ├── decode_ssd_predictions.py
    ├── decode_tbpp_predictions.py
    ├── default_boxes.py
    └── l2_normalization.py
├── data_generators
    ├── __init__.py
    ├── ssd_data_generator.py
    └── tbpp_data_generator.py
├── display_default_boxes.py
├── evaluate.py
├── evaluate.sh
├── inference.py
├── inference.sh
├── losses
    ├── __init__.py
    ├── smooth_l1_loss.py
    ├── softmax_loss.py
    ├── ssd_loss.py
    └── tbpp_loss.py
├── networks
    ├── __init__.py
    ├── base_networks
    │   ├── __init__.py
    │   └── truncated_vgg16.py
    ├── ssd_mobilenet.py
    ├── ssd_mobilenetv2.py
    ├── ssd_vgg16.py
    └── tbpp_vgg16.py
├── playground.py
├── requirements.txt
├── test.py
├── test.sh
├── train.py
├── train.sh
├── utils
    ├── __init__.py
    ├── augmentation_utils
    │   ├── __init__.py
    │   ├── bboxes_filter.py
    │   ├── random_brightness.py
    │   ├── random_contrast.py
    │   ├── random_crop.py
    │   ├── random_crop_quad.py
    │   ├── random_expand.py
    │   ├── random_expand_quad.py
    │   ├── random_horizontal_flip.py
    │   ├── random_horizontal_flip_quad.py
    │   ├── random_hue.py
    │   ├── random_lighting_noise.py
    │   ├── random_saturation.py
    │   ├── random_vertical_flip.py
    │   ├── random_vertical_flip_quad.py
    │   └── resize_to_fixed_size.py
    ├── bbox_utils
    │   ├── __init__.py
    │   ├── center_to_corner.py
    │   ├── center_to_vertices.py
    │   ├── corner_to_center.py
    │   ├── iou.py
    │   └── object_coverage.py
    ├── command_line_utils
    │   ├── __init__.py
    │   └── str2bool.py
    ├── data_utils
    │   ├── __init__.py
    │   ├── coco_text.py
    │   └── get_samples_from_split.py
    ├── display_tbpp_data_sample.py
    ├── inference_utils
    │   ├── __init__.py
    │   ├── ssd_mobilenetv1.py
    │   ├── ssd_mobilenetv2.py
    │   ├── ssd_vgg16.py
    │   └── tbpp_vgg16.py
    ├── one_hot_class_label.py
    ├── pascal_voc_utils
    │   ├── __init__.py
    │   └── read_label.py
    ├── prepare_coco_dataset.py
    ├── prepare_cocotextv2_dataset.py
    ├── prepare_icdar-2013_dataset.py
    ├── prepare_icdar-2015_dataset.py
    ├── prepare_midv500_dataset.py
    ├── prepare_pascal-voc-2007-2012_dataset.py
    ├── prepare_pascal_voc_2007_dataset.py
    ├── prepare_pascal_voc_2012_dataset.py
    ├── prepare_svt_dataset.py
    ├── prepare_synthtext_dataset.py
    ├── ssd_utils
    │   ├── __init__.py
    │   ├── decode_predictions.py
    │   ├── encode_bboxes.py
    │   ├── generate_default_boxes_for_feature_map.py
    │   ├── get_number_default_boxes.py
    │   ├── match_gt_boxes_to_default_boxes.py
    │   └── read_sample.py
    ├── textboxes_utils
    │   ├── __init__.py
    │   ├── decode_predictions.py
    │   ├── encode_textboxes.py
    │   ├── get_bboxes_from_quads.py
    │   ├── get_num_quads.py
    │   ├── get_samples.py
    │   ├── read_sample.py
    │   └── sort_quads_vertices.py
    ├── training_utils
    │   ├── __init__.py
    │   ├── ssd_mobilenetv1.py
    │   ├── ssd_mobilenetv2.py
    │   ├── ssd_vgg16.py
    │   └── tbpp_vgg16.py
    └── visualize_training_metrics.py
└── webcam.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Socret360/object-detection-in-keras/0b582a46f41623a1e1166c1e2b050f55788c3003/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Dataset 10 | data/ 11 | 12 | # VSCode 13 | .vscode/ 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | .venv 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # experiments data 110 | experiments/ 111 | 112 | #datasets 113 | datasets/ 114 | 115 | .idea/ 116 | 117 | temp.py 118 | 119 | output/ 120 | data/ 121 | base_networks/pretrained_weights 122 | sample_data 123 | experiment.py 124 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Socret360/object-detection-in-keras/0b582a46f41623a1e1166c1e2b050f55788c3003/__init__.py -------------------------------------------------------------------------------- /callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_checkpoint import ModelCheckpoint 2 | -------------------------------------------------------------------------------- /callbacks/model_checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | from matplotlib import pyplot as plt 3 | from tensorflow.keras.callbacks import Callback 4 | 5 | 6 | class ModelCheckpoint(Callback): 7 | """ A callback to save a model checkpoint every n batches (iterations) or n epochs. 8 | 9 | Args: 10 | - output_dir: Path to output directory in which to save the checkpoint. 
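- epoch_frequency: Save a checkpoint every n epochs (used when not None).
- iteration_frequency: Save a checkpoint every n batches (iterations).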
11 | """ 12 | 13 | def __init__(self, output_dir, epoch_frequency, iteration_frequency): 14 | self.output_dir = output_dir 15 | self.iteration_frequency = iteration_frequency 16 | self.epoch_frequency = epoch_frequency 17 | self.iterations = 1 18 | self.epochs = 1 19 | self.losses_by_iteration = [] 20 | self.losses_by_epoch = [] 21 | 22 | def on_epoch_end(self, epoch, logs={}): 23 | print(logs) 24 | if self.epoch_frequency is not None: 25 | loss = logs["loss"] 26 | self.losses_by_epoch.append(loss) 27 | if self.epochs % self.epoch_frequency == 0: 28 | loss = '%.4f' % loss 29 | name = f"cp_ep_{self.epochs}_loss_{loss}.h5" 30 | self.model.save_weights(os.path.join(self.output_dir, name)) 31 | plt.plot(list(range(1, self.epochs+1)), self.losses_by_epoch) 32 | plt.title('training loss') 33 | plt.ylabel('loss') 34 | plt.xlabel('epoch') 35 | plt.savefig(os.path.join(self.output_dir, "log.png")) 36 | self.epochs += 1 37 | -------------------------------------------------------------------------------- /configs/ssd300_mobilenetv1.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ssd_mobilenetv1", 4 | "input_size": 300, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "width_multiplier": 1, 8 | "depth_multiplier": 1, 9 | "default_boxes": { 10 | "extra_box_for_ar_1": true, 11 | "clip_boxes": true, 12 | "variances": [ 13 | 0.1, 14 | 0.1, 15 | 0.2, 16 | 0.2 17 | ], 18 | "min_scale": 0.2, 19 | "max_scale": 0.9, 20 | "layers": [ 21 | { 22 | "name": "conv_pw_11_relu", 23 | "size": 18, 24 | "offset": [ 25 | 0.5, 26 | 0.5 27 | ], 28 | "aspect_ratios": [ 29 | 2.0, 30 | 3.0 31 | ] 32 | }, 33 | { 34 | "name": "conv_pw_13_relu", 35 | "size": 9, 36 | "offset": [ 37 | 0.5, 38 | 0.5 39 | ], 40 | "aspect_ratios": [ 41 | 2.0, 42 | 3.0 43 | ] 44 | }, 45 | { 46 | "name": "conv14_2/relu", 47 | "size": 5, 48 | "offset": [ 49 | 0.5, 50 | 0.5 51 | ], 52 | "aspect_ratios": [ 53 | 2.0, 54 | 3.0 55 | ] 56 | }, 57 | { 58 | "name": "conv15_2/relu", 59 | "size": 3, 60 | "offset": [ 61 | 0.5, 62 | 0.5 63 | ], 64 | "aspect_ratios": [ 65 | 2.0, 66 | 3.0 67 | ] 68 | }, 69 | { 70 | "name": "conv16_2/relu", 71 | "size": 2, 72 | "offset": [ 73 | 0.5, 74 | 0.5 75 | ], 76 | "aspect_ratios": [ 77 | 2.0, 78 | 3.0 79 | ] 80 | }, 81 | { 82 | "name": "conv17_2/relu", 83 | "size": 1, 84 | "offset": [ 85 | 0.5, 86 | 0.5 87 | ], 88 | "aspect_ratios": [ 89 | 2.0, 90 | 3.0 91 | ] 92 | } 93 | ] 94 | } 95 | }, 96 | "training": { 97 | "match_threshold": 0.5, 98 | "neutral_threshold": 0.3, 99 | "min_negative_boxes": 0, 100 | "negative_boxes_ratio": 3, 101 | "alpha": 1 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /configs/ssd300_mobilenetv1_coco2017-train.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ssd300_mobilenetv1", 4 | "input_size": 300, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "width_multiplier": 1, 8 | "depth_multiplier": 1, 9 | "default_boxes": { 10 | "extra_box_for_ar_1": true, 11 | "clip_boxes": true, 12 | "variances": [ 13 | 0.1, 14 | 0.1, 15 | 0.2, 16 | 0.2 17 | ], 18 | "min_scale": 0.15, 19 | "max_scale": 0.9, 20 | "layers": [ 21 | { 22 | "name": "conv_pw_11_relu", 23 | "size": 18, 24 | "offset": [ 25 | 0.5, 26 | 0.5 27 | ], 28 | "aspect_ratios": [ 29 | 1.0, 30 | 2.0, 31 | 0.5, 32 | 3.0, 33 | 0.33 34 | ] 35 | }, 36 | { 37 | "name": "conv_pw_13_relu", 38 | "size": 9, 39 | 
"offset": [ 40 | 0.5, 41 | 0.5 42 | ], 43 | "aspect_ratios": [ 44 | 1.0, 45 | 2.0, 46 | 0.5, 47 | 3.0, 48 | 0.33 49 | ] 50 | }, 51 | { 52 | "name": "conv14_2/relu", 53 | "size": 5, 54 | "offset": [ 55 | 0.5, 56 | 0.5 57 | ], 58 | "aspect_ratios": [ 59 | 1.0, 60 | 2.0, 61 | 0.5, 62 | 3.0, 63 | 0.33 64 | ] 65 | }, 66 | { 67 | "name": "conv15_2/relu", 68 | "size": 3, 69 | "offset": [ 70 | 0.5, 71 | 0.5 72 | ], 73 | "aspect_ratios": [ 74 | 1.0, 75 | 2.0, 76 | 0.5 77 | ] 78 | }, 79 | { 80 | "name": "conv16_2/relu", 81 | "size": 2, 82 | "offset": [ 83 | 0.5, 84 | 0.5 85 | ], 86 | "aspect_ratios": [ 87 | 1.0, 88 | 2.0, 89 | 0.5 90 | ] 91 | }, 92 | { 93 | "name": "conv17_2/relu", 94 | "size": 1, 95 | "offset": [ 96 | 0.5, 97 | 0.5 98 | ], 99 | "aspect_ratios": [ 100 | 1.0, 101 | 2.0, 102 | 0.5 103 | ] 104 | } 105 | ] 106 | } 107 | }, 108 | "training": { 109 | "match_threshold": 0.5, 110 | "neutral_threshold": 0.3, 111 | "min_negative_boxes": 0, 112 | "negative_boxes_ratio": 3, 113 | "alpha": 1 114 | } 115 | } -------------------------------------------------------------------------------- /configs/ssd300_mobilenetv2.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ssd_mobilenetv2", 4 | "input_size": 300, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "width_multiplier": 0.5, 8 | "default_boxes": { 9 | "extra_box_for_ar_1": true, 10 | "clip_boxes": true, 11 | "variances": [ 12 | 0.1, 13 | 0.1, 14 | 0.2, 15 | 0.2 16 | ], 17 | "min_scale": 0.2, 18 | "max_scale": 0.9, 19 | "layers": [ 20 | { 21 | "name": "block_13_expand_relu", 22 | "size": 19, 23 | "offset": [ 24 | 0.5, 25 | 0.5 26 | ], 27 | "aspect_ratios": [ 28 | 1.0, 29 | 2.0, 30 | 0.5, 31 | 3.0, 32 | 0.33 33 | ] 34 | }, 35 | { 36 | "name": "block_16_project_BN", 37 | "size": 10, 38 | "offset": [ 39 | 0.5, 40 | 0.5 41 | ], 42 | "aspect_ratios": [ 43 | 1.0, 44 | 2.0, 45 | 0.5, 46 | 3.0, 47 | 0.33 48 | ] 49 | }, 50 | { 51 | "name": "conv17_2/relu", 52 | "size": 5, 53 | "offset": [ 54 | 0.5, 55 | 0.5 56 | ], 57 | "aspect_ratios": [ 58 | 1.0, 59 | 2.0, 60 | 0.5, 61 | 3.0, 62 | 0.33 63 | ] 64 | }, 65 | { 66 | "name": "conv18_2/relu", 67 | "size": 3, 68 | "offset": [ 69 | 0.5, 70 | 0.5 71 | ], 72 | "aspect_ratios": [ 73 | 1.0, 74 | 2.0, 75 | 0.5, 76 | 3.0, 77 | 0.33 78 | ] 79 | }, 80 | { 81 | "name": "conv19_2/relu", 82 | "size": 2, 83 | "offset": [ 84 | 0.5, 85 | 0.5 86 | ], 87 | "aspect_ratios": [ 88 | 1.0, 89 | 2.0, 90 | 0.5, 91 | 3.0, 92 | 0.33 93 | ] 94 | }, 95 | { 96 | "name": "conv20_2/relu", 97 | "size": 1, 98 | "offset": [ 99 | 0.5, 100 | 0.5 101 | ], 102 | "aspect_ratios": [ 103 | 1.0, 104 | 2.0, 105 | 0.5, 106 | 3.0, 107 | 0.33 108 | ] 109 | } 110 | ] 111 | } 112 | }, 113 | "training": { 114 | "match_threshold": 0.5, 115 | "neutral_threshold": 0.3, 116 | "min_negative_boxes": 0, 117 | "negative_boxes_ratio": 3, 118 | "alpha": 1 119 | } 120 | } -------------------------------------------------------------------------------- /configs/ssd300_mobilenetv2_coco2017-train.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ssd_mobilenetv2", 4 | "input_size": 300, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "width_multiplier": 0.5, 8 | "default_boxes": { 9 | "extra_box_for_ar_1": true, 10 | "clip_boxes": true, 11 | "variances": [ 12 | 0.1, 13 | 0.1, 14 | 0.2, 15 | 0.2 16 | ], 17 | "min_scale": 0.15, 18 | "max_scale": 0.9, 19 | "layers": [ 20 | { 21 | 
"name": "block_13_expand_relu", 22 | "size": 19, 23 | "offset": [ 24 | 0.5, 25 | 0.5 26 | ], 27 | "aspect_ratios": [ 28 | 1.0, 29 | 2.0, 30 | 0.5, 31 | 3.0, 32 | 0.33 33 | ] 34 | }, 35 | { 36 | "name": "block_16_project_BN", 37 | "size": 10, 38 | "offset": [ 39 | 0.5, 40 | 0.5 41 | ], 42 | "aspect_ratios": [ 43 | 1.0, 44 | 2.0, 45 | 0.5, 46 | 3.0, 47 | 0.33 48 | ] 49 | }, 50 | { 51 | "name": "conv17_2/relu", 52 | "size": 5, 53 | "offset": [ 54 | 0.5, 55 | 0.5 56 | ], 57 | "aspect_ratios": [ 58 | 1.0, 59 | 2.0, 60 | 0.5, 61 | 3.0, 62 | 0.33 63 | ] 64 | }, 65 | { 66 | "name": "conv18_2/relu", 67 | "size": 3, 68 | "offset": [ 69 | 0.5, 70 | 0.5 71 | ], 72 | "aspect_ratios": [ 73 | 1.0, 74 | 2.0, 75 | 0.5, 76 | 3.0, 77 | 0.33 78 | ] 79 | }, 80 | { 81 | "name": "conv19_2/relu", 82 | "size": 2, 83 | "offset": [ 84 | 0.5, 85 | 0.5 86 | ], 87 | "aspect_ratios": [ 88 | 1.0, 89 | 2.0, 90 | 0.5, 91 | 3.0, 92 | 0.33 93 | ] 94 | }, 95 | { 96 | "name": "conv20_2/relu", 97 | "size": 1, 98 | "offset": [ 99 | 0.5, 100 | 0.5 101 | ], 102 | "aspect_ratios": [ 103 | 1.0, 104 | 2.0, 105 | 0.5, 106 | 3.0, 107 | 0.33 108 | ] 109 | } 110 | ] 111 | } 112 | }, 113 | "training": { 114 | "match_threshold": 0.5, 115 | "neutral_threshold": 0.3, 116 | "min_negative_boxes": 0, 117 | "negative_boxes_ratio": 3, 118 | "alpha": 1 119 | } 120 | } -------------------------------------------------------------------------------- /configs/ssd300_vgg16.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ssd_vgg16", 4 | "input_size": 300, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "default_boxes": { 8 | "extra_box_for_ar_1": true, 9 | "clip_boxes": false, 10 | "variances": [ 11 | 0.1, 12 | 0.1, 13 | 0.2, 14 | 0.2 15 | ], 16 | "min_scale": 0.2, 17 | "max_scale": 0.9, 18 | "layers": [ 19 | { 20 | "name": "conv4_3", 21 | "size": 38, 22 | "offset": [ 23 | 0.5, 24 | 0.5 25 | ], 26 | "aspect_ratios": [ 27 | 1.0, 28 | 2.0, 29 | 0.5 30 | ] 31 | }, 32 | { 33 | "name": "fc7", 34 | "size": 19, 35 | "offset": [ 36 | 0.5, 37 | 0.5 38 | ], 39 | "aspect_ratios": [ 40 | 1.0, 41 | 2.0, 42 | 0.5, 43 | 3.0, 44 | 0.33 45 | ] 46 | }, 47 | { 48 | "name": "conv8_2", 49 | "size": 10, 50 | "offset": [ 51 | 0.5, 52 | 0.5 53 | ], 54 | "aspect_ratios": [ 55 | 1.0, 56 | 2.0, 57 | 0.5, 58 | 3.0, 59 | 0.33 60 | ] 61 | }, 62 | { 63 | "name": "conv9_2", 64 | "size": 5, 65 | "offset": [ 66 | 0.5, 67 | 0.5 68 | ], 69 | "aspect_ratios": [ 70 | 1.0, 71 | 2.0, 72 | 0.5, 73 | 3.0, 74 | 0.33 75 | ] 76 | }, 77 | { 78 | "name": "conv10_2", 79 | "size": 3, 80 | "offset": [ 81 | 0.5, 82 | 0.5 83 | ], 84 | "aspect_ratios": [ 85 | 1.0, 86 | 2.0, 87 | 0.5 88 | ] 89 | }, 90 | { 91 | "name": "conv11_2", 92 | "size": 1, 93 | "offset": [ 94 | 0.5, 95 | 0.5 96 | ], 97 | "aspect_ratios": [ 98 | 1.0, 99 | 2.0, 100 | 0.5 101 | ] 102 | } 103 | ] 104 | } 105 | }, 106 | "training": { 107 | "match_threshold": 0.5, 108 | "neutral_threshold": 0.3, 109 | "min_negative_boxes": 0, 110 | "negative_boxes_ratio": 3, 111 | "alpha": 1 112 | } 113 | } -------------------------------------------------------------------------------- /configs/ssd300_vgg16_pascal-voc-07-12.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ssd_vgg16", 4 | "input_size": 300, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "base_network_trainable": true, 8 | "default_boxes": { 9 | "extra_box_for_ar_1": true, 10 | "clip_boxes": 
true, 11 | "variances": [ 12 | 0.1, 13 | 0.1, 14 | 0.2, 15 | 0.2 16 | ], 17 | "min_scale": 0.1, 18 | "max_scale": 1.05, 19 | "layers": [ 20 | { 21 | "name": "conv4_3", 22 | "size": 38, 23 | "offset": [ 24 | 0.5, 25 | 0.5 26 | ], 27 | "aspect_ratios": [ 28 | 1.0, 29 | 2.0, 30 | 0.5 31 | ] 32 | }, 33 | { 34 | "name": "fc7", 35 | "size": 19, 36 | "offset": [ 37 | 0.5, 38 | 0.5 39 | ], 40 | "aspect_ratios": [ 41 | 1.0, 42 | 2.0, 43 | 0.5, 44 | 3.0, 45 | 0.33 46 | ] 47 | }, 48 | { 49 | "name": "conv8_2", 50 | "size": 10, 51 | "offset": [ 52 | 0.5, 53 | 0.5 54 | ], 55 | "aspect_ratios": [ 56 | 1.0, 57 | 2.0, 58 | 0.5, 59 | 3.0, 60 | 0.33 61 | ] 62 | }, 63 | { 64 | "name": "conv9_2", 65 | "size": 5, 66 | "offset": [ 67 | 0.5, 68 | 0.5 69 | ], 70 | "aspect_ratios": [ 71 | 1.0, 72 | 2.0, 73 | 0.5, 74 | 3.0, 75 | 0.33 76 | ] 77 | }, 78 | { 79 | "name": "conv10_2", 80 | "size": 3, 81 | "offset": [ 82 | 0.5, 83 | 0.5 84 | ], 85 | "aspect_ratios": [ 86 | 1.0, 87 | 2.0, 88 | 0.5 89 | ] 90 | }, 91 | { 92 | "name": "conv11_2", 93 | "size": 1, 94 | "offset": [ 95 | 0.5, 96 | 0.5 97 | ], 98 | "aspect_ratios": [ 99 | 1.0, 100 | 2.0, 101 | 0.5 102 | ] 103 | } 104 | ] 105 | } 106 | }, 107 | "training": { 108 | "match_threshold": 0.5, 109 | "neutral_threshold": 0.3, 110 | "min_negative_boxes": 0, 111 | "negative_boxes_ratio": 3, 112 | "alpha": 1, 113 | "optimizer": { 114 | "name": "adam", 115 | "beta_1": 0.9, 116 | "beta_2": 0.999, 117 | "epsilon": 1e-08, 118 | "decay": 0.0, 119 | "momentum": 0.9, 120 | "nesterov": false 121 | } 122 | } 123 | } -------------------------------------------------------------------------------- /configs/ssd300_vgg16_pascal-voc-2007.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ssd_vgg16", 4 | "input_size": 300, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "base_network_trainable": false, 8 | "default_boxes": { 9 | "extra_box_for_ar_1": true, 10 | "clip_boxes": true, 11 | "variances": [ 12 | 0.1, 13 | 0.1, 14 | 0.2, 15 | 0.2 16 | ], 17 | "min_scale": 0.1, 18 | "max_scale": 1.05, 19 | "layers": [ 20 | { 21 | "name": "conv4_3", 22 | "size": 38, 23 | "offset": [ 24 | 0.5, 25 | 0.5 26 | ], 27 | "aspect_ratios": [ 28 | 1.0, 29 | 2.0, 30 | 0.5 31 | ] 32 | }, 33 | { 34 | "name": "fc7", 35 | "size": 19, 36 | "offset": [ 37 | 0.5, 38 | 0.5 39 | ], 40 | "aspect_ratios": [ 41 | 1.0, 42 | 2.0, 43 | 0.5, 44 | 3.0, 45 | 0.33 46 | ] 47 | }, 48 | { 49 | "name": "conv8_2", 50 | "size": 10, 51 | "offset": [ 52 | 0.5, 53 | 0.5 54 | ], 55 | "aspect_ratios": [ 56 | 1.0, 57 | 2.0, 58 | 0.5, 59 | 3.0, 60 | 0.33 61 | ] 62 | }, 63 | { 64 | "name": "conv9_2", 65 | "size": 5, 66 | "offset": [ 67 | 0.5, 68 | 0.5 69 | ], 70 | "aspect_ratios": [ 71 | 1.0, 72 | 2.0, 73 | 0.5, 74 | 3.0, 75 | 0.33 76 | ] 77 | }, 78 | { 79 | "name": "conv10_2", 80 | "size": 3, 81 | "offset": [ 82 | 0.5, 83 | 0.5 84 | ], 85 | "aspect_ratios": [ 86 | 1.0, 87 | 2.0, 88 | 0.5 89 | ] 90 | }, 91 | { 92 | "name": "conv11_2", 93 | "size": 1, 94 | "offset": [ 95 | 0.5, 96 | 0.5 97 | ], 98 | "aspect_ratios": [ 99 | 1.0, 100 | 2.0, 101 | 0.5 102 | ] 103 | } 104 | ] 105 | } 106 | }, 107 | "training": { 108 | "match_threshold": 0.5, 109 | "neutral_threshold": 0.3, 110 | "min_negative_boxes": 0, 111 | "negative_boxes_ratio": 3, 112 | "alpha": 1, 113 | "optimizer": { 114 | "name": "adam", 115 | "beta_1": 0.9, 116 | "beta_2": 0.999, 117 | "epsilon": 1e-08, 118 | "decay": 0.0 119 | } 120 | } 121 | } 
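The min_scale/max_scale pair in each config above is not stored per layer; it is stretched into one scale per prediction layer at load time. A minimal sketch of that expansion, mirroring the np.linspace call in display_default_boxes.py further down this dump (the config path is just an example):

    import json
    import numpy as np

    # Expand min_scale/max_scale into one scale per prediction layer.
    with open("configs/ssd300_vgg16.json", "r") as f:
        config = json.load(f)

    default_boxes_config = config["model"]["default_boxes"]
    layers = default_boxes_config["layers"]
    scales = np.linspace(
        default_boxes_config["min_scale"],
        default_boxes_config["max_scale"],
        len(layers)
    )
    for i, layer in enumerate(layers):
        # the deepest layer has no successor, so next_scale falls back to 1
        next_scale = scales[i + 1] if i + 1 < len(layers) else 1
        print(layer["name"], float(scales[i]), float(next_scale))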
-------------------------------------------------------------------------------- /configs/ssd320_mobilenetv2_coco2017-train.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ssd_mobilenetv2", 4 | "input_size": 300, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "width_multiplier": 0.5, 8 | "default_boxes": { 9 | "extra_box_for_ar_1": true, 10 | "clip_boxes": true, 11 | "variances": [ 12 | 0.1, 13 | 0.1, 14 | 0.2, 15 | 0.2 16 | ], 17 | "min_scale": 0.15, 18 | "max_scale": 0.9, 19 | "layers": [ 20 | { 21 | "name": "block_13_expand_relu", 22 | "size": 19, 23 | "offset": [ 24 | 0.5, 25 | 0.5 26 | ], 27 | "aspect_ratios": [ 28 | 1.0, 29 | 2.0, 30 | 0.5, 31 | 3.0, 32 | 0.33 33 | ] 34 | }, 35 | { 36 | "name": "block_16_project_BN", 37 | "size": 10, 38 | "offset": [ 39 | 0.5, 40 | 0.5 41 | ], 42 | "aspect_ratios": [ 43 | 1.0, 44 | 2.0, 45 | 0.5, 46 | 3.0, 47 | 0.33 48 | ] 49 | }, 50 | { 51 | "name": "conv17_2/relu", 52 | "size": 5, 53 | "offset": [ 54 | 0.5, 55 | 0.5 56 | ], 57 | "aspect_ratios": [ 58 | 1.0, 59 | 2.0, 60 | 0.5, 61 | 3.0, 62 | 0.33 63 | ] 64 | }, 65 | { 66 | "name": "conv18_2/relu", 67 | "size": 3, 68 | "offset": [ 69 | 0.5, 70 | 0.5 71 | ], 72 | "aspect_ratios": [ 73 | 1.0, 74 | 2.0, 75 | 0.5, 76 | 3.0, 77 | 0.33 78 | ] 79 | }, 80 | { 81 | "name": "conv19_2/relu", 82 | "size": 2, 83 | "offset": [ 84 | 0.5, 85 | 0.5 86 | ], 87 | "aspect_ratios": [ 88 | 1.0, 89 | 2.0, 90 | 0.5, 91 | 3.0, 92 | 0.33 93 | ] 94 | }, 95 | { 96 | "name": "conv20_2/relu", 97 | "size": 1, 98 | "offset": [ 99 | 0.5, 100 | 0.5 101 | ], 102 | "aspect_ratios": [ 103 | 1.0, 104 | 2.0, 105 | 0.5, 106 | 3.0, 107 | 0.33 108 | ] 109 | } 110 | ] 111 | } 112 | }, 113 | "training": { 114 | "match_threshold": 0.5, 115 | "neutral_threshold": 0.3, 116 | "min_negative_boxes": 0, 117 | "negative_boxes_ratio": 3, 118 | "alpha": 1 119 | } 120 | } -------------------------------------------------------------------------------- /configs/tbpp384_vgg16.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "tbpp_vgg16", 4 | "input_size": 384, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "default_boxes": { 8 | "extra_box_for_ar_1": true, 9 | "clip_boxes": true, 10 | "variances": [ 11 | 0.1, 12 | 0.1, 13 | 0.2, 14 | 0.2 15 | ], 16 | "min_scale": 0.2, 17 | "max_scale": 0.9, 18 | "layers": [ 19 | { 20 | "name": "conv4_3", 21 | "size": 48, 22 | "offset": [ 23 | 0.5, 24 | 0.7 25 | ], 26 | "aspect_ratios": [ 27 | 1, 28 | 2, 29 | 3, 30 | 5, 31 | 0.5, 32 | 0.33, 33 | 0.2 34 | ] 35 | }, 36 | { 37 | "name": "fc7", 38 | "size": 24, 39 | "offset": [ 40 | 0.5, 41 | 0.7 42 | ], 43 | "aspect_ratios": [ 44 | 1, 45 | 2, 46 | 3, 47 | 5, 48 | 0.5, 49 | 0.33, 50 | 0.2 51 | ] 52 | }, 53 | { 54 | "name": "conv8_2", 55 | "size": 12, 56 | "offset": [ 57 | 0.5, 58 | 0.7 59 | ], 60 | "aspect_ratios": [ 61 | 1, 62 | 2, 63 | 3, 64 | 5, 65 | 0.5, 66 | 0.33, 67 | 0.2 68 | ] 69 | }, 70 | { 71 | "name": "conv9_2", 72 | "size": 6, 73 | "offset": [ 74 | 0.5, 75 | 0.7 76 | ], 77 | "aspect_ratios": [ 78 | 1, 79 | 2, 80 | 3, 81 | 5, 82 | 0.5, 83 | 0.33, 84 | 0.2 85 | ] 86 | }, 87 | { 88 | "name": "conv10_2", 89 | "size": 4, 90 | "offset": [ 91 | 0.5, 92 | 0.7 93 | ], 94 | "aspect_ratios": [ 95 | 1, 96 | 2, 97 | 3, 98 | 5, 99 | 0.5, 100 | 0.33, 101 | 0.2 102 | ] 103 | }, 104 | { 105 | "name": "conv11_2", 106 | "size": 2, 107 | "offset": [ 108 | 0.5, 109 | 0.7 110 | ], 111 | 
"aspect_ratios": [ 112 | 1, 113 | 2, 114 | 3, 115 | 5, 116 | 0.5, 117 | 0.33, 118 | 0.2 119 | ] 120 | } 121 | ] 122 | } 123 | }, 124 | "training": { 125 | "match_threshold": 0.5, 126 | "neutral_threshold": 0.3, 127 | "min_negative_boxes": 0, 128 | "negative_boxes_ratio": 3, 129 | "alpha": 0.2 130 | } 131 | } -------------------------------------------------------------------------------- /configs/tbpp768_vgg16.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "tbpp_vgg16", 4 | "input_size": 768, 5 | "l2_regularization": 0.0005, 6 | "kernel_initializer": "he_normal", 7 | "default_boxes": { 8 | "extra_box_for_ar_1": true, 9 | "clip_boxes": true, 10 | "variances": [ 11 | 0.1, 12 | 0.1, 13 | 0.2, 14 | 0.2 15 | ], 16 | "min_scale": 0.2, 17 | "max_scale": 0.9, 18 | "layers": [ 19 | { 20 | "name": "conv4_3", 21 | "size": 48, 22 | "offset": [ 23 | 0.5, 24 | 0.7 25 | ], 26 | "aspect_ratios": [ 27 | 1, 28 | 2, 29 | 3, 30 | 5, 31 | 0.5, 32 | 0.33, 33 | 0.2 34 | ] 35 | }, 36 | { 37 | "name": "fc7", 38 | "size": 24, 39 | "offset": [ 40 | 0.5, 41 | 0.7 42 | ], 43 | "aspect_ratios": [ 44 | 1, 45 | 2, 46 | 3, 47 | 5, 48 | 0.5, 49 | 0.33, 50 | 0.2 51 | ] 52 | }, 53 | { 54 | "name": "conv8_2", 55 | "size": 12, 56 | "offset": [ 57 | 0.5, 58 | 0.7 59 | ], 60 | "aspect_ratios": [ 61 | 1, 62 | 2, 63 | 3, 64 | 5, 65 | 0.5, 66 | 0.33, 67 | 0.2 68 | ] 69 | }, 70 | { 71 | "name": "conv9_2", 72 | "size": 6, 73 | "offset": [ 74 | 0.5, 75 | 0.7 76 | ], 77 | "aspect_ratios": [ 78 | 1, 79 | 2, 80 | 3, 81 | 5, 82 | 0.5, 83 | 0.33, 84 | 0.2 85 | ] 86 | }, 87 | { 88 | "name": "conv10_2", 89 | "size": 4, 90 | "offset": [ 91 | 0.5, 92 | 0.7 93 | ], 94 | "aspect_ratios": [ 95 | 1, 96 | 2, 97 | 3, 98 | 5, 99 | 0.5, 100 | 0.33, 101 | 0.2 102 | ] 103 | }, 104 | { 105 | "name": "conv11_2", 106 | "size": 2, 107 | "offset": [ 108 | 0.5, 109 | 0.7 110 | ], 111 | "aspect_ratios": [ 112 | 1, 113 | 2, 114 | 3, 115 | 5, 116 | 0.5, 117 | 0.33, 118 | 0.2 119 | ] 120 | } 121 | ] 122 | } 123 | }, 124 | "training": { 125 | "match_threshold": 0.5, 126 | "neutral_threshold": 0.3, 127 | "min_negative_boxes": 0, 128 | "negative_boxes_ratio": 3, 129 | "alpha": 0.2 130 | } 131 | } -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import tensorflow as tf 5 | from networks import SSD_VGG16, SSD_MOBILENET, SSD_MOBILENETV2 6 | 7 | SUPPORTED_TYPES = [ 8 | "keras", 9 | "tflite" 10 | ] 11 | 12 | parser = argparse.ArgumentParser( 13 | description='Converts a supported model into tflite.') 14 | parser.add_argument('config', type=str, help='path to config file.') 15 | parser.add_argument('weights', type=str, help='path to the weight file.') 16 | parser.add_argument('output_dir', type=str, help='path to the output folder.') 17 | parser.add_argument('--label_maps', type=str, help='path to label maps file.') 18 | parser.add_argument('--output_type', type=str, 19 | help='the type of the output model. 
One of type: "keras", "tflite"', default="keras")
20 | parser.add_argument('--num_predictions', type=int,
21 |                     help='the number of detections to be output as final detections', default=10)
22 | args = parser.parse_args()
23 | 
24 | assert args.label_maps is not None and os.path.exists(args.label_maps), "label_maps file does not exist"  # guard against None: --label_maps is optional
25 | assert os.path.exists(args.config), "config file does not exist"
26 | assert args.num_predictions > 0, "num_predictions must be larger than zero"
27 | assert args.output_type in SUPPORTED_TYPES, f"{args.output_type} is not supported yet. Please choose one of type {SUPPORTED_TYPES}"
28 | 
29 | if not os.path.exists(args.output_dir):
30 |     os.makedirs(args.output_dir)
31 | 
32 | 
33 | with open(args.config, "r") as config_file:
34 |     config = json.load(config_file)
35 | 
36 | model_config = config["model"]
37 | 
38 | if model_config["name"] == "ssd_vgg16":
39 |     with open(args.label_maps, "r") as file:
40 |         label_maps = [line.strip("\n") for line in file.readlines()]
41 |     model = SSD_VGG16(
42 |         config,
43 |         label_maps,
44 |         is_training=False,
45 |         num_predictions=args.num_predictions)
46 | 
47 | elif model_config["name"] == "ssd_mobilenetv1":
48 |     with open(args.label_maps, "r") as file:
49 |         label_maps = [line.strip("\n") for line in file.readlines()]
50 |     model = SSD_MOBILENET(
51 |         config,
52 |         label_maps,
53 |         is_training=False,
54 |         num_predictions=args.num_predictions)
55 | elif model_config["name"] == "ssd_mobilenetv2":
56 |     with open(args.label_maps, "r") as file:
57 |         label_maps = [line.strip("\n") for line in file.readlines()]
58 |     model = SSD_MOBILENETV2(
59 |         config,
60 |         label_maps,
61 |         is_training=False,
62 |         num_predictions=args.num_predictions)
63 | else:
64 |     print(
65 |         f"model with name {model_config['name']} has not been implemented yet")
66 |     exit()
67 | 
68 | model.load_weights(args.weights)
69 | 
70 | config_file_name = os.path.basename(args.config)
71 | config_file_name = config_file_name[:config_file_name.index(".")]
72 | if args.output_type == "keras":
73 |     model.save(os.path.join(args.output_dir, f"{config_file_name}.h5"))
74 | elif args.output_type == "tflite":
75 |     tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
76 |     tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
77 |     tflite_converter.target_spec.supported_ops = [
78 |         tf.lite.OpsSet.TFLITE_BUILTINS,
79 |         tf.lite.OpsSet.SELECT_TF_OPS,
80 |     ]
81 |     tflite_converter.target_spec.supported_types = [tf.float16]  # tf.float16 is a dtype, not an op set, so it belongs in supported_types
82 |     tflite_model = tflite_converter.convert()
83 |     open(os.path.join(args.output_dir, f"{config_file_name}.tflite"), 'wb').write(
84 |         tflite_model)
85 | 
--------------------------------------------------------------------------------
/custom_layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .decode_tbpp_predictions import DecodeTBPPPredictions
2 | from .decode_ssd_predictions import DecodeSSDPredictions
3 | from .l2_normalization import L2Normalization
4 | from .default_boxes import DefaultBoxes
5 | 
--------------------------------------------------------------------------------
/custom_layers/decode_ssd_predictions.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras.layers import Layer
3 | from utils import ssd_utils
4 | 
5 | 
6 | class DecodeSSDPredictions(Layer):
7 |     def __init__(
8 |         self,
9 |         input_size,
10 |         nms_max_output_size=400,
11 |         confidence_threshold=0.01,
12 |         iou_threshold=0.45,
13 |         num_predictions=10,
14 |         **kwargs
15 |     ):
16 |         self.input_size = input_size
17 | 
self.nms_max_output_size = nms_max_output_size 18 | self.confidence_threshold = confidence_threshold 19 | self.iou_threshold = iou_threshold 20 | self.num_predictions = num_predictions 21 | super(DecodeSSDPredictions, self).__init__(**kwargs) 22 | 23 | def build(self, input_shape): 24 | super(DecodeSSDPredictions, self).build(input_shape) 25 | 26 | def call(self, inputs): 27 | y_pred = ssd_utils.decode_predictions( 28 | y_pred=inputs, 29 | input_size=self.input_size, 30 | nms_max_output_size=self.nms_max_output_size, 31 | confidence_threshold=self.confidence_threshold, 32 | iou_threshold=self.iou_threshold, 33 | num_predictions=self.num_predictions 34 | ) 35 | return y_pred 36 | 37 | def get_config(self): 38 | config = { 39 | 'input_size': self.input_size, 40 | 'nms_max_output_size': self.nms_max_output_size, 41 | 'confidence_threshold': self.confidence_threshold, 42 | 'iou_threshold': self.iou_threshold, 43 | 'num_predictions': self.num_predictions, 44 | } 45 | base_config = super(DecodeSSDPredictions, self).get_config() 46 | return dict(list(base_config.items()) + list(config.items())) 47 | 48 | @classmethod 49 | def from_config(cls, config): 50 | return cls(**config) 51 | -------------------------------------------------------------------------------- /custom_layers/decode_tbpp_predictions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Layer 3 | from utils import textboxes_utils 4 | 5 | 6 | class DecodeTBPPPredictions(Layer): 7 | def __init__( 8 | self, 9 | input_size, 10 | nms_max_output_size=400, 11 | confidence_threshold=0.01, 12 | iou_threshold=0.45, 13 | num_predictions=10, 14 | **kwargs 15 | ): 16 | self.input_size = input_size 17 | self.nms_max_output_size = nms_max_output_size 18 | self.confidence_threshold = confidence_threshold 19 | self.iou_threshold = iou_threshold 20 | self.num_predictions = num_predictions 21 | super(DecodeTBPPPredictions, self).__init__(**kwargs) 22 | 23 | def build(self, input_shape): 24 | super(DecodeTBPPPredictions, self).build(input_shape) 25 | 26 | def call(self, inputs): 27 | y_pred = textboxes_utils.decode_predictions( 28 | y_pred=inputs, 29 | input_size=self.input_size, 30 | nms_max_output_size=self.nms_max_output_size, 31 | confidence_threshold=self.confidence_threshold, 32 | iou_threshold=self.iou_threshold, 33 | num_predictions=self.num_predictions 34 | ) 35 | return y_pred 36 | 37 | def get_config(self): 38 | config = { 39 | 'input_size': self.input_size, 40 | 'nms_max_output_size': self.nms_max_output_size, 41 | 'confidence_threshold': self.confidence_threshold, 42 | 'iou_threshold': self.iou_threshold, 43 | 'num_predictions': self.num_predictions, 44 | } 45 | base_config = super(DecodeTBPPPredictions, self).get_config() 46 | return dict(list(base_config.items()) + list(config.items())) 47 | 48 | @classmethod 49 | def from_config(cls, config): 50 | return cls(**config) 51 | -------------------------------------------------------------------------------- /custom_layers/default_boxes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.layers import Layer 4 | from utils.ssd_utils import get_number_default_boxes, generate_default_boxes_for_feature_map 5 | 6 | 7 | class DefaultBoxes(Layer): 8 | """ A custom keras layer that generates default boxes for a given feature map. 
9 | 
10 |     Args:
11 |     - image_shape: The shape of the input image
12 |     - scale: The current scale for the default box.
13 |     - next_scale: The next scale for the default box.
14 |     - aspect_ratios: The aspect ratios for the default boxes.
15 |     - offset: The offset for the center of the default boxes. Defaults to the center of each grid cell.
16 |     - variances: The normalization values for each bounding box property (cx, cy, width, height).
17 |     - extra_box_for_ar_1: Whether to add an extra box for the default box with aspect ratio 1.
18 |     Returns:
19 |     - A tensor of shape (batch_size, feature_map_size, feature_map_size, num_default_boxes, 8)
20 | 
21 |     Raises:
22 |     - AssertionError: if the feature map height does not equal the feature map width.
23 |     - AssertionError: if the image width does not equal the image height.
24 | 
25 |     Code References:
26 |     - https://github.com/pierluigiferrari/ssd_keras/blob/master/keras_layers/keras_layer_AnchorBoxes.py
27 | 
28 |     Paper References:
29 |     - Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016).
30 |       SSD: Single Shot MultiBox Detector. https://arxiv.org/abs/1512.02325
31 |     """
32 | 
33 |     def __init__(
34 |         self,
35 |         image_shape,
36 |         scale,
37 |         next_scale,
38 |         aspect_ratios,
39 |         variances,
40 |         offset=(0.5, 0.5),
41 |         extra_box_for_ar_1=True,
42 |         clip_boxes=True,
43 |         **kwargs
44 |     ):
45 |         self.image_shape = image_shape
46 |         self.scale = scale
47 |         self.next_scale = next_scale
48 |         self.aspect_ratios = aspect_ratios
49 |         self.extra_box_for_ar_1 = extra_box_for_ar_1
50 |         self.clip_boxes = clip_boxes  # no trailing comma here: it would silently wrap the flag in a tuple
51 |         self.variances = variances
52 |         self.offset = offset
53 |         super(DefaultBoxes, self).__init__(**kwargs)
54 | 
55 |     def build(self, input_shape):
56 |         _, feature_map_height, feature_map_width, _ = input_shape
57 |         image_height, image_width, _ = self.image_shape
58 | 
59 |         assert feature_map_height == feature_map_width, "feature map width must be equal to feature map height"
60 |         assert image_height == image_width, "image width must be equal to image height"
61 | 
62 |         self.feature_map_size = min(feature_map_height, feature_map_width)
63 |         self.image_size = min(image_height, image_width)
64 |         super(DefaultBoxes, self).build(input_shape)
65 | 
66 |     def call(self, inputs):
67 |         default_boxes = generate_default_boxes_for_feature_map(
68 |             feature_map_size=self.feature_map_size,
69 |             image_size=self.image_size,
70 |             offset=self.offset,
71 |             scale=self.scale,
72 |             next_scale=self.next_scale,
73 |             aspect_ratios=self.aspect_ratios,
74 |             variances=self.variances,
75 |             extra_box_for_ar_1=self.extra_box_for_ar_1,
76 |             clip_boxes=self.clip_boxes,
77 |         )
78 |         default_boxes = np.expand_dims(default_boxes, axis=0)
79 |         default_boxes = tf.constant(default_boxes, dtype='float32')
80 |         default_boxes = tf.tile(default_boxes, (tf.shape(inputs)[0], 1, 1, 1, 1))  # repeat the boxes once per batch item
81 |         return default_boxes
82 | 
83 |     def get_config(self):
84 |         config = {
85 |             "image_shape": self.image_shape,
86 |             "scale": self.scale,
87 |             "next_scale": self.next_scale,
88 |             "aspect_ratios": self.aspect_ratios,
89 |             "extra_box_for_ar_1": self.extra_box_for_ar_1,
90 |             "clip_boxes": self.clip_boxes,
91 |             "variances": self.variances,
92 |             "offset": self.offset
93 |             # feature_map_size and image_size are recomputed in build(), so they are
94 |             # not serialized; passing them to __init__ via from_config would fail.
95 |         }
96 |         base_config = super(DefaultBoxes, self).get_config()
97 |         return dict(list(base_config.items()) + list(config.items()))
98 | 
99 |     @classmethod
100 |     def from_config(cls, config):
101 |         return cls(**config)
102 | 
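How many boxes DefaultBoxes emits per grid cell follows directly from aspect_ratios and extra_box_for_ar_1. The repository's own helper, utils/ssd_utils/get_number_default_boxes.py, is not shown in this excerpt; a sketch of the usual SSD convention it presumably implements:

    # Sketch of the standard SSD box count per feature-map cell; the repo's
    # get_number_default_boxes (not shown in this excerpt) may differ in details.
    def get_number_default_boxes(aspect_ratios, extra_box_for_ar_1=True):
        num_boxes = len(aspect_ratios)
        # SSD adds one extra box with scale sqrt(scale * next_scale) for aspect ratio 1
        if 1.0 in aspect_ratios and extra_box_for_ar_1:
            num_boxes += 1
        return num_boxes

    assert get_number_default_boxes([1.0, 2.0, 0.5]) == 4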
--------------------------------------------------------------------------------
/custom_layers/l2_normalization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Layer
4 | 
5 | 
6 | class L2Normalization(Layer):
7 |     """ A custom layer that performs l2 normalization on its inputs with learnable parameter gamma.
8 |     Note:
9 |     1. This implementation is taken from https://github.com/pierluigiferrari/ssd_keras/blob/master/keras_layers/keras_layer_L2Normalization.py with slight modifications:
10 |     - the axis variable is passed as a parameter instead of being a fixed value
11 |     - K.variable is replaced with tf.Variable
12 |     - fixed a dtype mismatch by specifying dtype=np.float32
13 |     2. get_config & from_config are necessary to make the layer serializable
14 |     3. we need to multiply self.gamma_init with np.ones((input_shape[self.axis],), dtype=np.float32)
15 |     to turn gamma into the shape of (input_shape[self.axis],), which allows us to broadcast those values
16 |     when multiplying with the output in the call function.
17 | 
18 |     Args:
19 |     - gamma_init: The initial scaling parameter. Defaults to 20 following the SSD paper.
20 |     - axis: the axis to apply the scaling to
21 | 
22 |     Returns:
23 |     - A scaled tensor with the same shape as input_shape
24 | 
25 |     Code References:
26 |     - https://github.com/pierluigiferrari/ssd_keras/blob/master/keras_layers/keras_layer_L2Normalization.py
27 | 
28 |     Paper References:
29 |     - Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016).
30 |       SSD: Single Shot MultiBox Detector. https://arxiv.org/abs/1512.02325
31 |     - Liu, W., Rabinovich, A., & Berg, A. C. (2016).
32 |       ParseNet: Looking Wider to See Better. International Conference on Learning Representations (ICLR) 2016.
33 | https://arxiv.org/abs/1506.04579 34 | """ 35 | 36 | def __init__(self, gamma_init=20, axis=-1, **kwargs): 37 | self.axis = axis 38 | self.gamma_init = gamma_init 39 | super(L2Normalization, self).__init__(**kwargs) 40 | 41 | def build(self, input_shape): 42 | gamma = self.gamma_init * np.ones((input_shape[self.axis],), dtype=np.float32) 43 | self.gamma = tf.Variable(gamma, trainable=True) 44 | super(L2Normalization, self).build(input_shape) 45 | 46 | def call(self, inputs): 47 | return tf.math.l2_normalize(inputs, self.axis) * self.gamma 48 | 49 | def get_config(self): 50 | config = {'gamma_init': self.gamma_init, 'axis': self.axis} 51 | base_config = super(L2Normalization, self).get_config() 52 | return dict(list(base_config.items()) + list(config.items())) 53 | 54 | @classmethod 55 | def from_config(cls, config): 56 | return cls(**config) 57 | -------------------------------------------------------------------------------- /data_generators/__init__.py: -------------------------------------------------------------------------------- 1 | from .ssd_data_generator import SSD_DATA_GENERATOR 2 | from .tbpp_data_generator import TBPP_DATA_GENERATOR 3 | -------------------------------------------------------------------------------- /display_default_boxes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | import argparse 5 | import numpy as np 6 | from glob import glob 7 | from xml.dom import minidom 8 | import xml.etree.cElementTree as ET 9 | from pycocotools.coco import COCO 10 | from utils import ssd_utils 11 | 12 | parser = argparse.ArgumentParser(description='Displays default boxes in a selected image.') 13 | parser.add_argument('config', type=str, help='path to config file.') 14 | parser.add_argument('image', type=str, help='path to image file.') 15 | args = parser.parse_args() 16 | 17 | print("loading config file") 18 | with open(args.config, "r") as config_file: 19 | config = json.load(config_file) 20 | 21 | 22 | model_config = config["model"] 23 | default_boxes_config = model_config["default_boxes"] 24 | input_size = model_config["input_size"] 25 | extra_box_for_ar_1 = default_boxes_config["extra_box_for_ar_1"] 26 | clip_boxes = default_boxes_config["clip_boxes"] 27 | 28 | print("loading image file") 29 | image = cv2.imread(args.image) 30 | image = cv2.resize(image, (input_size, input_size)) 31 | 32 | print("generating default boxes") 33 | scales = np.linspace( 34 | default_boxes_config["min_scale"], 35 | default_boxes_config["max_scale"], 36 | len(default_boxes_config["layers"]) 37 | ) 38 | mbox_conf_layers = [] 39 | mbox_loc_layers = [] 40 | mbox_default_boxes_layers = [] 41 | for i, layer in enumerate(default_boxes_config["layers"]): 42 | temp_image = image.copy() 43 | print(f"displaying default boxes for layer: {layer['name']}") 44 | layer_default_boxes = ssd_utils.generate_default_boxes_for_feature_map( 45 | feature_map_size=layer["size"], 46 | image_size=input_size, 47 | offset=layer["offset"], 48 | scale=scales[i], 49 | next_scale=scales[i+1] if i+1 <= len(default_boxes_config["layers"]) - 1 else 1, 50 | aspect_ratios=layer["aspect_ratios"], 51 | variances=default_boxes_config["variances"], 52 | extra_box_for_ar_1=extra_box_for_ar_1, 53 | clip_boxes=clip_boxes 54 | ) 55 | 56 | grid_size = input_size / layer["size"] 57 | offset = layer["offset"] 58 | offset_x, offset_y = offset 59 | 60 | cx = np.linspace(offset_x * grid_size, input_size - (offset_x * grid_size), layer["size"]) 61 | cy = 
np.linspace(offset_y * grid_size, input_size - (offset_y * grid_size), layer["size"]) 62 | 63 | for n in range(len(cx)): 64 | for m in range(len(cy)): 65 | cv2.circle( 66 | temp_image, 67 | (int(cx[n]), int(cy[m])), 68 | 1, 69 | (255, 0, 0), 70 | 1 71 | ) 72 | 73 | middle_cell = layer['size']//2 74 | target_cell = 0 if middle_cell == 0 else middle_cell 75 | 76 | for default_box in layer_default_boxes[target_cell][target_cell]: 77 | cx = default_box[0] * input_size 78 | cy = default_box[1] * input_size 79 | w = default_box[2] * input_size 80 | h = default_box[3] * input_size 81 | cv2.rectangle( 82 | temp_image, 83 | (int(cx-(w/2)), int(cy-(h/2))), 84 | (int(cx+(w/2)), int(cy+(h/2))), 85 | (0, 255, 0), 86 | 3 87 | ) 88 | cv2.imshow(f"layer: {layer['name']}", temp_image) 89 | if cv2.waitKey(0) == ord('q'): 90 | cv2.destroyAllWindows() 91 | -------------------------------------------------------------------------------- /evaluate.sh: -------------------------------------------------------------------------------- 1 | python evaluate.py \ 2 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-2007/images \ 3 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-2007/labels \ 4 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/output/cp_229_loss-5.06_valloss-5.17.h5 \ 5 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-2007/test.txt \ 6 | --label_maps=/Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-2007/label_maps.txt \ 7 | --output_dir=output/evaluations/cp_229_loss-5.06_valloss-5.17.h5 \ 8 | --iou_threshold=0.5 -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import json 4 | import argparse 5 | import numpy as np 6 | from glob import glob 7 | from utils import inference_utils 8 | 9 | parser = argparse.ArgumentParser( 10 | description='run inference on an input image.') 11 | parser.add_argument('images', type=str, 12 | help='glob string for list of images.') 13 | parser.add_argument('config', type=str, help='path to config file.') 14 | parser.add_argument('weights', type=str, help='path to the weight file.') 15 | parser.add_argument('--label_maps', type=str, help='path to label maps file.') 16 | parser.add_argument('--confidence_threshold', type=float, 17 | help='the confidence score a detection should match in order to be counted.', default=0.9) 18 | parser.add_argument('--num_predictions', type=int, 19 | help='the number of detections to be output as final detections', default=10) 20 | args = parser.parse_args() 21 | 22 | # assert os.path.exists(args.input_image), "config file does not exist" 23 | assert os.path.exists(args.config), "config file does not exist" 24 | assert args.num_predictions > 0, "num_predictions must be larger than zero" 25 | assert args.confidence_threshold > 0, "confidence_threshold must be larger than zero." 26 | assert args.confidence_threshold <= 1, "confidence_threshold must be smaller than or equal to 1." 
27 | with open(args.config, "r") as config_file:
28 |     config = json.load(config_file)
29 | 
30 | input_size = config["model"]["input_size"]
31 | model_config = config["model"]
32 | 
33 | if model_config["name"] == "ssd_vgg16":
34 |     model, process_input_fn, label_maps = inference_utils.ssd_vgg16(config, args)
35 | elif model_config["name"] == "ssd_mobilenetv1":
36 |     model, process_input_fn, label_maps = inference_utils.ssd_mobilenetv1(config, args)
37 | elif model_config["name"] == "ssd_mobilenetv2":
38 |     model, process_input_fn, label_maps = inference_utils.ssd_mobilenetv2(config, args)
39 | elif model_config["name"] == "tbpp_vgg16":
40 |     model, process_input_fn, label_maps = inference_utils.tbpp_vgg16(config, args)
41 | else:
42 |     print(
43 |         f"model with name {model_config['name']} has not been implemented yet")
44 |     exit()
45 | 
46 | model.load_weights(args.weights)
47 | 
48 | 
49 | for idx, input_image in enumerate(list(glob(args.images))):
50 |     image = cv2.imread(input_image)  # read image in bgr format
51 |     # image = cv2.resize(image, (0, 0), fx=0.3, fy=0.3)
52 |     image = np.array(image, dtype=np.float32)  # np.float is deprecated in numpy; use an explicit dtype
53 |     image = np.uint8(image)
54 | 
55 |     display_image = image.copy()
56 |     image_height, image_width, _ = image.shape
57 |     height_scale, width_scale = input_size/image_height, input_size/image_width
58 | 
59 |     image = cv2.resize(image, (input_size, input_size))
60 |     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
61 |     image = process_input_fn(image)
62 | 
63 |     image = np.expand_dims(image, axis=0)
64 |     y_pred = model.predict(image)
65 | 
66 |     for i, pred in enumerate(y_pred[0]):
67 |         classname = label_maps[int(pred[0]) - 1].upper()
68 |         confidence_score = pred[1]
69 | 
70 |         score = f"{'%.2f' % (confidence_score * 100)}%"
71 |         print(f"-- {classname}: {score}")
72 | 
73 |         if confidence_score <= 1 and confidence_score > args.confidence_threshold:
74 |             xmin = max(int(pred[2] / width_scale), 1)
75 |             ymin = max(int(pred[3] / height_scale), 1)
76 |             xmax = min(int(pred[4] / width_scale), image_width-1)
77 |             ymax = min(int(pred[5] / height_scale), image_height-1)
78 | 
79 |             cv2.putText(
80 |                 display_image,
81 |                 classname,
82 |                 (int(xmin), int(ymin)),
83 |                 cv2.FONT_HERSHEY_PLAIN,
84 |                 1,
85 |                 (100, 100, 255),
86 |                 1,
87 |                 2
88 |             )
89 | 
90 |             cv2.rectangle(
91 |                 display_image,
92 |                 (xmin, ymin),
93 |                 (xmax, ymax),
94 |                 (255, 0, 0),
95 |                 2
96 |             )
97 | 
98 |     print("\n")
99 | 
100 |     cv2.imshow("output", display_image)
101 | 
102 |     key = cv2.waitKey(0)  # read the key once; a second waitKey call would block on another keypress
103 |     if key == ord('q'):
104 |         cv2.destroyAllWindows()
105 |     elif key == ord('s'):
106 |         print("saving sample")
107 |         cv2.destroyAllWindows()
--------------------------------------------------------------------------------
/inference.sh:
--------------------------------------------------------------------------------
1 | python inference.py \
2 |     "data/pascal-voc-2007/images/*" \
3 |     configs/ssd300_vgg16_pascal-voc-2007.json \
4 |     /Users/socretlee/Google\ Drive/1-projects/ssd300_vgg16_pascal-voc-2007_trainval/cp_166_loss-5.24_valloss-5.99.h5 \
5 |     --label_maps=/Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-2007/label_maps.txt \
6 |     --confidence_threshold=0.8 \
7 |     --num_predictions=100
--------------------------------------------------------------------------------
/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from .smooth_l1_loss import SMOOTH_L1_LOSS
2 | from .softmax_loss import SOFTMAX_LOSS
3 | from .ssd_loss import SSD_LOSS
4 | from .tbpp_loss import TBPP_LOSS
5 | 
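The smooth L1 loss defined next switches from a quadratic to a linear penalty at |x| = 1, which keeps gradients bounded for large regression errors. A small NumPy check of that piecewise definition:

    import numpy as np

    # smooth_l1(x) = 0.5 * x**2  if |x| < 1
    #              = |x| - 0.5   otherwise
    # (matches the tf.where branch in smooth_l1_loss.py below)
    def smooth_l1(x):
        x = np.abs(x)
        return np.where(x < 1.0, 0.5 * x ** 2, x - 0.5)

    print(smooth_l1(np.array([0.5, 1.0, 2.0])))  # [0.125 0.5 1.5]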
--------------------------------------------------------------------------------
/losses/smooth_l1_loss.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | 
4 | class SMOOTH_L1_LOSS:
5 |     """ Compute smooth l1 loss between the predicted bounding boxes and the ground truth bounding boxes.
6 | 
7 |     Args:
8 |     - y_true: The ground truth bounding boxes.
9 |     - y_pred: The predicted bounding boxes.
10 | 
11 |     Code References:
12 |     - https://github.com/pierluigiferrari/ssd_keras/blob/master/keras_loss_function/keras_ssd_loss.py
13 | 
14 |     Paper References:
15 |     - Girshick, R. (2015). Fast R-CNN. https://arxiv.org/pdf/1504.08083.pdf
16 |     - Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016).
17 |       SSD: Single Shot MultiBox Detector. https://arxiv.org/abs/1512.02325
18 |     """
19 | 
20 |     def compute(self, y_true, y_pred):
21 |         abs_loss = tf.abs(y_true - y_pred)
22 |         square_loss = 0.5 * (y_true - y_pred) ** 2
23 |         res = tf.where(tf.less(abs_loss, 1.0), square_loss, abs_loss - 0.5)  # 0.5*x^2 if |x| < 1 else |x| - 0.5
24 |         return tf.reduce_sum(res, axis=-1)
25 | 
--------------------------------------------------------------------------------
/losses/softmax_loss.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | 
4 | class SOFTMAX_LOSS:
5 |     """ Calculates the softmax loss between the predicted classes and ground truth classes.
6 | 
7 |     Args:
8 |     - y_true: The ground truth classes.
9 |     - y_pred: The predicted classes.
10 | 
11 |     Code Reference:
12 |     - https://github.com/pierluigiferrari/ssd_keras/blob/master/keras_loss_function/keras_ssd_loss.py
13 |     """
14 | 
15 |     def compute(self, y_true, y_pred):
16 |         y_pred = tf.maximum(y_pred, 1e-15)  # guard against log(0)
17 |         return -1 * tf.reduce_sum(y_true * tf.math.log(y_pred), axis=-1)
18 | 
--------------------------------------------------------------------------------
/losses/ssd_loss.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from .smooth_l1_loss import SMOOTH_L1_LOSS
3 | from .softmax_loss import SOFTMAX_LOSS
4 | 
5 | 
6 | class SSD_LOSS:
7 |     """ Loss function as defined in the SSD paper.
8 | 
9 |     Args:
10 |     - alpha: weight term from the SSD paper. Defaults to 1.
11 |     - min_negative_boxes: the minimum number of negative boxes allowed in the loss calculation. Defaults to 0.
12 |     - negative_boxes_ratio: the ratio of negative boxes to positive boxes. Defaults to 3 (three negatives for every positive box).
13 | 
14 |     Returns:
15 |     - A tensor of shape (batch_size,) where each item in the tensor represents the loss for each batch item.
16 | 
17 |     Paper References:
18 |     - Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016).
19 |       SSD: Single Shot MultiBox Detector. https://arxiv.org/abs/1512.02325
20 | 
21 |     Code References:
22 |     - https://github.com/pierluigiferrari/ssd_keras/blob/master/keras_loss_function/keras_ssd_loss.py
23 |     """
24 | 
25 |     def __init__(
26 |         self,
27 |         alpha=1,
28 |         min_negative_boxes=0,
29 |         negative_boxes_ratio=3,
30 |     ):
31 |         self.alpha = alpha
32 |         self.min_negative_boxes = min_negative_boxes
33 |         self.negative_boxes_ratio = negative_boxes_ratio
34 |         self.smooth_l1_loss = SMOOTH_L1_LOSS()
35 |         self.softmax_loss = SOFTMAX_LOSS()
36 | 
37 |     def compute(self, y_true, y_pred):
38 |         # calculate smooth l1 loss and softmax loss for all boxes
39 |         batch_size = tf.shape(y_true)[0]
40 |         num_boxes = tf.shape(y_true)[1]
41 |         # layout per default box: [one-hot classes..., 4 encoded offsets, 8 default-box entries]
42 |         bbox_true = y_true[:, :, -12:-8]
43 |         bbox_pred = y_pred[:, :, -12:-8]
44 |         class_true = y_true[:, :, :-12]
45 |         class_pred = y_pred[:, :, :-12]
46 |         #
47 |         regression_loss = self.smooth_l1_loss.compute(bbox_true, bbox_pred)
48 |         classification_loss = self.softmax_loss.compute(class_true, class_pred)
49 |         # the background column (index 0) marks negatives; any other class marks positives
50 |         negatives = class_true[:, :, 0]  # (batch_size, num_boxes)
51 |         positives = tf.reduce_max(class_true[:, :, 1:], axis=-1)  # (batch_size, num_boxes)
52 |         num_positives = tf.cast(tf.reduce_sum(positives), tf.int32)
53 |         # the regression loss only counts positive (matched) boxes
54 |         pos_regression_loss = tf.reduce_sum(regression_loss * positives, axis=-1)
55 |         pos_classification_loss = tf.reduce_sum(classification_loss * positives, axis=-1)
56 |         # hard negative mining: keep only the highest-loss negatives, at most negative_boxes_ratio times the positives
57 |         neg_classification_loss = classification_loss * negatives
58 |         num_neg_classification_loss = tf.math.count_nonzero(neg_classification_loss, dtype=tf.int32)
59 |         num_neg_classification_loss_keep = tf.minimum(
60 |             tf.maximum(self.negative_boxes_ratio * num_positives, self.min_negative_boxes),
61 |             num_neg_classification_loss
62 |         )
63 | 
64 |         def f1():
65 |             return tf.zeros([batch_size])
66 | 
67 |         def f2():
68 |             neg_classification_loss_1d = tf.reshape(neg_classification_loss, [-1])
69 |             _, indices = tf.nn.top_k(
70 |                 neg_classification_loss_1d,
71 |                 k=num_neg_classification_loss_keep,
72 |                 sorted=False
73 |             )
74 |             negatives_keep = tf.scatter_nd(
75 |                 indices=tf.expand_dims(indices, axis=1),
76 |                 updates=tf.ones_like(indices, dtype=tf.int32),
77 |                 shape=tf.shape(neg_classification_loss_1d)
78 |             )
79 |             negatives_keep = tf.cast(tf.reshape(negatives_keep, [batch_size, num_boxes]), tf.float32)
80 |             neg_class_loss = tf.reduce_sum(classification_loss * negatives_keep, axis=-1)
81 |             return neg_class_loss
82 | 
83 |         neg_classification_loss = tf.cond(tf.equal(num_neg_classification_loss, tf.constant(0)), f1, f2)
84 |         classification_loss = pos_classification_loss + neg_classification_loss
85 | 
86 |         total = (classification_loss + self.alpha * pos_regression_loss) / tf.maximum(1.0, tf.cast(num_positives, tf.float32))
87 |         total = total * tf.cast(batch_size, tf.float32)
88 |         return total
89 | 
--------------------------------------------------------------------------------
/losses/tbpp_loss.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from .smooth_l1_loss import SMOOTH_L1_LOSS
3 | from .softmax_loss import SOFTMAX_LOSS
4 | 
5 | 
6 | class TBPP_LOSS:
7 |     """ Loss function as defined in the TextBoxes++ paper, which follows the SSD loss formulation.
8 | 
9 |     Args:
10 |     - alpha: weight term from the SSD paper. Defaults to 1.
11 |     - min_negative_boxes: the minimum number of negative boxes allowed in the loss calculation. Defaults to 0.
12 |     - negative_boxes_ratio: the ratio of negative boxes to positive boxes. Defaults to 3 (three negatives for every positive box).
13 | 
14 |     Returns:
15 |     - A tensor of shape (batch_size,) where each item in the tensor represents the loss for each batch item.
16 | 
17 |     Paper References:
18 |     - Liao, M., Shi, B., & Bai, X. (2018). TextBoxes++: A Single-Shot Oriented Scene Text Detector. https://arxiv.org/abs/1801.02765
19 |     """
20 | 
21 |     def __init__(
22 |         self,
23 |         alpha=1,
24 |         min_negative_boxes=0,
25 |         negative_boxes_ratio=3,
26 |     ):
27 |         self.alpha = alpha
28 |         self.min_negative_boxes = min_negative_boxes
29 |         self.negative_boxes_ratio = negative_boxes_ratio
30 |         self.smooth_l1_loss = SMOOTH_L1_LOSS()
31 |         self.softmax_loss = SOFTMAX_LOSS()
32 | 
33 |     def compute(self, y_true, y_pred):
34 |         # calculate smooth l1 loss and softmax loss for all boxes
35 |         batch_size = tf.shape(y_true)[0]
36 |         num_boxes = tf.shape(y_true)[1]
37 |         # y_true carries 8 trailing default-box entries that y_pred does not, hence the different slices
38 |         textboxes_true = y_true[:, :, -20:-8]
39 |         textboxes_pred = y_pred[:, :, -12:]
40 |         class_true = y_true[:, :, :-20]
41 |         class_pred = y_pred[:, :, :-12]
42 |         #
43 |         regression_loss = self.smooth_l1_loss.compute(textboxes_true, textboxes_pred)
44 |         # tf.print(regression_loss[0, 0])
45 |         classification_loss = self.softmax_loss.compute(class_true, class_pred)
46 |         # tf.print(classification_loss[0, 0])
47 |         # the background column (index 0) marks negatives; any other class marks positives
48 |         negatives = class_true[:, :, 0]  # (batch_size, num_boxes)
49 |         positives = tf.reduce_max(class_true[:, :, 1:], axis=-1)  # (batch_size, num_boxes)
50 |         num_positives = tf.cast(tf.reduce_sum(positives), tf.int32)
51 |         #
52 |         pos_regression_loss = tf.reduce_sum(regression_loss * positives, axis=-1)
53 |         pos_classification_loss = tf.reduce_sum(classification_loss * positives, axis=-1)
54 |         # hard negative mining, as in ssd_loss.py
55 |         neg_classification_loss = classification_loss * negatives
56 |         num_neg_classification_loss = tf.math.count_nonzero(neg_classification_loss, dtype=tf.int32)
57 |         num_neg_classification_loss_keep = tf.minimum(
58 |             tf.maximum(self.negative_boxes_ratio * num_positives, self.min_negative_boxes),
59 |             num_neg_classification_loss
60 |         )
61 | 
62 |         def f1():
63 |             return tf.zeros([batch_size])
64 | 
65 |         def f2():
66 |             neg_classification_loss_1d = tf.reshape(neg_classification_loss, [-1])
67 |             _, indices = tf.nn.top_k(
68 |                 neg_classification_loss_1d,
69 |                 k=num_neg_classification_loss_keep,
70 |                 sorted=False
71 |             )
72 |             negatives_keep = tf.scatter_nd(
73 |                 indices=tf.expand_dims(indices, axis=1),
74 |                 updates=tf.ones_like(indices, dtype=tf.int32),
75 |                 shape=tf.shape(neg_classification_loss_1d)
76 |             )
77 |             negatives_keep = tf.cast(tf.reshape(negatives_keep, [batch_size, num_boxes]), tf.float32)
78 |             neg_class_loss = tf.reduce_sum(classification_loss * negatives_keep, axis=-1)
79 |             return neg_class_loss
80 | 
81 |         neg_classification_loss = tf.cond(tf.equal(num_neg_classification_loss, tf.constant(0)), f1, f2)
82 |         classification_loss = pos_classification_loss + neg_classification_loss
83 | 
84 |         total = (classification_loss + self.alpha * pos_regression_loss) / tf.maximum(1.0, tf.cast(num_positives, tf.float32))
85 |         total = total * tf.cast(batch_size, tf.float32)
86 |         return total
87 | 
--------------------------------------------------------------------------------
/networks/__init__.py:
--------------------------------------------------------------------------------
1 | from .ssd_vgg16 import SSD_VGG16
2 | from .ssd_mobilenet import SSD_MOBILENET
3 | from .ssd_mobilenetv2 import SSD_MOBILENETV2
4 | from .tbpp_vgg16 import TBPP_VGG16
5 | 
--------------------------------------------------------------------------------
/networks/base_networks/__init__.py:
/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .ssd_vgg16 import SSD_VGG16 2 | from .ssd_mobilenet import SSD_MOBILENET 3 | from .ssd_mobilenetv2 import SSD_MOBILENETV2 4 | from .tbpp_vgg16 import TBPP_VGG16 5 | -------------------------------------------------------------------------------- /networks/base_networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .truncated_vgg16 import TRUNCATED_VGG16 2 | -------------------------------------------------------------------------------- /networks/base_networks/truncated_vgg16.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import Model 2 | from tensorflow.keras.layers import Conv2D, Input, MaxPooling2D 3 | from tensorflow.python.keras.utils import data_utils 4 | 5 | 6 | WEIGHTS_PATH_NO_TOP = ('https://storage.googleapis.com/tensorflow/' 7 | 'keras-applications/vgg16/' 8 | 'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5') 9 | 10 | 11 | def TRUNCATED_VGG16( 12 | input_shape=None, 13 | kernel_initializer=None, 14 | kernel_regularizer=None, 15 | ): 16 | """ A truncated version of VGG16 configuration D: blocks 1 to 5 up to conv5_3, without pool5 or the fully connected layers. ImageNet weights are loaded by layer name. 17 | """ 18 | input_layer = Input(shape=input_shape, name="input") 19 | # block 1 20 | conv1_1 = Conv2D( 21 | 64, (3, 3), activation='relu', padding='same', name='block1_conv1', 22 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(input_layer) 23 | conv1_2 = Conv2D( 24 | 64, (3, 3), activation='relu', padding='same', name='block1_conv2', 25 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(conv1_1) 26 | pool1 = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool', padding="same")(conv1_2) 27 | 28 | # block 2 29 | conv2_1 = Conv2D( 30 | 128, (3, 3), activation='relu', padding='same', name='block2_conv1', 31 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(pool1) 32 | conv2_2 = Conv2D( 33 | 128, (3, 3), activation='relu', padding='same', name='block2_conv2', 34 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(conv2_1) 35 | pool2 = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool', padding="same")(conv2_2) 36 | 37 | # block 3 38 | conv3_1 = Conv2D( 39 | 256, (3, 3), activation='relu', padding='same', name='block3_conv1', 40 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(pool2) 41 | conv3_2 = Conv2D( 42 | 256, (3, 3), activation='relu', padding='same', name='block3_conv2', 43 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(conv3_1) 44 | conv3_3 = Conv2D( 45 | 256, (3, 3), activation='relu', padding='same', name='block3_conv3', 46 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(conv3_2) 47 | pool3 = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool', padding="same")(conv3_3) 48 | 49 | # block 4 50 | conv4_1 = Conv2D( 51 | 512, (3, 3), activation='relu', padding='same', name='block4_conv1', 52 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(pool3) 53 | conv4_2 = Conv2D( 54 | 512, (3, 3), activation='relu', padding='same', name='block4_conv2', 55 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(conv4_1) 56 | conv4_3 = Conv2D( 57 | 512, (3, 3), activation='relu', padding='same', name='block4_conv3', 58 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(conv4_2) 59 | pool4 = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool', padding="same")(conv4_3) 60 | 61 | # block 5 62 | conv5_1 = Conv2D( 63 | 512, (3, 3), activation='relu', padding='same', name='block5_conv1', 64 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(pool4) 65 | conv5_2 = Conv2D( 66 | 512, (3, 3), activation='relu', padding='same', name='block5_conv2', 67 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(conv5_1) 68 | conv5_3 = Conv2D( 69 | 512, (3, 3), activation='relu', padding='same', name='block5_conv3', 70 | kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer)(conv5_2) 71 | 72 | model = Model(inputs=input_layer, outputs=conv5_3) 73 | 74 | weights_path = data_utils.get_file( 75 | 'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5', 76 | WEIGHTS_PATH_NO_TOP, 77 | cache_subdir='models', 78 | file_hash='6d6bbae143d832006294945121d1f1fc') 79 | 80 | model.load_weights(weights_path, by_name=True) 81 | 82 | return model 83 | --------------------------------------------------------------------------------
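# A quick usage sketch for the backbone above (the 300x300 input size is an
# illustrative assumption; any size works with the 'same'-padded pools):
import numpy as np
from networks.base_networks import TRUNCATED_VGG16

backbone = TRUNCATED_VGG16(input_shape=(300, 300, 3))
features = backbone.predict(np.zeros((1, 300, 300, 3)))
print(features.shape)  # (1, 19, 19, 512): four 2x2 pools take 300 -> 150 -> 75 -> 38 -> 19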
/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.3.3 2 | numpy==1.19.4 3 | opencv-python==4.4.0.46 4 | Pillow==8.0.1 5 | tensorflow==2.4.0 6 | tensorflow-estimator==2.4.0 7 | tf-estimator-nightly==2.5.0.dev2021010101 8 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import json 4 | import argparse 5 | import tensorflow as tf 6 | from tensorflow.keras.applications import vgg16, mobilenet, mobilenet_v2 7 | import numpy as np 8 | from glob import glob 9 | from networks import SSD_VGG16, SSD_MOBILENET, SSD_MOBILENETV2 10 | from utils import inference_utils, textboxes_utils, command_line_utils 11 | 12 | 13 | parser = argparse.ArgumentParser( 14 | description='run inference on all images in a test set.') 15 | parser.add_argument('test_file', type=str, help='path to the test set file.') 16 | parser.add_argument('images_dir', type=str, help='path to images dir.') 17 | parser.add_argument('labels_dir', type=str, help='path to labels dir.') 18 | parser.add_argument('config', type=str, help='path to config file.') 19 | parser.add_argument('weights', type=str, help='path to weights file.') 20 | parser.add_argument('--label_maps', type=str, help='path to label maps file.') 21 | parser.add_argument('--num_predictions', type=int, 22 | help='the number of detections to be output as final detections', default=10) 23 | parser.add_argument('--output_dir', type=str, 24 | help='path to output directory.', default="output") 25 | args = parser.parse_args() 26 | 27 | assert os.path.exists(args.config), "config file does not exist" 28 | assert args.num_predictions > 0, "num_predictions must be larger than zero" 29 | 30 |
31 | 32 | with open(args.config, "r") as config_file: 33 | config = json.load(config_file) 34 | 35 | if not os.path.exists(args.output_dir): 36 | os.makedirs(args.output_dir) 37 | 38 | input_size = config["model"]["input_size"] 39 | model_config = config["model"] 40 | 41 | if model_config["name"] == "ssd_mobilenetv2": 42 | model, process_input_fn, label_maps = inference_utils.ssd_mobilenetv2( 43 | config, args) 44 | elif model_config["name"] == "ssd_vgg16": 45 | model, process_input_fn, label_maps = inference_utils.ssd_vgg16(config, args) 46 | else: 47 | print( 48 | f"model with name {model_config['name']} has not been implemented yet") 49 | exit() 50 | 51 | model.load_weights(args.weights) 52 | 53 | with open(args.test_file, "r") as test_set_file: 54 | tests = test_set_file.readlines() 55 | for idx, sample in enumerate(tests): 56 | print(f"{idx+1}/{len(tests)}") 57 | image_file, label_file = sample.split(" ") 58 | filename = image_file[:image_file.index(".")] 59 | image = cv2.imread(os.path.join(args.images_dir, image_file)) 60 | 61 | 62 | image_height, image_width, _ = image.shape 63 | height_scale, width_scale = input_size/image_height, input_size/image_width 64 | 65 | image = cv2.resize(image, (input_size, input_size)) 66 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 67 | image = process_input_fn(image) 68 | 69 | image = np.expand_dims(image, axis=0) 70 | y_pred = model.predict(image)[0] 71 | 72 | with open(os.path.join(args.output_dir, f"{filename}.txt"), "w") as outfile: 73 | for i, pred in enumerate(y_pred): 74 | classname = label_maps[int(pred[0]) - 1].lower() 75 | confidence_score = pred[1] 76 | pred[[2, 4]] /= width_scale 77 | pred[[3, 5]] /= height_scale 78 | outfile.write(f"{classname} {confidence_score} {int(pred[2])} {int(pred[3])} {int(pred[4])} {int(pred[5])}\n") --------------------------------------------------------------------------------
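# Note on the files written above (an observation, not repo documentation):
# each line of <filename>.txt has the form
#   <classname> <confidence> <xmin> <ymin> <xmax> <ymax>
# in original-image pixel coordinates, a layout commonly consumed by mAP
# evaluation scripts.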
/test.sh: -------------------------------------------------------------------------------- 1 | python test.py \ 2 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/easy_example_tests/test.txt \ 3 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/easy_example_tests/images \ 4 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/easy_example_tests/labels \ 5 | configs/ssd300_vgg16_pascal-voc-2007.json \ 6 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/cp_275_loss-3.30_valloss-3.84.h5 \ 7 | --label_maps=/Users/socretlee/CodingDrive/other/object-detection-in-keras/data/easy_example_tests/label_maps.txt \ 8 | --output_dir=/Users/socretlee/CodingDrive/other/object-detection-in-keras/output/cp_275_loss-3.30_valloss-3.84.h5 \ 9 | --num_predictions=3 -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from utils import training_utils, command_line_utils 2 | from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, TerminateOnNaN, LearningRateScheduler 3 | import argparse 4 | import json 5 | import os 6 | 7 | 8 | parser = argparse.ArgumentParser( 9 | description='Start the training process of a particular network.') 10 | parser.add_argument('config', type=str, help='path to config file.') 11 | parser.add_argument('images_dir', type=str, help='path to images dir.') 12 | parser.add_argument('labels_dir', type=str, help='path to labels dir.') 13 | # 14 | parser.add_argument('--training_split', type=str, 15 | help='path to training split file.') 16 | parser.add_argument('--validation_split', type=str, 17 | help='path to validation split file.') 18 | # 19 | parser.add_argument('--label_maps', type=str, help='path to label maps file.') 20 | # 21 | parser.add_argument('--checkpoint', type=str, 22 | help='path to checkpoint weight file.') 23 | # 24 | parser.add_argument('--learning_rate', type=float, 25 | help='learning rate used in training.', default=1e-3) 26 | parser.add_argument('--epochs', type=int, 27 | help='the number of epochs to train', default=100) 28 | parser.add_argument('--initial_epoch', type=int, 29 | help='the initial epochs to start from', default=0) 30 | parser.add_argument('--batch_size', type=int, 31 | help='the batch size used in training', default=32) 32 | parser.add_argument('--shuffle', type=command_line_utils.str2bool, nargs='?', 33 | help='whether to shuffle the dataset when creating the batch', default=True) 34 | parser.add_argument('--augment', type=command_line_utils.str2bool, 35 | nargs='?', help='whether to augment training samples', default=False) 36 | parser.add_argument('--schedule_lr', type=command_line_utils.str2bool, 37 | nargs='?', help='whether to use the lr scheduler', default=True) 38 | parser.add_argument('--show_network_structure', type=command_line_utils.str2bool, 39 | nargs='?', help='whether to print out the network structure when constructing the network', default=False) 40 | parser.add_argument('--output_dir', type=str, 41 | help='path to output directory.', default="output") 42 | args = parser.parse_args() 43 | 44 | assert os.path.exists(args.config), "config file does not exist" 45 | assert os.path.exists(args.images_dir), "images_dir does not exist" 46 | assert os.path.exists(args.labels_dir), "labels_dir does not exist" 47 | assert args.epochs > 0, "epochs must be larger than zero" 48 | assert args.batch_size > 0, "batch_size must be larger than 0" 49 | assert args.learning_rate > 0, "learning_rate must be larger than 0" 50 | 51 | if args.label_maps is not None: 52 | assert os.path.exists(args.label_maps), "label_maps file does not exist" 53 | 54 | if not os.path.exists(args.output_dir): 55 | os.makedirs(args.output_dir) 56 | 57 | with open(args.config, "r") as config_file: 58 | config = json.load(config_file) 59 | 60 | model_config = config["model"] 61 | 62 | if model_config["name"] == "ssd_mobilenetv1": 63 | training_utils.ssd_mobilenetv1(config, args) 64 | elif model_config["name"] == "ssd_mobilenetv2": 65 | training_utils.ssd_mobilenetv2(config, args) 66 | elif model_config["name"] == "ssd_vgg16": 67 | # configure callbacks here 68 | callbacks = [ 69 | ModelCheckpoint( 70 | filepath=os.path.join( 71 | args.output_dir, 72 | "cp_{epoch:02d}_loss-{loss:.2f}.h5" if args.validation_split is None else "cp_{epoch:02d}_loss-{loss:.2f}_valloss-{val_loss:.2f}.h5" 73 | ), 74 | save_weights_only=False, 75 | save_best_only=True, 76 | monitor='loss' if args.validation_split is None else 'val_loss', 77 | mode='min' 78 | ), 79 | CSVLogger( 80 | os.path.join(args.output_dir, "training.csv"), 81 | append=False 82 | ), 83 | TerminateOnNaN(), 84 | ] 85 | 86 | if (args.schedule_lr): 87 | def lr_schedule(epoch): 88 | if epoch < 108: 89 | return args.learning_rate 90 | elif epoch < 146: 91 | return 0.0001 92 | else: 93 | return 0.00001 94 | callbacks.append(LearningRateScheduler(schedule=lr_schedule, verbose=1))
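# With the 1e-3 rate that train.sh passes, this schedule holds 1e-3 for
# epochs 0-107, drops to 1e-4 for epochs 108-145, and to 1e-5 afterwards,
# mirroring the stepwise decay commonly used for SSD-style training.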
"tbpp_vgg16": 98 | training_utils.tbpp_vgg16(config, args) 99 | else: 100 | print( 101 | f"model with name ${model_config['name']} has not been implemented yet") 102 | exit() 103 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | python train.py \ 2 | configs/ssd300_vgg16_pascal-voc-07-12.json \ 3 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-07-12/images \ 4 | /Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-07-12/labels \ 5 | --training_split=/Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-07-12/train.txt \ 6 | --validation_split=/Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-07-12/val.txt \ 7 | --label_maps=/Users/socretlee/CodingDrive/other/object-detection-in-keras/data/pascal-voc-07-12/label_maps.txt \ 8 | --learning_rate=0.001 \ 9 | --epochs=100 \ 10 | --batch_size=3 \ 11 | --shuffle=True \ 12 | --augment=True \ 13 | --output_dir=output/ssd300_vgg16_pascal-voc-2007 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .one_hot_class_label import one_hot_class_label 2 | -------------------------------------------------------------------------------- /utils/augmentation_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .random_brightness import random_brightness 2 | from .random_contrast import random_contrast 3 | from .random_crop_quad import random_crop_quad 4 | from .random_crop import random_crop 5 | from .random_expand_quad import random_expand_quad 6 | from .random_expand import random_expand 7 | from .random_horizontal_flip_quad import random_horizontal_flip_quad 8 | from .random_horizontal_flip import random_horizontal_flip 9 | from .random_hue import random_hue 10 | from .random_lighting_noise import random_lighting_noise 11 | from .random_saturation import random_saturation 12 | from .random_vertical_flip_quad import random_vertical_flip_quad 13 | from .random_vertical_flip import random_vertical_flip 14 | from .resize_to_fixed_size import resize_to_fixed_size 15 | from .bboxes_filter import bboxes_filter 16 | -------------------------------------------------------------------------------- /utils/augmentation_utils/bboxes_filter.py: -------------------------------------------------------------------------------- 1 | def bboxes_filter(): 2 | """ 3 | """ 4 | def _augment( 5 | image, 6 | bboxes, 7 | classes=None 8 | ): 9 | return image, bboxes, classes 10 | return _augment 11 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_brightness.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | 5 | 6 | def random_brightness( 7 | min_delta=-32, 8 | max_delta=32, 9 | p=0.5 10 | ): 11 | """ Changes the brightness of an image by adding/subtracting a delta value to/from each pixel. 12 | The image format is assumed to be BGR to match Opencv's standard. 13 | 14 | Args: 15 | - image: numpy array representing the input image. 16 | - bboxes: numpy array representing the bounding boxes. 17 | - classes: the list of classes associating with each bounding boxes. 18 | - min_delta: minimum delta value. 
19 | - max_delta: maximum delta value. 20 | - p: The probability with which the brightness is changed 21 | 22 | Returns: 23 | - image: The modified image 24 | - bboxes: The unmodified bounding boxes 25 | - classes: The unmodified bounding boxes 26 | 27 | Raises: 28 | - min_delta is less than -255.0 29 | - max_delta is larger than 255.0 30 | - p is smaller than zero 31 | - p is larger than 1 32 | 33 | Webpage References: 34 | - https://www.telesens.co/2018/06/28/data-augmentation-in-ssd/ 35 | 36 | Code References: 37 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 38 | """ 39 | assert min_delta >= -255.0, "min_delta must be larger than -255.0" 40 | assert max_delta <= 255.0, "max_delta must be less than 255.0" 41 | assert p >= 0, "p must be larger than or equal to zero" 42 | assert p <= 1, "p must be less than or equal to 1" 43 | 44 | def _augment(image, bboxes=None, classes=None): 45 | if (random.random() > p): 46 | return image, bboxes, classes 47 | 48 | temp_image = image.copy() 49 | d = random.uniform(min_delta, max_delta) 50 | temp_image += d 51 | temp_image = np.clip(temp_image, 0, 255) 52 | return temp_image, bboxes, classes 53 | 54 | return _augment 55 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_contrast.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | 5 | 6 | def random_contrast( 7 | min_delta=0.5, 8 | max_delta=1.5, 9 | p=0.5 10 | ): 11 | """ Changes the contrast of an image by increasing/decreasing each pixel by a factor of delta. 12 | The image format is assumed to be BGR to match Opencv's standard. 13 | 14 | Args: 15 | - image: numpy array representing the input image. 16 | - bboxes: numpy array representing the bounding boxes. 17 | - classes: the list of classes associating with each bounding boxes. 18 | - min_delta: minimum delta value. 19 | - max_delta: maximum delta value. 
20 | - p: The probability with which the contrast is changed 21 | 22 | Returns: 23 | - image: The modified image 24 | - bboxes: The unmodified bounding boxes 25 | - classes: The unmodified classes 26 | 27 | Raises: 28 | - min_delta is less than 0 29 | - max_delta is less than min_delta 30 | - p is smaller than zero 31 | - p is larger than 1 32 | 33 | Webpage References: 34 | - https://www.telesens.co/2018/06/28/data-augmentation-in-ssd/ 35 | 36 | Code References: 37 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 38 | """ 39 | assert min_delta >= 0.0, "min_delta must be larger than zero" 40 | assert max_delta >= min_delta, "max_delta must be larger than min_delta" 41 | assert p >= 0, "p must be larger than or equal to zero" 42 | assert p <= 1, "p must be less than or equal to 1" 43 | 44 | def _augment( 45 | image, 46 | bboxes=None, 47 | classes=None, 48 | ): 49 | if (random.random() > p): 50 | return image, bboxes, classes 51 | 52 | temp_image = image.copy() 53 | d = random.uniform(min_delta, max_delta) 54 | temp_image *= d 55 | temp_image = np.clip(temp_image, 0, 255) 56 | return temp_image, bboxes, classes 57 | 58 | return _augment 59 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_crop.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | from utils.bbox_utils import iou 5 | 6 | 7 | def random_crop( 8 | min_size=0.1, 9 | max_size=1, 10 | min_ar=1, 11 | max_ar=2, 12 | overlap_modes=[ 13 | None, 14 | [0.1, None], 15 | [0.3, None], 16 | [0.7, None], 17 | [0.9, None], 18 | [None, None], 19 | ], 20 | max_attempts=100, 21 | p=0.5, 22 | ): 23 | """ Randomly crops a patch from the image. 24 | 25 | Args: 26 | - image: numpy array representing the input image. 27 | - bboxes: numpy array representing the bounding boxes. 28 | - classes: the list of classes associated with each bounding box. 29 | - min_size: the minimum size a crop can be 30 | - max_size: the maximum size a crop can be 31 | - min_ar: the minimum aspect ratio a crop can be 32 | - max_ar: the maximum aspect ratio a crop can be 33 | - overlap_modes: the list of overlapping modes the function can randomly choose from. 34 | - max_attempts: the max number of attempts to generate a patch. 35 | 36 | Returns: 37 | - image: the modified image 38 | - bboxes: the modified bounding boxes 39 | - classes: the modified classes 40 | 41 | Webpage References: 42 | - https://www.telesens.co/2018/06/28/data-augmentation-in-ssd/ 43 | 44 | Code References: 45 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 46 | """ 47 | assert p >= 0, "p must be larger than or equal to zero" 48 | assert p <= 1, "p must be less than or equal to 1" 49 | assert min_size > 0, "min_size must be larger than zero." 50 | assert max_size <= 1, "max_size must be less than or equal to one." 51 | assert max_size > min_size, "max_size must be larger than min_size." 52 | assert max_ar > min_ar, "max_ar must be larger than min_ar." 53 | assert max_attempts > 0, "max_attempts must be larger than zero."
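# Note on overlap_modes (a reading of the sampling loop below): each entry is
# a [min_iou, max_iou] pair constraining the IoU between the sampled crop and
# the boxes, None lifts that bound, and drawing the bare None entry skips
# cropping for that sample entirely.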
54 | 55 | def _augment(image, bboxes, classes): 56 | 57 | if (random.random() > p): 58 | return image, bboxes, classes 59 | 60 | height, width, channels = image.shape 61 | overlap_mode = random.choice(overlap_modes) 62 | 63 | if overlap_mode == None: 64 | return image, bboxes, classes 65 | 66 | min_iou, max_iou = overlap_mode 67 | 68 | if min_iou == None: 69 | min_iou = float(-np.inf) 70 | 71 | if max_iou == None: 72 | max_iou = float(np.inf) 73 | 74 | temp_image = image.copy() 75 | 76 | for i in range(max_attempts): 77 | crop_w = random.uniform(min_size * width, max_size * width) 78 | crop_h = random.uniform(min_size * height, max_size * height) 79 | crop_ar = crop_h / crop_w 80 | 81 | if crop_ar < min_ar or crop_ar > max_ar: # crop ar does not match criteria, next attempt 82 | continue 83 | 84 | crop_left = random.uniform(0, width-crop_w) 85 | crop_top = random.uniform(0, height-crop_h) 86 | 87 | crop_rect = np.array([crop_left, crop_top, crop_left + crop_w, crop_top + crop_h], dtype=np.float) 88 | crop_rect = np.expand_dims(crop_rect, axis=0) 89 | crop_rect = np.tile(crop_rect, (bboxes.shape[0], 1)) 90 | 91 | ious = iou(crop_rect, bboxes) 92 | 93 | if ious.min() < min_iou and ious.max() > max_iou: 94 | continue 95 | 96 | bbox_centers = np.zeros((bboxes.shape[0], 2), dtype=np.float) 97 | bbox_centers[:, 0] = (bboxes[:, 0] + bboxes[:, 2]) / 2 98 | bbox_centers[:, 1] = (bboxes[:, 1] + bboxes[:, 3]) / 2 99 | 100 | cx_in_crop = (bbox_centers[:, 0] > crop_left) * (bbox_centers[:, 0] < crop_left + crop_w) 101 | cy_in_crop = (bbox_centers[:, 1] > crop_top) * (bbox_centers[:, 1] < crop_top + crop_h) 102 | boxes_in_crop = cx_in_crop * cy_in_crop 103 | 104 | if not boxes_in_crop.any(): 105 | continue 106 | 107 | temp_image = temp_image[int(crop_top): int(crop_top+crop_h), int(crop_left): int(crop_left+crop_w), :] 108 | temp_classes = np.array(classes, dtype=np.object) 109 | temp_classes = temp_classes[boxes_in_crop] 110 | temp_bboxes = bboxes[boxes_in_crop] 111 | crop_rect = np.array([crop_left, crop_top, crop_left + crop_w, crop_top + crop_h], dtype=np.float) 112 | crop_rect = np.expand_dims(crop_rect, axis=0) 113 | crop_rect = np.tile(crop_rect, (temp_bboxes.shape[0], 1)) 114 | temp_bboxes[:, :2] = np.maximum(temp_bboxes[:, :2], crop_rect[:, :2]) # if bboxes top left is out of crop then use crop's xmin, ymin 115 | temp_bboxes[:, :2] -= crop_rect[:, :2] # translate xmin, ymin to fit crop 116 | temp_bboxes[:, 2:] = np.minimum(temp_bboxes[:, 2:], crop_rect[:, 2:]) 117 | temp_bboxes[:, 2:] -= crop_rect[:, :2] # translate xmax, ymax to fit crop 118 | 119 | return temp_image, temp_bboxes, temp_classes.tolist() 120 | 121 | return image, bboxes, classes 122 | 123 | return _augment 124 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_crop_quad.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | from utils.bbox_utils import iou, object_coverage 5 | from utils.textboxes_utils import get_bboxes_from_quads 6 | 7 | def random_crop_quad( 8 | image, 9 | quads, 10 | classes, 11 | min_size=0.1, 12 | max_size=1, 13 | min_ar=1, 14 | max_ar=2, 15 | overlap_modes=[ 16 | None, 17 | [0.1, None], 18 | [0.3, None], 19 | [0.7, None], 20 | [0.9, None], 21 | [None, None], 22 | ], 23 | max_attempts=100, 24 | p=0.5 25 | ): 26 | """ Randomly crops a patch from the image. 27 | 28 | Args: 29 | - image: numpy array representing the input image. 
30 | - quads: numpy array representing the quads. 31 | - classes: the list of classes associated with each quad. 32 | - min_size: the minimum size a crop can be 33 | - max_size: the maximum size a crop can be 34 | - min_ar: the minimum aspect ratio a crop can be 35 | - max_ar: the maximum aspect ratio a crop can be 36 | - overlap_modes: the list of overlapping modes the function can randomly choose from. 37 | - max_attempts: the max number of attempts to generate a patch. 38 | 39 | Returns: 40 | - image: the modified image 41 | - quads: the modified quads 42 | - classes: the modified classes 43 | """ 44 | assert p >= 0, "p must be larger than or equal to zero" 45 | assert p <= 1, "p must be less than or equal to 1" 46 | assert min_size > 0, "min_size must be larger than zero." 47 | assert max_size <= 1, "max_size must be less than or equals to one." 48 | assert max_size > min_size, "max_size must be larger than min_size." 49 | assert max_ar > min_ar, "max_ar must be larger than min_ar." 50 | assert max_attempts > 0, "max_attempts must be larger than zero." 51 | 52 | if (random.random() > p): 53 | return image, quads, classes 54 | 55 | height, width, channels = image.shape 56 | overlap_mode = random.choice(overlap_modes) 57 | 58 | 59 | if overlap_mode == None: 60 | return image, quads, classes 61 | 62 | bboxes = get_bboxes_from_quads(quads) 63 | 64 | min_iou, max_iou = overlap_mode 65 | 66 | if min_iou == None: 67 | min_iou = float(-np.inf) 68 | 69 | if max_iou == None: 70 | max_iou = float(np.inf) 71 | 72 | temp_image = image.copy() 73 | 74 | for i in range(max_attempts): 75 | crop_w = random.uniform(min_size * width, max_size * width) 76 | crop_h = random.uniform(min_size * height, max_size * height) 77 | crop_ar = crop_h / crop_w 78 | 79 | if crop_ar < min_ar or crop_ar > max_ar: # crop ar does not match criteria, next attempt 80 | continue 81 | 82 | crop_left = random.uniform(0, width-crop_w) 83 | crop_top = random.uniform(0, height-crop_h) 84 | 85 | crop_rect = np.array([crop_left, crop_top, crop_left + crop_w, crop_top + crop_h], dtype=np.float) 86 | crop_rect = np.expand_dims(crop_rect, axis=0) 87 | crop_rect = np.tile(crop_rect, (bboxes.shape[0], 1)) 88 | 89 | ious = iou(crop_rect, bboxes) 90 | obj_coverage = object_coverage(crop_rect, bboxes) 91 | 92 | 93 | if (ious.min() < min_iou and ious.max() > max_iou) or (obj_coverage.min() < min_iou and obj_coverage.max() > max_iou): 94 | continue 95 | 96 | bbox_centers = np.zeros((bboxes.shape[0], 2), dtype=np.float) 97 | bbox_centers[:, 0] = (bboxes[:, 0] + bboxes[:, 2]) / 2 98 | bbox_centers[:, 1] = (bboxes[:, 1] + bboxes[:, 3]) / 2 99 | 100 | cx_in_crop = (bbox_centers[:, 0] > crop_left) * (bbox_centers[:, 0] < crop_left + crop_w) 101 | cy_in_crop = (bbox_centers[:, 1] > crop_top) * (bbox_centers[:, 1] < crop_top + crop_h) 102 | boxes_in_crop = cx_in_crop * cy_in_crop 103 | 104 | if not boxes_in_crop.any(): 105 | continue 106 | 107 | 108 | 109 | 110 | temp_image = temp_image[int(crop_top): int(crop_top+crop_h), int(crop_left): int(crop_left+crop_w), :] 111 | temp_classes = np.array(classes, dtype=np.object) 112 | temp_classes = temp_classes[boxes_in_crop] 113 | temp_bboxes = bboxes[boxes_in_crop] 114 | temp_quads = quads[boxes_in_crop] 115 | crop_rect = np.array([crop_left, crop_top, crop_left + crop_w, crop_top + crop_h], dtype=np.float) 116 | crop_rect = np.expand_dims(crop_rect, axis=0) 117 | crop_rect = np.tile(crop_rect, (temp_bboxes.shape[0], 1)) 118 | temp_quads[:, :, 0] -= crop_left # translate quads into the crop's frame 119 | temp_quads[:, :, 1] -= crop_top 120 | temp_bboxes[:, :2] = np.maximum(temp_bboxes[:, :2], crop_rect[:, :2]) # if bboxes top left is out of crop then use crop's xmin, ymin 121 | temp_bboxes[:, :2] -= crop_rect[:, :2] # translate xmin, ymin to fit crop 122 | temp_bboxes[:, 2:] = np.minimum(temp_bboxes[:, 2:], crop_rect[:, 2:]) 123 | temp_bboxes[:, 2:] -= crop_rect[:, :2] # translate xmax, ymax to fit crop 124 | return temp_image, temp_quads, temp_classes.tolist() 125 | 126 | return image, quads, classes 127 | --------------------------------------------------------------------------------
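# A small usage sketch of the factory pattern shared by random_crop and most
# augmenters in this package (the values below are illustrative):
import numpy as np
from utils.augmentation_utils import random_crop

augment = random_crop(min_size=0.3, max_size=1.0, p=1.0)
image = np.random.uniform(0.0, 255.0, (300, 300, 3))
bboxes = np.array([[50.0, 50.0, 150.0, 150.0]])
image, bboxes, classes = augment(image, bboxes, ["dog"])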
/utils/augmentation_utils/random_expand.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | 5 | def random_expand( 6 | min_ratio=1, 7 | max_ratio=16, 8 | # mean=[0.406, 0.456, 0.485], # BGR 9 | mean=[104, 117, 123], # BGR 10 | 11 | p=0.5 12 | ): 13 | """ Randomly expands an image and bounding boxes by a ratio between min_ratio and max_ratio. The image format is assumed to be BGR to match Opencv's standard. 14 | 15 | Args: 16 | - image: numpy array representing the input image. 17 | - bboxes: numpy array representing the bounding boxes. 18 | - classes: the list of classes associated with each bounding box. 19 | - min_ratio: The minimum value to expand the image. Defaults to 1. 20 | - max_ratio: The maximum value to expand the image. Defaults to 16. 21 | - p: The probability with which the image is expanded 22 | 23 | Returns: 24 | - image: The modified image 25 | - bboxes: The modified bounding boxes 26 | - classes: The unmodified classes 27 | 28 | Raises: 29 | - p is smaller than zero 30 | - p is larger than 1 31 | 32 | Webpage References: 33 | - https://www.telesens.co/2018/06/28/data-augmentation-in-ssd/ 34 | 35 | Code References: 36 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 37 | """ 38 | assert p >= 0, "p must be larger than or equal to zero" 39 | assert p <= 1, "p must be less than or equal to 1" 40 | assert min_ratio > 0, "min_ratio must be larger than zero" 41 | assert max_ratio > min_ratio, "max_ratio must be larger than min_ratio" 42 | 43 | def _augment(image, bboxes, classes): 44 | if (random.random() > p): 45 | return image, bboxes, classes 46 | 47 | height, width, depth = image.shape 48 | ratio = random.uniform(min_ratio, max_ratio) 49 | left = random.uniform(0, width * ratio - width) 50 | top = random.uniform(0, height * ratio - height) 51 | temp_image = np.zeros( 52 | (int(height * ratio), int(width * ratio), depth), 53 | dtype=image.dtype 54 | ) 55 | temp_image[:, :, :] = mean 56 | temp_image[int(top):int(top+height), int(left):int(left+width)] = image 57 | temp_bboxes = bboxes.copy() 58 | temp_bboxes[:, :2] += (int(left), int(top)) 59 | temp_bboxes[:, 2:] += (int(left), int(top)) 60 | return temp_image, temp_bboxes, classes 61 | 62 | return _augment 63 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_expand_quad.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | 5 | def random_expand_quad( 6 | image, 7 | quads, 8 | classes, 9 | min_ratio=1, 10 | max_ratio=4, 11 | mean=[0.406, 0.456, 0.485], # BGR 12 | p=0.5 13 | ): 14 | """ Randomly expands an image and quadrilaterals by a ratio between min_ratio and max_ratio.
The image format is assumed to be BGR to match Opencv's standard. 15 | 16 | Args: 17 | - image: numpy array representing the input image. 18 | - quads: numpy array representing the quadrilaterals. 19 | - classes: the list of classes associating with each quadrilaterals. 20 | - min_ratio: The minimum value to expand the image. Defaults to 1. 21 | - max_ratio: The maximum value to expand the image. Defaults to 4. 22 | - p: The probability with which the image is expanded 23 | 24 | Returns: 25 | - image: The modified image 26 | - quads: The modified quadrilaterals 27 | - classes: The unmodified quadrilaterals 28 | 29 | Raises: 30 | - p is smaller than zero 31 | - p is larger than 1 32 | """ 33 | assert p >= 0, "p must be larger than or equal to zero" 34 | assert p <= 1, "p must be less than or equal to 1" 35 | assert min_ratio > 0, "min_ratio must be larger than zero" 36 | assert max_ratio > min_ratio, "max_ratio must be larger than min_ratio" 37 | 38 | if (random.random() > p): 39 | return image, quads, classes 40 | 41 | height, width, depth = image.shape 42 | ratio = random.uniform(min_ratio, max_ratio) 43 | left = random.uniform(0, width * ratio - width) 44 | top = random.uniform(0, height * ratio - height) 45 | temp_image = np.zeros( 46 | (int(height * ratio), int(width * ratio), depth), 47 | dtype=image.dtype 48 | ) 49 | temp_image[:, :, :] = mean 50 | temp_image[int(top):int(top+height), int(left):int(left+width)] = image 51 | temp_quads = quads.copy() 52 | temp_quads[:, :, 0] = quads[:, :, 0] + int(left) 53 | temp_quads[:, :, 1] = quads[:, :, 1] + int(top) 54 | return temp_image, temp_quads, classes 55 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_horizontal_flip.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import random 4 | 5 | 6 | def random_horizontal_flip(p=0.5): 7 | """ Randomly flipped the image horizontally. The image format is assumed to be BGR to match Opencv's standard. 8 | 9 | Args: 10 | - image: numpy array representing the input image. 11 | - bboxes: numpy array representing the bounding boxes. 12 | - classes: the list of classes associating with each bounding boxes. 
13 | - p: The probability with which the image is flipped horizontally 14 | 15 | Returns: 16 | - image: The modified image 17 | - bboxes: The modified bounding boxes 18 | - classes: The unmodified classes 19 | 20 | Raises: 21 | - p is smaller than zero 22 | - p is larger than 1 23 | 24 | Webpage References: 25 | - https://www.kdnuggets.com/2018/09/data-augmentation-bounding-boxes-image-transforms.html/2 26 | 27 | Code References: 28 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 29 | """ 30 | assert p >= 0, "p must be larger than or equal to zero" 31 | assert p <= 1, "p must be less than or equal to 1" 32 | 33 | def _augment(image, bboxes, classes): 34 | 35 | if (random.random() > p): 36 | return image, bboxes, classes 37 | 38 | temp_bboxes = bboxes.copy() 39 | image_center = np.array(image.shape[:2])[::-1]/2 40 | image_center = np.hstack((image_center, image_center)) 41 | temp_bboxes[:, [0, 2]] += 2*(image_center[[0, 2]] - temp_bboxes[:, [0, 2]]) 42 | boxes_width = abs(temp_bboxes[:, 0] - temp_bboxes[:, 2]) 43 | temp_bboxes[:, 0] -= boxes_width 44 | temp_bboxes[:, 2] += boxes_width 45 | return np.array(cv2.flip(np.uint8(image), 1), dtype=np.float), temp_bboxes, classes 46 | 47 | return _augment 48 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_horizontal_flip_quad.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import random 4 | 5 | 6 | def random_horizontal_flip_quad( 7 | image, 8 | quads, 9 | classes=None, 10 | p=0.5 11 | ): 12 | """ Randomly flips the image horizontally. The image format is assumed to be BGR to match Opencv's standard. 13 | 14 | Args: 15 | - image: numpy array representing the input image. 16 | - quads: numpy array representing the quadrilaterals. 17 | - classes: the list of classes associated with each quadrilateral. 18 | - p: The probability with which the image is flipped horizontally 19 | 20 | Returns: 21 | - image: The modified image 22 | - quads: The modified quadrilaterals 23 | - classes: The unmodified classes 24 | 25 | Raises: 26 | - p is smaller than zero 27 | - p is larger than 1 28 | """ 29 | 30 | assert p >= 0, "p must be larger than or equal to zero" 31 | assert p <= 1, "p must be less than or equal to 1" 32 | 33 | if (random.random() > p): 34 | return image, quads, classes 35 | 36 | temp_quads = quads.copy() 37 | temp_quads[:, :, 0] = image.shape[1] - quads[:, :, 0] 38 | temp = temp_quads.copy() 39 | temp_quads[:, 0] = temp[:, 1] 40 | temp_quads[:, 1] = temp[:, 0] 41 | temp_quads[:, 2] = temp[:, 3] 42 | temp_quads[:, 3] = temp[:, 2] 43 | return np.array(cv2.flip(np.uint8(image), 1), dtype=np.float), temp_quads, classes 44 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_hue.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | 5 | 6 | def random_hue( 7 | min_delta=-18, 8 | max_delta=18, 9 | p=0.5 10 | ): 11 | """ Changes the Hue of an image by adding/subtracting a delta value 12 | to/from each value in the Hue channel of the image. The image format 13 | is assumed to be BGR to match Opencv's standard. 14 | 15 | Args: 16 | - image: numpy array representing the input image. 17 | - bboxes: numpy array representing the bounding boxes.
18 | - classes: the list of classes associating with each bounding boxes. 19 | - min_delta: minimum delta value. 20 | - max_delta: maximum delta value. 21 | - p: The probability with which the contrast is changed 22 | 23 | Returns: 24 | - image: The modified image 25 | - bboxes: The unmodified bounding boxes 26 | - classes: The unmodified bounding boxes 27 | 28 | Raises: 29 | - min_delta is less than -360.0 30 | - max_delta is larger than 360.0 31 | - p is smaller than zero 32 | - p is larger than 1 33 | 34 | Webpage References: 35 | - https://www.telesens.co/2018/06/28/data-augmentation-in-ssd/ 36 | 37 | Code References: 38 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 39 | """ 40 | assert min_delta >= -360.0, "min_delta must be larger than -360.0" 41 | assert max_delta <= 360.0, "max_delta must be less than 360.0" 42 | assert p >= 0, "p must be larger than or equal to zero" 43 | assert p <= 1, "p must be less than or equal to 1" 44 | 45 | def _augment( 46 | image, 47 | bboxes=None, 48 | classes=None 49 | ): 50 | if (random.random() > p): 51 | return image, bboxes, classes 52 | 53 | temp_image = cv2.cvtColor(np.uint8(image), cv2.COLOR_BGR2HSV) 54 | temp_image = np.array(temp_image, dtype=np.float) 55 | d = random.uniform(min_delta, max_delta) 56 | temp_image[:, :, 0] += d 57 | temp_image = np.clip(temp_image, 0, 360) 58 | temp_image = cv2.cvtColor(np.uint8(temp_image), cv2.COLOR_HSV2BGR) 59 | temp_image = np.array(temp_image, dtype=np.float) 60 | return temp_image, bboxes, classes 61 | 62 | return _augment 63 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_lighting_noise.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | 5 | 6 | def random_lighting_noise(p=0.5): 7 | """ Changes the lighting of the image by randomly swapping the channels. 8 | The image format is assumed to be BGR to match Opencv's standard. 9 | 10 | Args: 11 | - image: numpy array representing the input image. 12 | - bboxes: numpy array representing the bounding boxes. 13 | - classes: the list of classes associating with each bounding boxes. 
14 | - p: The probability with which the contrast is changed 15 | 16 | Returns: 17 | - image: The modified image 18 | - bboxes: The unmodified bounding boxes 19 | - classes: The unmodified bounding boxes 20 | 21 | Raises: 22 | - p is smaller than zero 23 | - p is larger than 1 24 | 25 | Webpage References: 26 | - https://www.telesens.co/2018/06/28/data-augmentation-in-ssd/ 27 | 28 | Code References: 29 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 30 | """ 31 | assert p >= 0, "p must be larger than or equal to zero" 32 | assert p <= 1, "p must be less than or equal to 1" 33 | 34 | def _augment(image, bboxes=None, classes=None): 35 | if (random.random() > p): 36 | return image, bboxes, classes 37 | 38 | temp_image = image.copy() 39 | perms = [ 40 | (0, 1, 2), 41 | (0, 2, 1), 42 | (1, 0, 2), 43 | (1, 2, 0), 44 | (2, 0, 1), 45 | (2, 1, 0) 46 | ] 47 | selected_perm = random.randint(0, len(perms) - 1) 48 | perm = perms[selected_perm] 49 | temp_image = temp_image[:, :, perm] 50 | return temp_image, bboxes, classes 51 | 52 | return _augment 53 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_saturation.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | 5 | 6 | def random_saturation( 7 | min_delta=0.5, 8 | max_delta=1.5, 9 | p=0.5 10 | ): 11 | """ Changes the saturation of an image by increasing/decreasing each 12 | value in the saturation channel by a factor of delta. The image format 13 | is assumed to be BGR to match Opencv's standard. 14 | 15 | Args: 16 | - image: numpy array representing the input image. 17 | - bboxes: numpy array representing the bounding boxes. 18 | - classes: the list of classes associating with each bounding boxes. 19 | - min_delta: minimum delta value. 20 | - max_delta: maximum delta value. 
21 | 22 | Returns: 23 | - image: The modified image 24 | - bboxes: The unmodified bounding boxes 25 | - classes: The unmodified bounding boxes 26 | 27 | Raises: 28 | - min_delta is less than 0 29 | - max_delta is less than min_delta 30 | - p is smaller than zero 31 | - p is larger than 1 32 | 33 | Webpage References: 34 | - https://www.telesens.co/2018/06/28/data-augmentation-in-ssd/ 35 | 36 | Code References: 37 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 38 | 39 | """ 40 | assert min_delta >= 0.0, "min_delta must be larger than zero" 41 | assert max_delta >= min_delta, "max_delta must be larger than min_delta" 42 | assert p >= 0, "p must be larger than or equal to zero" 43 | assert p <= 1, "p must be less than or equal to 1" 44 | 45 | def _augment(image, bboxes=None, classes=None): 46 | if (random.random() > p): 47 | return image, bboxes, classes 48 | 49 | temp_image = cv2.cvtColor(np.uint8(image), cv2.COLOR_BGR2HSV) 50 | temp_image = np.array(temp_image, dtype=np.float) 51 | d = random.uniform(min_delta, max_delta) 52 | temp_image[:, :, 1] *= d 53 | temp_image = cv2.cvtColor(np.uint8(temp_image), cv2.COLOR_HSV2BGR) 54 | temp_image = np.array(temp_image, dtype=np.float) 55 | return temp_image, bboxes, classes 56 | 57 | return _augment 58 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_vertical_flip.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import random 4 | 5 | 6 | def random_vertical_flip( 7 | image, 8 | bboxes, 9 | classes, 10 | p=0.5 11 | ): 12 | """ Randomly flipped the image vertically. The image format is assumed to be BGR to match Opencv's standard. 13 | 14 | Args: 15 | - image: numpy array representing the input image. 16 | - bboxes: numpy array representing the bounding boxes. 17 | - classes: the list of classes associating with each bounding boxes. 
18 | - p: The probability with which the image is flipped vertically 19 | 20 | Returns: 21 | - image: The modified image 22 | - bboxes: The modified bounding boxes 23 | - classes: The unmodified classes 24 | 25 | Raises: 26 | - p is smaller than zero 27 | - p is larger than 1 28 | 29 | Webpage References: 30 | - https://www.kdnuggets.com/2018/09/data-augmentation-bounding-boxes-image-transforms.html/2 31 | 32 | Code References: 33 | - https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py 34 | """ 35 | 36 | assert p >= 0, "p must be larger than or equal to zero" 37 | assert p <= 1, "p must be less than or equal to 1" 38 | 39 | if (random.random() > p): 40 | return image, bboxes, classes 41 | 42 | temp_bboxes = bboxes.copy() 43 | image_center = np.array(image.shape[:2])[::-1]/2 44 | image_center = np.hstack((image_center, image_center)) 45 | temp_bboxes[:, [1, 3]] += 2*(image_center[[1, 3]] - temp_bboxes[:, [1, 3]]) 46 | boxes_height = abs(temp_bboxes[:, 1] - temp_bboxes[:, 3]) 47 | temp_bboxes[:, 1] -= boxes_height 48 | temp_bboxes[:, 3] += boxes_height 49 | return np.array(cv2.flip(np.uint8(image), 0), dtype=np.float), temp_bboxes, classes 50 | -------------------------------------------------------------------------------- /utils/augmentation_utils/random_vertical_flip_quad.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import random 4 | 5 | 6 | def random_vertical_flip_quad( 7 | image, 8 | quads, 9 | classes, 10 | p=0.5 11 | ): 12 | """ Randomly flips the image vertically. The image format is assumed to be BGR to match Opencv's standard. 13 | 14 | Args: 15 | - image: numpy array representing the input image. 16 | - quads: numpy array representing the quadrilaterals. 17 | - classes: the list of classes associated with each quadrilateral. 18 | - p: The probability with which the image is flipped vertically 19 | 20 | Returns: 21 | - image: The modified image 22 | - quads: The modified quadrilaterals 23 | - classes: The unmodified classes 24 | 25 | Raises: 26 | - p is smaller than zero 27 | - p is larger than 1 28 | """ 29 | 30 | assert p >= 0, "p must be larger than or equal to zero" 31 | assert p <= 1, "p must be less than or equal to 1" 32 | 33 | if (random.random() > p): 34 | return image, quads, classes 35 | 36 | temp_quads = quads.copy() 37 | temp_quads[:, :, 1] = image.shape[0] - quads[:, :, 1] 38 | temp = temp_quads.copy() 39 | temp_quads[:, 0] = temp[:, 3] 40 | temp_quads[:, 1] = temp[:, 2] 41 | temp_quads[:, 2] = temp[:, 1] 42 | temp_quads[:, 3] = temp[:, 0] 43 | return np.array(cv2.flip(np.uint8(image), 0), dtype=np.float), temp_quads, classes 44 | --------------------------------------------------------------------------------
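# Worked sketch for random_vertical_flip above: on a 100px-tall image, a box
# with ymin=10, ymax=30 lands at ymin=70, ymax=90 (x-coordinates unchanged):
import numpy as np
from utils.augmentation_utils import random_vertical_flip

image = np.zeros((100, 100, 3), dtype=np.float64)
bboxes = np.array([[20.0, 10.0, 40.0, 30.0]])
_, flipped_boxes, _ = random_vertical_flip(image, bboxes, ["text"], p=1.0)
print(flipped_boxes)  # [[20. 70. 40. 90.]]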
/utils/augmentation_utils/resize_to_fixed_size.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | 5 | 6 | def resize_to_fixed_size(width, height): 7 | """ Resize the input image and bounding boxes to a fixed size. 8 | 9 | Args: 10 | - image: numpy array representing the input image. 11 | - bboxes: numpy array representing the bounding boxes. 12 | - classes: the list of classes associated with each bounding box. 13 | - width: the target width in pixels. 14 | - height: the target height in pixels. 15 | 16 | Returns: 17 | - image: The resized image 18 | - bboxes: The rescaled bounding boxes 19 | - classes: The unmodified classes 20 | 21 | Raises: 22 | - width is not larger than 0 23 | - height is not larger than 0 24 | """ 25 | assert width > 0, "width must be larger than 0" 26 | assert height > 0, "height must be larger than 0" 27 | 28 | def _augment( 29 | image, 30 | bboxes, 31 | classes=None 32 | ): 33 | temp_image = np.uint8(image) 34 | o_height, o_width, _ = temp_image.shape 35 | height_scale, width_scale = height / o_height, width / o_width 36 | temp_image = cv2.resize(temp_image, (width, height)) 37 | temp_image = np.array(temp_image, dtype=np.float) 38 | temp_bboxes = bboxes.copy() 39 | temp_bboxes[:, [0, 2]] *= width_scale 40 | temp_bboxes[:, [1, 3]] *= height_scale 41 | temp_bboxes[:, [0, 2]] = np.clip(temp_bboxes[:, [0, 2]], 0, width) 42 | temp_bboxes[:, [1, 3]] = np.clip(temp_bboxes[:, [1, 3]], 0, height) 43 | 44 | return temp_image, temp_bboxes, classes 45 | 46 | return _augment 47 | -------------------------------------------------------------------------------- /utils/bbox_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .center_to_corner import center_to_corner 2 | from .corner_to_center import corner_to_center 3 | from .iou import iou 4 | from .object_coverage import object_coverage 5 | from .center_to_vertices import center_to_vertices -------------------------------------------------------------------------------- /utils/bbox_utils/center_to_corner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def center_to_corner(boxes): 5 | """ Convert bounding boxes from center format (cx, cy, width, height) to corner format (xmin, ymin, xmax, ymax) 6 | 7 | Args: 8 | - boxes: numpy array or tensor containing all the boxes to be converted 9 | 10 | Returns: 11 | - A numpy array or tensor of converted boxes 12 | """ 13 | temp = boxes.copy() 14 | temp[..., 0] = boxes[..., 0] - (boxes[..., 2] / 2) # xmin 15 | temp[..., 1] = boxes[..., 1] - (boxes[..., 3] / 2) # ymin 16 | temp[..., 2] = boxes[..., 0] + (boxes[..., 2] / 2) # xmax 17 | temp[..., 3] = boxes[..., 1] + (boxes[..., 3] / 2) # ymax 18 | return temp 19 | --------------------------------------------------------------------------------
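# Worked sketch: a box centered at (50, 40) with width 20 and height 10 maps
# to corners (40, 35, 60, 45); corner_to_center (below) inverts this:
import numpy as np
from utils.bbox_utils import center_to_corner

print(center_to_corner(np.array([[50.0, 40.0, 20.0, 10.0]])))  # [[40. 35. 60. 45.]]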
/utils/bbox_utils/center_to_vertices.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def center_to_vertices(boxes): 5 | """ Convert bounding boxes from center format (cx, cy, width, height) to vertices format (x1, y1, x2, y2, x3, y3, x4, y4) 6 | where (x1, y1) is the top left vertex. 7 | 8 | Args: 9 | - boxes: numpy array or tensor containing all the boxes to be converted 10 | 11 | Returns: 12 | - A numpy array of shape (n, 4, 2) 13 | """ 14 | temp = np.zeros((boxes.shape[0], 8)) 15 | half_width = boxes[..., 2] / 2 16 | half_height = boxes[..., 3] / 2 17 | temp[..., 0] = boxes[..., 0] - half_width 18 | temp[..., 1] = boxes[..., 1] - half_height 19 | temp[..., 2] = boxes[..., 0] + half_width 20 | temp[..., 3] = boxes[..., 1] - half_height 21 | temp[..., 4] = boxes[..., 0] + half_width 22 | temp[..., 5] = boxes[..., 1] + half_height 23 | temp[..., 6] = boxes[..., 0] - half_width 24 | temp[..., 7] = boxes[..., 1] + half_height 25 | return np.reshape(temp, (temp.shape[0], 4, 2)) 26 | -------------------------------------------------------------------------------- /utils/bbox_utils/corner_to_center.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def corner_to_center(boxes): 5 | """ Convert bounding boxes from corner format (xmin, ymin, xmax, ymax) to center format (cx, cy, width, height) 6 | 7 | Args: 8 | - boxes: numpy array or tensor containing all the boxes to be converted 9 | 10 | Returns: 11 | - A numpy array or tensor of converted boxes 12 | """ 13 | temp = boxes.copy() 14 | width = np.abs(boxes[..., 0] - boxes[..., 2]) 15 | height = np.abs(boxes[..., 1] - boxes[..., 3]) 16 | temp[..., 0] = boxes[..., 0] + (width / 2) # cx 17 | temp[..., 1] = boxes[..., 1] + (height / 2) # cy 18 | temp[..., 2] = width # width 19 | temp[..., 3] = height # height 20 | return temp 21 | -------------------------------------------------------------------------------- /utils/bbox_utils/iou.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def iou(box_group1, box_group2): 5 | """ Calculates the intersection over union (aka. Jaccard Index) between corresponding pairs of boxes. 6 | Boxes are assumed to be in corners format (xmin, ymin, xmax, ymax) 7 | 8 | Args: 9 | - box_group1: boxes in group 1 10 | - box_group2: boxes in group 2 11 | 12 | Returns: 13 | - A numpy array of shape (n,) where each value represents the iou between the corresponding boxes in box_group1 and box_group2 14 | 15 | Raises: 16 | - The shape of box_group1 and box_group2 are not the same. 17 | 18 | Code References: 19 | - https://stackoverflow.com/questions/28723670/intersection-over-union-between-two-detections/41660682 20 | """ 21 | assert box_group1.shape == box_group2.shape, "The two boxes array must be the same shape." 22 | xmin_intersect = np.maximum(box_group1[..., 0], box_group2[..., 0]) 23 | ymin_intersect = np.maximum(box_group1[..., 1], box_group2[..., 1]) 24 | xmax_intersect = np.minimum(box_group1[..., 2], box_group2[..., 2]) 25 | ymax_intersect = np.minimum(box_group1[..., 3], box_group2[..., 3]) 26 | 27 | intersect = (xmax_intersect - xmin_intersect) * (ymax_intersect - ymin_intersect) 28 | box_group1_area = (box_group1[..., 2] - box_group1[..., 0]) * (box_group1[..., 3] - box_group1[..., 1]) 29 | box_group2_area = (box_group2[..., 2] - box_group2[..., 0]) * (box_group2[..., 3] - box_group2[..., 1]) 30 | union = box_group1_area + box_group2_area - intersect 31 | res = intersect / union 32 | 33 | # set invalid ious to zeros 34 | res[xmax_intersect < xmin_intersect] = 0 35 | res[ymax_intersect < ymin_intersect] = 0 36 | res[res < 0] = 0 37 | res[res > 1] = 0 38 | return res 39 | --------------------------------------------------------------------------------
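# Worked sketch for the elementwise IoU above: two 10x10 boxes overlapping in
# a 5x5 patch give intersection 25 and union 100 + 100 - 25 = 175:
import numpy as np
from utils.bbox_utils import iou

b1 = np.array([[0.0, 0.0, 10.0, 10.0]])
b2 = np.array([[5.0, 5.0, 15.0, 15.0]])
print(iou(b1, b2))  # [0.14285714]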
/utils/bbox_utils/object_coverage.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def object_coverage(box_group1, box_group2): 5 | """ Calculates, for each pair of boxes, the fraction of the second box's area covered by the intersection of the two. 6 | Boxes are assumed to be in corners format (xmin, ymin, xmax, ymax). 7 | """ 8 | assert box_group1.shape == box_group2.shape, "The two boxes array must be the same shape." 9 | xmin_intersect = np.maximum(box_group1[..., 0], box_group2[..., 0]) 10 | ymin_intersect = np.maximum(box_group1[..., 1], box_group2[..., 1]) 11 | xmax_intersect = np.minimum(box_group1[..., 2], box_group2[..., 2]) 12 | ymax_intersect = np.minimum(box_group1[..., 3], box_group2[..., 3]) 13 | 14 | intersect = (xmax_intersect - xmin_intersect) * (ymax_intersect - ymin_intersect) 15 | box_group2_area = (box_group2[..., 2] - box_group2[..., 0]) * (box_group2[..., 3] - box_group2[..., 1]) 16 | res = intersect / box_group2_area 17 | 18 | # set invalid values to zeros 19 | res[xmax_intersect < xmin_intersect] = 0 20 | res[ymax_intersect < ymin_intersect] = 0 21 | res[res < 0] = 0 22 | res[res > 1] = 0 23 | return res 24 | -------------------------------------------------------------------------------- /utils/command_line_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .str2bool import str2bool 2 | -------------------------------------------------------------------------------- /utils/command_line_utils/str2bool.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def str2bool(v): 5 | if isinstance(v, bool): 6 | return v 7 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 8 | return True 9 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 10 | return False 11 | else: 12 | raise argparse.ArgumentTypeError('Boolean value expected.') 13 | -------------------------------------------------------------------------------- /utils/data_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .get_samples_from_split import get_samples_from_split 2 | from .coco_text import COCO_Text 3 | -------------------------------------------------------------------------------- /utils/data_utils/get_samples_from_split.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_samples_from_split(split_file, images_dir, labels_dir): 5 | """ Create a list of samples that can be fed to a data generator. 6 | 7 | Args: 8 | - split_file: Path to the dataset's split file. (e.g. train.txt, val.txt) 9 | - images_dir: Path to images directory.
10 | - labels_dir: Path to labels directory. 11 | 12 | Returns: 13 | - A list of samples. Each sample is a string containing paths to both the image file and its corresponding label file separated by a space. 14 | 15 | Raises: 16 | - split_file does not exist. 17 | - images_dir is not a directory. 18 | - labels_dir is not a directory. 19 | """ 20 | assert os.path.isfile(split_file), "split_file does not exist." 21 | assert os.path.isdir(images_dir), "images_dir is not a directory." 22 | assert os.path.isdir(labels_dir), "labels_dir is not a directory." 23 | 24 | samples = [] 25 | with open(split_file, "r") as split_file: 26 | lines = split_file.readlines() 27 | for line in lines: 28 | cols = line.split(" ") 29 | image_filename = cols[0] 30 | label_filename = cols[1] 31 | image_file = os.path.join(images_dir, image_filename) 32 | label_file = os.path.join(labels_dir, label_filename) 33 | sample = f"{image_file} {label_file}" 34 | samples.append(sample) 35 | return samples 36 | -------------------------------------------------------------------------------- /utils/display_tbpp_data_sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | import argparse 5 | import numpy as np 6 | from glob import glob 7 | from xml.dom import minidom 8 | import xml.etree.cElementTree as ET 9 | from pycocotools.coco import COCO 10 | from textboxes_utils import read_sample 11 | 12 | parser = argparse.ArgumentParser(description='Displays a sample') 13 | parser.add_argument('image', type=str, help='path to image file.') 14 | parser.add_argument('label', type=str, help='path to label file.') 15 | args = parser.parse_args() 16 | 17 | print("loading image file") 18 | 19 | image, quads = read_sample(args.image, args.label) 20 | image = np.uint8(image) 21 | 22 | for quad in quads: 23 | cv2.polylines( 24 | image, 25 | [np.reshape(np.array(quad, dtype=int), (-1, 2))], 26 | True, 27 | (0, 255, 0), 28 | 1 29 | ) 30 | 31 | cv2.imshow("image", image) 32 | if cv2.waitKey(0) == ord('q'): 33 | cv2.destroyAllWindows() 34 | -------------------------------------------------------------------------------- /utils/inference_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .ssd_mobilenetv2 import ssd_mobilenetv2 2 | from .ssd_mobilenetv1 import ssd_mobilenetv1 3 | from .ssd_vgg16 import ssd_vgg16 4 | from .tbpp_vgg16 import tbpp_vgg16 5 | -------------------------------------------------------------------------------- /utils/inference_utils/ssd_mobilenetv1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from networks import SSD_MOBILENET 4 | from tensorflow.keras.applications import mobilenet 5 | from utils import ssd_utils 6 | 7 | 8 | def ssd_mobilenetv1(config, args): 9 | assert args.label_maps is not None, "please specify a label map file" 10 | assert os.path.exists(args.label_maps), "label_maps file does not exist" 11 | with open(args.label_maps, "r") as file: 12 | label_maps = [line.strip("\n") for line in file.readlines()] 13 | 14 | model = SSD_MOBILENET( 15 | config, 16 | label_maps, 17 | is_training=False, 18 | num_predictions=args.num_predictions) 19 | process_input_fn = mobilenet.preprocess_input 20 | return model, process_input_fn, label_maps 21 | -------------------------------------------------------------------------------- /utils/inference_utils/ssd_mobilenetv2.py:
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from networks import SSD_MOBILENETV2 4 | from tensorflow.keras.applications import mobilenet_v2 5 | from utils import ssd_utils 6 | 7 | 8 | def ssd_mobilenetv2(config, args): 9 | assert args.label_maps is not None, "please specify a label map file" 10 | assert os.path.exists(args.label_maps), "label_maps file does not exist" 11 | with open(args.label_maps, "r") as file: 12 | label_maps = [line.strip("\n") for line in file.readlines()] 13 | 14 | model = SSD_MOBILENETV2( 15 | config, 16 | label_maps, 17 | is_training=False, 18 | num_predictions=args.num_predictions 19 | ) 20 | process_input_fn = mobilenet_v2.preprocess_input 21 | 22 | return model, process_input_fn, label_maps 23 | -------------------------------------------------------------------------------- /utils/inference_utils/ssd_vgg16.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from networks import SSD_VGG16 5 | from tensorflow.keras.applications import vgg16 6 | from utils import ssd_utils 7 | 8 | 9 | def ssd_vgg16(config, args): 10 | assert args.label_maps is not None, "please specify a label map file" 11 | assert os.path.exists(args.label_maps), "label_maps file does not exist" 12 | with open(args.label_maps, "r") as file: 13 | label_maps = [line.strip("\n") for line in file.readlines()] 14 | 15 | model = SSD_VGG16( 16 | config, 17 | label_maps, 18 | is_training=False, 19 | num_predictions=args.num_predictions 20 | ) 21 | process_input_fn = vgg16.preprocess_input 22 | 23 | return model, process_input_fn, label_maps 24 | -------------------------------------------------------------------------------- /utils/inference_utils/tbpp_vgg16.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from networks import TBPP_VGG16 4 | from tensorflow.keras.applications import vgg16 5 | from utils import textboxes_utils 6 | 7 | 8 | def tbpp_vgg16(config, args): 9 | model = TBPP_VGG16( 10 | config, 11 | is_training=False, 12 | num_predictions=args.num_predictions) 13 | process_input_fn = vgg16.preprocess_input 14 | 15 | return model, process_input_fn, ["text"] 16 | -------------------------------------------------------------------------------- /utils/one_hot_class_label.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def one_hot_class_label(classname, label_maps): 5 | """ Turn classname to one hot encoded label. 
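For example (illustrative values): with label_maps = ["aeroplane", "bicycle", "bird"], classname "bicycle" yields [0, 1, 0].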
6 | 7 | Args: 8 | - classname: String representing the classname 9 | - label_maps: A list of strings containing all the classes 10 | 11 | Returns: 12 | - A numpy array of shape (len(label_maps), ) 13 | 14 | Raises: 15 | - AssertionError: classname is not included in label_maps 16 | """ 17 | assert classname in label_maps, "classname must be included in label maps" 18 | temp = np.zeros((len(label_maps)), dtype=int) 19 | temp[label_maps.index(classname)] = 1 20 | return temp 21 | -------------------------------------------------------------------------------- /utils/pascal_voc_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .read_label import read_label 2 | -------------------------------------------------------------------------------- /utils/pascal_voc_utils/read_label.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | 4 | 5 | def read_label(label_path): 6 | assert os.path.exists(label_path), "Label file does not exist." 7 | 8 | xml_root = ET.parse(label_path).getroot() 9 | objects = xml_root.findall("object") 10 | bboxes, classes = [], [] 11 | for obj in objects: 12 | name = obj.find("name").text 13 | bndbox = obj.find("bndbox") 14 | # use float() because some bndbox values in the dataset are floats 15 | xmin = float(bndbox.find("xmin").text) 16 | ymin = float(bndbox.find("ymin").text) 17 | xmax = float(bndbox.find("xmax").text) 18 | ymax = float(bndbox.find("ymax").text) 19 | bboxes.append([xmin, ymin, xmax, ymax]) 20 | classes.append(name) 21 | 22 | return bboxes, classes 23 | -------------------------------------------------------------------------------- /utils/prepare_coco_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | from glob import glob 5 | from xml.dom import minidom 6 | import xml.etree.cElementTree as ET 7 | from pycocotools.coco import COCO 8 | 9 | parser = argparse.ArgumentParser(description='Converts the coco dataset to a format suitable for training ssd with this repo.') 10 | parser.add_argument('annotations_file', type=str, help='path to annotations file.') 11 | parser.add_argument('images_dir', type=str, help='path to images dir.') 12 | parser.add_argument('output_dir', type=str, help='path to output dir.') 13 | args = parser.parse_args() 14 | 15 | assert os.path.exists(args.annotations_file), "annotations_file does not exist" 16 | assert os.path.exists(args.images_dir), "images_dir does not exist" 17 | if not os.path.exists(args.output_dir): 18 | os.makedirs(args.output_dir) 19 | 20 | coco = COCO(args.annotations_file) 21 | categories = coco.cats 22 | 23 | print("writing label maps to label_maps.txt") 24 | with open(os.path.join(args.output_dir, "label_maps.txt"), "w+") as label_maps_file: 25 | for cat_id in categories: 26 | label_maps_file.write(f"{categories[cat_id]['name']}\n") 27 | print("-- done") 28 | 29 | num_samples = 0 30 | print("-- converting coco annotations to xml files") 31 | with open(os.path.join(args.output_dir, "split.txt"), "w+") as split_file: 32 | images_ids = list(coco.imgs.keys()) 33 | num_images = len(images_ids) 34 | for i, image_id in enumerate(images_ids): 35 | print(f"-- image {i+1}/{num_images}") 36 | annotations = coco.loadAnns(coco.getAnnIds([image_id])) 37 | image_info = coco.loadImgs([image_id])[0] 38 | image_filename = image_info["file_name"] 39 | if len(annotations) == 0: 40 |
print(f"\n---- skipped: {image_filename}\n") 41 | continue 42 | xml_root = ET.Element("annotation") 43 | xml_filename = ET.SubElement(xml_root, "filename").text = image_filename 44 | xml_size = ET.SubElement(xml_root, "size") 45 | xml_size_width = ET.SubElement(xml_size, "width").text = str(image_info["width"]) 46 | xml_size_height = ET.SubElement(xml_size, "height").text = str(image_info["height"]) 47 | xml_size_depth = ET.SubElement(xml_size, "depth").text = str(3) 48 | for annotation in annotations: 49 | category_id = annotation['category_id'] 50 | bbox = annotation['bbox'] 51 | label = coco.cats[category_id]["name"] 52 | xml_object = ET.SubElement(xml_root, "object") 53 | xml_object_name = ET.SubElement(xml_object, "name").text = label 54 | xml_object_bndbox = ET.SubElement(xml_object, "bndbox") 55 | xml_object_bndbox_xmin = ET.SubElement(xml_object_bndbox, "xmin").text = str(bbox[0]) 56 | xml_object_bndbox_ymin = ET.SubElement(xml_object_bndbox, "ymin").text = str(bbox[1]) 57 | xml_object_bndbox_xmax = ET.SubElement(xml_object_bndbox, "xmax").text = str(bbox[0] + bbox[2]) 58 | xml_object_bndbox_ymax = ET.SubElement(xml_object_bndbox, "ymax").text = str(bbox[1] + bbox[3]) 59 | xml_tree = ET.ElementTree(xml_root) 60 | xml_file_name = f"{image_filename[:image_filename.index('.')]}.xml" 61 | with open(os.path.join(args.output_dir, xml_file_name), "wb+") as xml_file: 62 | xml_tree.write(xml_file) 63 | split_file.write(f"{image_filename} {xml_file_name}\n") 64 | num_samples += 1 65 | print("-- done") 66 | print(f"num_samples: {num_samples}") 67 | print(f"split_file lines: {len(split_file.readlines())}") 68 | print(f"num files in annotations folder: {len(list(glob(os.path.join(args.output_dir, '*.xml'))))}") 69 | -------------------------------------------------------------------------------- /utils/prepare_cocotextv2_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | from glob import glob 5 | import json 6 | from xml.dom import minidom 7 | import xml.etree.cElementTree as ET 8 | from data_utils import COCO_Text 9 | import numpy as np 10 | import shutil 11 | 12 | parser = argparse.ArgumentParser(description='Converts the coco dataset to a format suitable for training ssd with this repo.') 13 | parser.add_argument('annotations_file', type=str, help='path to annotations file.') 14 | parser.add_argument('images_dir', type=str, help='path to images dir.') 15 | parser.add_argument('output_dir', type=str, help='path to output dir.') 16 | args = parser.parse_args() 17 | 18 | assert os.path.exists(args.annotations_file), "annotations_file does not exist" 19 | assert os.path.exists(args.images_dir), "images_dir does not exist" 20 | val_dir = os.path.join(os.path.join(args.output_dir, "val")) 21 | train_dir = os.path.join(os.path.join(args.output_dir, "train")) 22 | os.makedirs(os.path.join(val_dir, "images"), exist_ok=True) 23 | os.makedirs(os.path.join(val_dir, "labels"), exist_ok=True) 24 | os.makedirs(os.path.join(train_dir, "images"), exist_ok=True) 25 | os.makedirs(os.path.join(train_dir, "labels"), exist_ok=True) 26 | 27 | coco = COCO_Text(annotation_file=args.annotations_file) 28 | 29 | print("-- copying images for validation sets") 30 | with open(os.path.join(args.output_dir, "test.txt"), "w") as test_file: 31 | for i, image_id in enumerate(coco.val): 32 | print(f"image {i+1} / {len(coco.val)}") 33 | annotations = coco.loadAnns(coco.getAnnIds([image_id])) 34 | image_info = 
coco.loadImgs([image_id])[0] 35 | image_filename = image_info["file_name"] 36 | 37 | if len(annotations) == 0: 38 | continue 39 | 40 | filter_annotations = [] 41 | 42 | for annotation in annotations: 43 | quad = annotation["polygon"] 44 | if len(quad) != 8: 45 | continue 46 | filter_annotations.append(annotation) 47 | 48 | if len(filter_annotations) == 0: 49 | continue 50 | 51 | shutil.copy( 52 | os.path.join(args.images_dir, image_filename), 53 | os.path.join(os.path.join(val_dir, "images"), image_filename) 54 | ) 55 | 56 | label_file_name = f"{image_filename[:image_filename.index('.')]}.txt" 57 | 58 | with open(os.path.join(os.path.join(val_dir, "labels"), label_file_name), "w") as label_file: 59 | for annotation in filter_annotations: 60 | quad = annotation["polygon"] 61 | try: 62 | text = annotation["utf8_string"] 63 | except: 64 | text = "###" 65 | for num in quad: 66 | label_file.write(f"{float(num)},") 67 | label_file.write(f"{text}\n") 68 | 69 | test_file.write(f"{image_filename} {label_file_name}\n") 70 | 71 | print("-- copying images for training sets") 72 | with open(os.path.join(args.output_dir, "train.txt"), "w") as train_file: 73 | for i, image_id in enumerate(coco.train): 74 | print(f"image {i+1} / {len(coco.train)}") 75 | annotations = coco.loadAnns(coco.getAnnIds([image_id])) 76 | image_info = coco.loadImgs([image_id])[0] 77 | image_filename = image_info["file_name"] 78 | 79 | if len(annotations) == 0: 80 | continue 81 | 82 | filter_annotations = [] 83 | 84 | for annotation in annotations: 85 | quad = annotation["polygon"] 86 | if len(quad) != 8: 87 | continue 88 | filter_annotations.append(annotation) 89 | 90 | if len(filter_annotations) == 0: 91 | continue 92 | 93 | shutil.copy( 94 | os.path.join(args.images_dir, image_filename), 95 | os.path.join(os.path.join(train_dir, "images"), image_filename) 96 | ) 97 | 98 | label_file_name = f"{image_filename[:image_filename.index('.')]}.txt" 99 | 100 | with open(os.path.join(os.path.join(train_dir, "labels"), label_file_name), "w") as label_file: 101 | for annotation in filter_annotations: 102 | quad = annotation["polygon"] 103 | try: 104 | text = annotation["utf8_string"] 105 | except: 106 | text = "###" 107 | for num in quad: 108 | label_file.write(f"{float(num)},") 109 | label_file.write(f"{text}\n") 110 | 111 | train_file.write(f"{image_filename} {label_file_name}\n") 112 | -------------------------------------------------------------------------------- /utils/prepare_icdar-2013_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | from glob import glob 5 | import json 6 | from xml.dom import minidom 7 | import xml.etree.cElementTree as ET 8 | from data_utils import COCO_Text 9 | import numpy as np 10 | import shutil 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Converts the icdar 2013 dataset to a format suitable for training tbpp with this repo.') 15 | parser.add_argument('dataset_dir', type=str, help='path to dataset dir.') 16 | parser.add_argument('output_dir', type=str, help='path to output dir.') 17 | args = parser.parse_args() 18 | 19 | assert os.path.exists(args.dataset_dir), "dataset_dir does not exist" 20 | testing_dir = os.path.join(os.path.join(args.output_dir, "test")) 21 | train_dir = os.path.join(os.path.join(args.output_dir, "train")) 22 | os.makedirs(os.path.join(testing_dir, "images"), exist_ok=True) 23 | os.makedirs(os.path.join(testing_dir, "labels"), exist_ok=True) 24 | 
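# Note on the output layout (inferred from the makedirs calls at this point):
# the converted dataset ends up as <output_dir>/train/{images,labels} and
# <output_dir>/test/{images,labels}, with train.txt / test.txt split files
# written at the top level of output_dir by the loops below.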
os.makedirs(os.path.join(train_dir, "images"), exist_ok=True) 25 | os.makedirs(os.path.join(train_dir, "labels"), exist_ok=True) 26 | 27 | print("-- copy images for training sets") 28 | training_images = sorted(list(glob(os.path.join(args.dataset_dir, os.path.join("Challenge2_Training_Task12_Images", "*.jpg"))))) 29 | test_images = sorted(list(glob(os.path.join(args.dataset_dir, os.path.join("Challenge2_Test_Task12_Images", "*.jpg"))))) 30 | 31 | with open(os.path.join(args.output_dir, "train.txt"), "w") as train_split: 32 | for i, train_image in enumerate(training_images): 33 | print(f"image {i+1}/{len(training_images)}") 34 | image_filename = os.path.basename(train_image) 35 | label_filename = f"gt_{image_filename[:image_filename.index('.')]}.txt" 36 | shutil.copy( 37 | os.path.join(os.path.join(args.dataset_dir, "Challenge2_Training_Task12_Images"), image_filename), 38 | os.path.join(os.path.join(train_dir, "images"), image_filename) 39 | ) 40 | with open(os.path.join(os.path.join(args.dataset_dir, "Challenge2_Training_Task1_GT"), label_filename), "r") as label_file: 41 | quads = label_file.readlines() 42 | with open(os.path.join(os.path.join(train_dir, "labels"), label_filename), "w") as output_label_file: 43 | for quad in quads: 44 | quad = quad.strip("\n") 45 | quad = quad.split(" ") 46 | quad[-1] = quad[-1][1:-1] 47 | quad = [i.strip(",") for i in quad] 48 | quad[:4] = [float(i) for i in quad[:4]] 49 | w = abs(quad[0] - quad[2]) 50 | h = abs(quad[1] - quad[3]) 51 | x1 = quad[0] 52 | y1 = quad[1] 53 | x2 = quad[0] + w 54 | y2 = quad[1] 55 | x3 = quad[0] + w 56 | y3 = quad[1] + h 57 | x4 = quad[0] 58 | y4 = quad[1] + h 59 | output_label_file.write(f"{x1},{y1},{x2},{y2},{x3},{y3},{x4},{y4},{quad[-1]}\n") 60 | train_split.write(f"{image_filename} {label_filename}\n") 61 | 62 | with open(os.path.join(args.output_dir, "test.txt"), "w") as test_split: 63 | for i, test_image in enumerate(test_images): 64 | print(f"image {i+1}/{len(test_images)}") 65 | image_filename = os.path.basename(test_image) 66 | label_filename = f"gt_{image_filename[:image_filename.index('.')]}.txt" 67 | shutil.copy( 68 | os.path.join(os.path.join(args.dataset_dir, "Challenge2_Test_Task12_Images"), image_filename), 69 | os.path.join(os.path.join(testing_dir, "images"), image_filename) 70 | ) 71 | with open(os.path.join(os.path.join(args.dataset_dir, "Challenge2_Test_Task1_GT"), label_filename), "r") as label_file: 72 | quads = label_file.readlines() 73 | with open(os.path.join(os.path.join(testing_dir, "labels"), label_filename), "w") as output_label_file: 74 | for quad in quads: 75 | quad = quad.strip("\n") 76 | quad = quad.split(" ") 77 | quad[-1] = quad[-1][1:-1] 78 | quad = [i.strip(",") for i in quad] 79 | quad[:4] = [float(i) for i in quad[:4]] 80 | w = abs(quad[0] - quad[2]) 81 | h = abs(quad[1] - quad[3]) 82 | x1 = quad[0] 83 | y1 = quad[1] 84 | x2 = quad[0] + w 85 | y2 = quad[1] 86 | x3 = quad[0] + w 87 | y3 = quad[1] + h 88 | x4 = quad[0] 89 | y4 = quad[1] + h 90 | output_label_file.write(f"{x1},{y1},{x2},{y2},{x3},{y3},{x4},{y4},{quad[-1]}\n") 91 | test_split.write(f"{image_filename} {label_filename}\n") 92 | -------------------------------------------------------------------------------- /utils/prepare_icdar-2015_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | from glob import glob 5 | import json 6 | from xml.dom import minidom 7 | import xml.etree.cElementTree as ET 8 | from data_utils import COCO_Text 9 | 
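# Note: unlike the ICDAR 2013 script above, ICDAR 2015 ground-truth files already
# store 8-coordinate quadrilaterals (x1,y1,...,x4,y4,transcription), so this script
# copies the label files verbatim instead of rewriting rectangles as quads.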
import numpy as np 10 | import shutil 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Converts the icdar 2015 dataset to a format suitable for training tbpp with this repo.') 15 | parser.add_argument('dataset_dir', type=str, help='path to dataset dir.') 16 | parser.add_argument('output_dir', type=str, help='path to output dir.') 17 | args = parser.parse_args() 18 | 19 | assert os.path.exists(args.dataset_dir), "dataset_dir does not exist" 20 | testing_dir = os.path.join(os.path.join(args.output_dir, "test")) 21 | train_dir = os.path.join(os.path.join(args.output_dir, "train")) 22 | os.makedirs(os.path.join(testing_dir, "images"), exist_ok=True) 23 | os.makedirs(os.path.join(testing_dir, "labels"), exist_ok=True) 24 | os.makedirs(os.path.join(train_dir, "images"), exist_ok=True) 25 | os.makedirs(os.path.join(train_dir, "labels"), exist_ok=True) 26 | 27 | print("-- copy images for training sets") 28 | training_images = sorted(list(glob(os.path.join(args.dataset_dir, os.path.join("ch4_training_images", "*.jpg"))))) 29 | test_images = sorted(list(glob(os.path.join(args.dataset_dir, os.path.join("ch4_test_images", "*.jpg"))))) 30 | 31 | with open(os.path.join(args.output_dir, "train.txt"), "w") as train_split: 32 | for i, train_image in enumerate(training_images): 33 | print(f"image {i+1}/{len(training_images)}") 34 | image_filename = os.path.basename(train_image) 35 | label_filename = f"gt_{image_filename[:image_filename.index('.')]}.txt" 36 | shutil.copy( 37 | os.path.join(os.path.join(args.dataset_dir, "ch4_training_images"), image_filename), 38 | os.path.join(os.path.join(train_dir, "images"), image_filename) 39 | ) 40 | shutil.copy( 41 | os.path.join(os.path.join(args.dataset_dir, "ch4_training_localization_transcription_gt"), label_filename), 42 | os.path.join(os.path.join(train_dir, "labels"), label_filename) 43 | ) 44 | train_split.write(f"{image_filename} {label_filename}\n") 45 | 46 | with open(os.path.join(args.output_dir, "test.txt"), "w") as test_split: 47 | for i, test_image in enumerate(test_images): 48 | print(f"image {i+1}/{len(test_images)}") 49 | image_filename = os.path.basename(test_image) 50 | label_filename = f"gt_{image_filename[:image_filename.index('.')]}.txt" 51 | shutil.copy( 52 | os.path.join(os.path.join(args.dataset_dir, "ch4_test_images"), image_filename), 53 | os.path.join(os.path.join(testing_dir, "images"), image_filename) 54 | ) 55 | shutil.copy( 56 | os.path.join(os.path.join(args.dataset_dir, "Challenge4_Test_Task1_GT"), label_filename), 57 | os.path.join(os.path.join(testing_dir, "labels"), label_filename) 58 | ) 59 | test_split.write(f"{image_filename} {label_filename}\n") 60 | -------------------------------------------------------------------------------- /utils/prepare_pascal-voc-2007-2012_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | from glob import glob 5 | import json 6 | from xml.dom import minidom 7 | import xml.etree.cElementTree as ET 8 | from data_utils import COCO_Text 9 | import numpy as np 10 | import shutil 11 | import os 12 | 13 | 14 | def str2bool(v): 15 | if isinstance(v, bool): 16 | return v 17 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 18 | return True 19 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 20 | return False 21 | else: 22 | raise argparse.ArgumentTypeError('Boolean value expected.') 23 | 24 | 25 | parser = argparse.ArgumentParser( 26 | description='Converts the Pascal VOC 2007 and 2012 
dataset to a format suitable for training ssd with this repo.') 27 | parser.add_argument('dataset_dir', type=str, help='path to dataset dir.') 28 | parser.add_argument('output_dir', type=str, help='path to output dir.') 29 | args = parser.parse_args() 30 | 31 | assert os.path.exists(args.dataset_dir), "dataset_dir does not exist" 32 | out_images_dir = os.path.join(args.output_dir, "images") 33 | out_labels_dir = os.path.join(args.output_dir, "labels") 34 | os.makedirs(out_images_dir, exist_ok=True) 35 | os.makedirs(out_labels_dir, exist_ok=True) 36 | 37 | 38 | datasets = ["VOC2007", "VOC2012"] 39 | train_samples, val_samples, trainval_samples, test_samples = [], [], [], [] 40 | for dataset in datasets: 41 | print(f"-- gather data from: {dataset}") 42 | dataset_dir = os.path.abspath(args.dataset_dir) 43 | dataset_dir = os.path.join(dataset_dir, dataset) 44 | images_dir = os.path.join(dataset_dir, "JPEGImages") 45 | labels_dir = os.path.join(dataset_dir, "Annotations") 46 | 47 | print(f"---- copy images") 48 | for image in list(glob(os.path.join(images_dir, "*jpg"))): 49 | destination_filename = os.path.basename(image) 50 | if dataset == "VOC2007": 51 | destination_filename = f"2007_{destination_filename}" 52 | dest = os.path.join(out_images_dir, destination_filename) 53 | shutil.copy(image, dest) 54 | 55 | print(f"---- copy labels") 56 | for label in list(glob(os.path.join(labels_dir, "*xml"))): 57 | destination_filename = os.path.basename(label) 58 | if dataset == "VOC2007": 59 | destination_filename = f"2007_{destination_filename}" 60 | dest = os.path.join(out_labels_dir, destination_filename) 61 | shutil.copy(label, dest) 62 | 63 | train_split = os.path.join(dataset_dir, "ImageSets/Main/train.txt") 64 | val_split = os.path.join(dataset_dir, "ImageSets/Main/val.txt") 65 | trainval_split = os.path.join(dataset_dir, "ImageSets/Main/trainval.txt") 66 | 67 | # train split 68 | print(f"---- gather train samples") 69 | with open(train_split, "r") as train_file: 70 | samples = train_file.readlines() 71 | for sample in samples: 72 | if dataset == "VOC2007": 73 | sample = "2007_" + sample.strip("\n") 74 | else: 75 | sample = sample.strip("\n") 76 | if sample not in train_samples: 77 | train_samples.append(sample) 78 | 79 | # val split 80 | print(f"---- gather val samples") 81 | with open(val_split, "r") as val_file: 82 | samples = val_file.readlines() 83 | for sample in samples: 84 | if dataset == "VOC2007": 85 | sample = "2007_" + sample.strip("\n") 86 | else: 87 | sample = sample.strip("\n") 88 | if sample not in val_samples: 89 | val_samples.append(sample) 90 | 91 | # trainval split 92 | print(f"---- gather trainval samples") 93 | with open(trainval_split, "r") as trainval_file: 94 | samples = trainval_file.readlines() 95 | for sample in samples: 96 | if dataset == "VOC2007": 97 | sample = "2007_" + sample.strip("\n") 98 | else: 99 | sample = sample.strip("\n") 100 | if sample not in trainval_samples: 101 | trainval_samples.append(sample) 102 | 103 | if dataset == "VOC2007": 104 | print(f"---- gather test samples") 105 | with open(os.path.join(dataset_dir, "ImageSets/Main/test.txt"), "r") as test_file: 106 | samples = test_file.readlines() 107 | for sample in samples: 108 | if dataset == "VOC2007": 109 | sample = "2007_" + sample.strip("\n") 110 | else: 111 | sample = sample.strip("\n") 112 | if sample not in test_samples: 113 | test_samples.append(sample) 114 | 115 | 116 | def save_samples_to_split(s, name): 117 | with open(os.path.join(args.output_dir, name), "w") as outfile: 118 | for i in
s: 119 | outfile.write(f"{i}.jpg {i}.xml\n") 120 | 121 | 122 | print(f"-- num_train: {len(train_samples)}") 123 | save_samples_to_split(train_samples, "train.txt") 124 | print(f"-- num_val: {len(val_samples)}") 125 | save_samples_to_split(val_samples, "val.txt") 126 | print(f"-- num_trainval: {len(trainval_samples)}") 127 | save_samples_to_split(trainval_samples, "trainval.txt") 128 | print(f"-- num_test: {len(test_samples)}") 129 | save_samples_to_split(test_samples, "test.txt") 130 | 131 | print(f"-- writing label_maps.txt") 132 | dataset_dir = os.path.abspath(args.dataset_dir) 133 | dataset_dir = os.path.join(dataset_dir, "VOC2007") 134 | 135 | with open(os.path.join(args.output_dir, "label_maps.txt"), "w") as label_maps_file: 136 | labels = list( 137 | glob(os.path.join(dataset_dir, "ImageSets/Main/*_train.txt"))) 138 | labels = [os.path.basename(i) for i in labels] 139 | labels = sorted([i[:i.index("_")] for i in labels]) 140 | for classname in labels: 141 | label_maps_file.write(f"{classname}\n") 142 | -------------------------------------------------------------------------------- /utils/prepare_pascal_voc_2007_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | from glob import glob 5 | import json 6 | from xml.dom import minidom 7 | import xml.etree.cElementTree as ET 8 | from data_utils import COCO_Text 9 | import numpy as np 10 | import shutil 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser( 15 | description='Converts the Pascal VOC 2007 dataset to a format suitable for training ssd with this repo.') 16 | parser.add_argument('dataset_dir', type=str, help='path to dataset dir.') 17 | parser.add_argument('output_dir', type=str, help='path to output dir.') 18 | args = parser.parse_args() 19 | 20 | assert os.path.exists(args.dataset_dir), "dataset_dir does not exist" 21 | images_dir = os.path.join(args.dataset_dir, "JPEGImages") 22 | labels_dir = os.path.join(args.dataset_dir, "Annotations") 23 | out_images_dir = os.path.join(args.output_dir, "images") 24 | out_labels_dir = os.path.join(args.output_dir, "labels") 25 | os.makedirs(out_images_dir, exist_ok=True) 26 | os.makedirs(out_labels_dir, exist_ok=True) 27 | 28 | print(f"-- creating split files") 29 | print(f"---- train.txt") 30 | with open(os.path.join(args.output_dir, "train.txt"), "w") as train_split_file: 31 | with open(os.path.join(args.dataset_dir, "ImageSets/Main/train.txt"), "r") as train_file: 32 | samples = train_file.readlines() 33 | for i, sample in enumerate(samples): 34 | sample = sample.strip("\n") 35 | train_split_file.write(f"{sample}.jpg {sample}.xml\n") 36 | 37 | print(f"---- val.txt") 38 | with open(os.path.join(args.output_dir, "val.txt"), "w") as val_split_file: 39 | with open(os.path.join(args.dataset_dir, "ImageSets/Main/val.txt"), "r") as val_file: 40 | samples = val_file.readlines() 41 | for sample in samples: 42 | sample = sample.strip("\n") 43 | val_split_file.write(f"{sample}.jpg {sample}.xml\n") 44 | 45 | print(f"---- test.txt") 46 | with open(os.path.join(args.output_dir, "test.txt"), "w") as val_split_file: 47 | with open(os.path.join(args.dataset_dir, "ImageSets/Main/test.txt"), "r") as val_file: 48 | samples = val_file.readlines() 49 | for sample in samples: 50 | sample = sample.strip("\n") 51 | val_split_file.write(f"{sample}.jpg {sample}.xml\n") 52 | 53 | print(f"---- trainval.txt") 54 | with open(os.path.join(args.output_dir, "split.txt"), "w") as trainval_split_file: 55 | with 
open(os.path.join(args.dataset_dir, "ImageSets/Main/trainval.txt"), "r") as trainval_file: 56 | samples = trainval_file.readlines() 57 | for sample in samples: 58 | sample = sample.strip("\n") 59 | trainval_split_file.write(f"{sample}.jpg {sample}.xml\n") 60 | 61 | print(f"-- copying images") 62 | for i, sample in enumerate(list(glob(os.path.join(images_dir, "*jpg")))): 63 | filename = os.path.basename(sample) 64 | shutil.copy( 65 | sample, 66 | os.path.join(out_images_dir, filename) 67 | ) 68 | 69 | print(f"-- copying labels") 70 | for i, sample in enumerate(list(glob(os.path.join(labels_dir, "*xml")))): 71 | filename = os.path.basename(sample) 72 | shutil.copy( 73 | sample, 74 | os.path.join(out_labels_dir, filename) 75 | ) 76 | 77 | print(f"-- writing label_maps.txt") 78 | with open(os.path.join(args.output_dir, "label_maps.txt"), "w") as label_maps_file: 79 | labels = list( 80 | glob(os.path.join(args.dataset_dir, "ImageSets/Main/*_train.txt"))) 81 | labels = [os.path.basename(i) for i in labels] 82 | labels = sorted([i[:i.index("_")] for i in labels]) 83 | for classname in labels: 84 | label_maps_file.write(f"{classname}\n") 85 | -------------------------------------------------------------------------------- /utils/prepare_pascal_voc_2012_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | from glob import glob 5 | import json 6 | from xml.dom import minidom 7 | import xml.etree.cElementTree as ET 8 | from data_utils import COCO_Text 9 | import numpy as np 10 | import shutil 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser( 15 | description='Converts the Pascal VOC 2012 dataset to a format suitable for training ssd with this repo.') 16 | parser.add_argument('dataset_dir', type=str, help='path to dataset dir.') 17 | parser.add_argument('output_dir', type=str, help='path to output dir.') 18 | args = parser.parse_args() 19 | 20 | assert os.path.exists(args.dataset_dir), "dataset_dir does not exist" 21 | images_dir = os.path.join(args.dataset_dir, "JPEGImages") 22 | labels_dir = os.path.join(args.dataset_dir, "Annotations") 23 | out_images_dir = os.path.join(args.output_dir, "images") 24 | out_labels_dir = os.path.join(args.output_dir, "labels") 25 | os.makedirs(out_images_dir, exist_ok=True) 26 | os.makedirs(out_labels_dir, exist_ok=True) 27 | 28 | print(f"-- creating split files") 29 | print(f"---- train.txt") 30 | with open(os.path.join(args.output_dir, "train.txt"), "w") as train_split_file: 31 | with open(os.path.join(args.dataset_dir, "ImageSets/Main/train.txt"), "r") as train_file: 32 | samples = train_file.readlines() 33 | for i, sample in enumerate(samples): 34 | sample = sample.strip("\n") 35 | train_split_file.write(f"{sample}.jpg {sample}.xml\n") 36 | 37 | print(f"---- val.txt") 38 | with open(os.path.join(args.output_dir, "val.txt"), "w") as val_split_file: 39 | with open(os.path.join(args.dataset_dir, "ImageSets/Main/val.txt"), "r") as val_file: 40 | samples = val_file.readlines() 41 | for sample in samples: 42 | sample = sample.strip("\n") 43 | val_split_file.write(f"{sample}.jpg {sample}.xml\n") 44 | 45 | print(f"---- trainval.txt") 46 | with open(os.path.join(args.output_dir, "split.txt"), "w") as trainval_split_file: 47 | with open(os.path.join(args.dataset_dir, "ImageSets/Main/trainval.txt"), "r") as trainval_file: 48 | samples = trainval_file.readlines() 49 | for sample in samples: 50 | sample = sample.strip("\n") 51 | trainval_split_file.write(f"{sample}.jpg
{sample}.xml\n") 52 | 53 | print(f"-- copying images") 54 | for i, sample in enumerate(list(glob(os.path.join(images_dir, "*jpg")))): 55 | filename = os.path.basename(sample) 56 | shutil.copy( 57 | sample, 58 | os.path.join(out_images_dir, filename) 59 | ) 60 | 61 | print(f"-- copying labels") 62 | for i, sample in enumerate(list(glob(os.path.join(labels_dir, "*xml")))): 63 | filename = os.path.basename(sample) 64 | shutil.copy( 65 | sample, 66 | os.path.join(out_labels_dir, filename) 67 | ) 68 | 69 | print(f"-- writing label_maps.txt") 70 | with open(os.path.join(args.output_dir, "label_maps.txt"), "w") as label_maps_file: 71 | labels = list( 72 | glob(os.path.join(args.dataset_dir, "ImageSets/Main/*_train.txt"))) 73 | labels = [os.path.basename(i) for i in labels] 74 | labels = sorted([i[:i.index("_")] for i in labels]) 75 | for classname in labels: 76 | label_maps_file.write(f"{classname}\n") 77 | -------------------------------------------------------------------------------- /utils/prepare_svt_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | from glob import glob 5 | import json 6 | from xml.dom import minidom 7 | import xml.etree.cElementTree as ET 8 | from data_utils import COCO_Text 9 | import numpy as np 10 | import shutil 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Converts the coco dataset to a format suitable for training ssd with this repo.') 15 | parser.add_argument('dataset_dir', type=str, help='path to dataset dir.') 16 | parser.add_argument('output_dir', type=str, help='path to output dir.') 17 | args = parser.parse_args() 18 | 19 | assert os.path.exists(args.dataset_dir), "dataset_dir does not exist" 20 | testing_dir = os.path.join(os.path.join(args.output_dir, "test")) 21 | train_dir = os.path.join(os.path.join(args.output_dir, "train")) 22 | os.makedirs(os.path.join(testing_dir, "images"), exist_ok=True) 23 | os.makedirs(os.path.join(testing_dir, "labels"), exist_ok=True) 24 | os.makedirs(os.path.join(train_dir, "images"), exist_ok=True) 25 | os.makedirs(os.path.join(train_dir, "labels"), exist_ok=True) 26 | 27 | test = ET.parse(os.path.join(args.dataset_dir, "test.xml")) 28 | train = ET.parse(os.path.join(args.dataset_dir, "train.xml")) 29 | 30 | print("-- copy images for train sets") 31 | training_images = train.getroot().findall("image") 32 | for i, image in enumerate(training_images): 33 | print(f"image {i+1}/{len(training_images)}") 34 | image_filename = image.find("imageName").text 35 | image_filename = os.path.basename(image_filename) 36 | label_filename = f"{image_filename[:image_filename.index('.')]}.txt" 37 | rectangles = image.find("taggedRectangles").findall("taggedRectangle") 38 | 39 | shutil.copy( 40 | os.path.join(os.path.join(args.dataset_dir, "img"), image_filename), 41 | os.path.join(os.path.join(train_dir, "images"), image_filename) 42 | ) 43 | 44 | with open(os.path.join(os.path.join(train_dir, "labels"), label_filename), "w") as label_file: 45 | for rectangle in rectangles: 46 | text = rectangle.find("tag").text 47 | xmin = int(rectangle.attrib["x"]) 48 | ymin = int(rectangle.attrib["y"]) 49 | width = int(rectangle.attrib["width"]) 50 | height = int(rectangle.attrib["height"]) 51 | label_file.write(f"{xmin},{ymin},{xmin+width},{ymin},{xmin+width},{ymin+height},{xmin},{ymin+height},{text}\n") 52 | 53 | print("-- copy images for test sets") 54 | testing_images = test.getroot().findall("image") 55 | for i, image in 
enumerate(testing_images): 56 | print(f"image {i+1}/{len(testing_images)}") 57 | image_filename = image.find("imageName").text 58 | image_filename = os.path.basename(image_filename) 59 | label_filename = f"{image_filename[:image_filename.index('.')]}.txt" 60 | rectangles = image.find("taggedRectangles").findall("taggedRectangle") 61 | 62 | shutil.copy( 63 | os.path.join(os.path.join(args.dataset_dir, "img"), image_filename), 64 | os.path.join(os.path.join(testing_dir, "images"), image_filename) 65 | ) 66 | 67 | with open(os.path.join(os.path.join(testing_dir, "labels"), label_filename), "w") as label_file: 68 | for rectangle in rectangles: 69 | text = rectangle.find("tag").text 70 | xmin = int(rectangle.attrib["x"]) 71 | ymin = int(rectangle.attrib["y"]) 72 | width = int(rectangle.attrib["width"]) 73 | height = int(rectangle.attrib["height"]) 74 | label_file.write(f"{xmin},{ymin},{xmin+width},{ymin},{xmin+width},{ymin+height},{xmin},{ymin+height},{text}\n") 75 | -------------------------------------------------------------------------------- /utils/prepare_synthtext_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import argparse 5 | import shutil 6 | from scipy import io 7 | from glob import glob 8 | 9 | parser = argparse.ArgumentParser(description='Converts the synthtext dataset to a format suitable for training textboxes plus plus with this repo.') 10 | parser.add_argument('annotations_file', type=str, help='path to annotations file.') 11 | parser.add_argument('images_dir', type=str, help='path to images dir.') 12 | parser.add_argument('output_dir', type=str, help='path to output dir.') 13 | args = parser.parse_args() 14 | 15 | assert os.path.exists(args.annotations_file), "annotations_file does not exist" 16 | assert os.path.exists(args.images_dir), "images_dir does not exist" 17 | 18 | images_output_dir = os.path.join(args.output_dir, "images") 19 | labels_output_dir = os.path.join(args.output_dir, "labels") 20 | 21 | os.makedirs(images_output_dir, exist_ok=True) 22 | os.makedirs(labels_output_dir, exist_ok=True) 23 | 24 | ground_truth_file = io.loadmat(args.annotations_file) 25 | 26 | 27 | def clip_polygon(p, image): 28 | image_height, image_width, _ = image.shape 29 | polygon = p.copy() 30 | for n in [0, 2, 4, 6]: 31 | if polygon[n] < 0: 32 | polygon[n] = 0 33 | elif polygon[n] > image_width: 34 | polygon[n] = image_width 35 | for n in [1, 3, 5, 7]: 36 | if polygon[n] < 0: 37 | polygon[n] = 0 38 | elif polygon[n] > image_height: 39 | polygon[n] = image_height 40 | return polygon 41 | 42 | 43 | with open(os.path.join(args.output_dir, "samples.txt"), "w") as samples_file: 44 | for img_id in range(ground_truth_file["imnames"].shape[-1]): 45 | print(f"image: {img_id+1}/{ground_truth_file['imnames'].shape[-1]}") 46 | imname = ground_truth_file["imnames"][0][img_id][0] 47 | texts = ground_truth_file["txt"][0][img_id] 48 | wordBboxes = ground_truth_file["wordBB"][0] 49 | polygons = np.concatenate( 50 | [ 51 | np.expand_dims(wordBboxes[img_id][0].transpose(), axis=-1), 52 | np.expand_dims(wordBboxes[img_id][1].transpose(), axis=-1), 53 | ], 54 | axis=-1 55 | ) 56 | 57 | words = [] 58 | for word in texts: 59 | for i in word.split("\n"): 60 | for j in i.split(" "): 61 | if j != "": 62 | words.append(j) 63 | 64 | filename = os.path.basename(imname) 65 | sample = f"{filename} {filename[:filename.index('.')]}.txt" 66 | 67 | shutil.copy(os.path.join(args.images_dir, imname), 
os.path.join(images_output_dir, filename)) 68 | with open(os.path.join(labels_output_dir, f"{filename[:filename.index('.')]}.txt"), "w") as label_file: 69 | image = cv2.imread(os.path.join(images_output_dir, filename)) 70 | if len(polygons.shape) == 2: 71 | word = words[0] 72 | polygon = np.reshape(polygons, (8,)) 73 | polygon = clip_polygon(polygon, image) 74 | 75 | for coord in polygon: 76 | label_file.write(str(float(coord))) 77 | label_file.write(",") 78 | label_file.write(word) 79 | label_file.write("\n") 80 | else: 81 | for i, polygon in enumerate(polygons): 82 | word = words[i] 83 | polygon = np.reshape(polygon, (8,)) 84 | polygon = clip_polygon(polygon, image) 85 | 86 | for coord in polygon: 87 | label_file.write(str(float(coord))) 88 | label_file.write(",") 89 | label_file.write(word) 90 | label_file.write("\n") 91 | 92 | samples_file.write(sample) 93 | samples_file.write("\n") 94 | -------------------------------------------------------------------------------- /utils/ssd_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .decode_predictions import decode_predictions 2 | from .encode_bboxes import encode_bboxes 3 | from .generate_default_boxes_for_feature_map import generate_default_boxes_for_feature_map 4 | from .get_number_default_boxes import get_number_default_boxes 5 | from .match_gt_boxes_to_default_boxes import match_gt_boxes_to_default_boxes 6 | from .read_sample import read_sample 7 | -------------------------------------------------------------------------------- /utils/ssd_utils/encode_bboxes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def encode_bboxes(y, epsilon=10e-5): 5 | """ Encode the label to a proper format suitable for training SSD network. 6 | 7 | Args: 8 | - y: A numpy of shape (num_default_boxes, num_classes + 12) representing a label sample. 9 | 10 | Returns: 11 | - A numpy array with the same shape as y but its gt boxes values has been encoded to the proper SSD format. 12 | 13 | Paper References: 14 | - Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C.Y., & Berg, A. C. (2016). 15 | SSD: Single Shot MultiBox Detector. 
https://arxiv.org/abs/1512.02325 16 | 17 | Webpage References: 18 | - https://leimao.github.io/blog/Bounding-Box-Encoding-Decoding/ 19 | 20 | Code References: 21 | - https://github.com/pierluigiferrari/ssd_keras/blob/master/ssd_encoder_decoder/ssd_input_encoder.py 22 | """ 23 | gt_boxes = y[:, -12:-8] 24 | df_boxes = y[:, -8:-4] 25 | variances = y[:, -4:] 26 | encoded_gt_boxes_cx = ((gt_boxes[:, 0] - df_boxes[:, 0]) / (df_boxes[:, 2])) / np.sqrt(variances[:, 0]) 27 | encoded_gt_boxes_cy = ((gt_boxes[:, 1] - df_boxes[:, 1]) / (df_boxes[:, 3])) / np.sqrt(variances[:, 1]) 28 | encoded_gt_boxes_w = np.log(epsilon + gt_boxes[:, 2] / df_boxes[:, 2]) / np.sqrt(variances[:, 2]) 29 | encoded_gt_boxes_h = np.log(epsilon + gt_boxes[:, 3] / df_boxes[:, 3]) / np.sqrt(variances[:, 3]) 30 | y[:, -12] = encoded_gt_boxes_cx 31 | y[:, -11] = encoded_gt_boxes_cy 32 | y[:, -10] = encoded_gt_boxes_w 33 | y[:, -9] = encoded_gt_boxes_h 34 | return y 35 | -------------------------------------------------------------------------------- /utils/ssd_utils/generate_default_boxes_for_feature_map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .get_number_default_boxes import get_number_default_boxes 3 | from utils.bbox_utils import center_to_corner, corner_to_center 4 | 5 | 6 | def generate_default_boxes_for_feature_map( 7 | feature_map_size, 8 | image_size, 9 | offset, 10 | scale, 11 | next_scale, 12 | aspect_ratios, 13 | variances, 14 | extra_box_for_ar_1, 15 | clip_boxes=True, 16 | ): 17 | """ Generates a 4D numpy array representing default boxes. 18 | 19 | Note: 20 | - The structure of a default box is [cx, cy, width, height] followed by its four variance values 21 | 22 | Args: 23 | - feature_map_size: The size of the feature map. (must be square) 24 | - image_size: The size of the input image. (must be square) 25 | - offset: The offset for the center of the default boxes. The order is (offset_x, offset_y) 26 | - scale: The current scale of the default boxes. 27 | - next_scale: The next scale of the default boxes. 28 | - aspect_ratios: A list of aspect ratios representing the default boxes. 29 | - variances: A list of 4 variance values appended to each default box (used later when encoding/decoding box offsets). 30 | - extra_box_for_ar_1: Whether to add an extra box for the default box with aspect ratio 1.
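Note: when extra_box_for_ar_1 is True, the extra box's side length uses the geometric mean of the two scales, i.e. image_size * sqrt(scale * next_scale), following the SSD paper; e.g. (illustrative numbers) image_size=300, scale=0.2, next_scale=0.34 gives an extra box of roughly 78x78 pixels.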
31 | 32 | Returns: 33 | - A 4D numpy array of shape (feature_map_size, feature_map_size, num_default_boxes, 8) 34 | 35 | Raises: 36 | - offset does not have a len of 2 37 | 38 | Code References: 39 | - https://github.com/pierluigiferrari/ssd_keras/blob/master/keras_layers/keras_layer_AnchorBoxes.py 40 | """ 41 | assert len(offset) == 2, "offset must be of len 2" 42 | 43 | grid_size = image_size / feature_map_size 44 | offset_x, offset_y = offset 45 | num_default_boxes = get_number_default_boxes( 46 | aspect_ratios, 47 | extra_box_for_ar_1=extra_box_for_ar_1 48 | ) 49 | # get all widths and heights of the default boxes 50 | wh_list = [] 51 | for ar in aspect_ratios: 52 | if ar == 1.0 and extra_box_for_ar_1: 53 | wh_list.append([ 54 | image_size * np.sqrt(scale * next_scale) * np.sqrt(ar), 55 | image_size * np.sqrt(scale * next_scale) * (1 / np.sqrt(ar)), 56 | ]) 57 | wh_list.append([ 58 | image_size * scale * np.sqrt(ar), 59 | image_size * scale * (1 / np.sqrt(ar)), 60 | ]) 61 | wh_list = np.array(wh_list, dtype=float) 62 | # get all center points of each grid cell 63 | cx = np.linspace(offset_x * grid_size, image_size - (offset_x * grid_size), feature_map_size) 64 | cy = np.linspace(offset_y * grid_size, image_size - (offset_y * grid_size), feature_map_size) 65 | cx_grid, cy_grid = np.meshgrid(cx, cy) 66 | cx_grid, cy_grid = np.expand_dims(cx_grid, axis=-1), np.expand_dims(cy_grid, axis=-1) 67 | cx_grid, cy_grid = np.tile(cx_grid, (1, 1, num_default_boxes)), np.tile(cy_grid, (1, 1, num_default_boxes)) 68 | # assemble (cx, cy, w, h) for every grid cell and default box 69 | default_boxes = np.zeros((feature_map_size, feature_map_size, num_default_boxes, 4)) 70 | default_boxes[:, :, :, 0] = cx_grid 71 | default_boxes[:, :, :, 1] = cy_grid 72 | default_boxes[:, :, :, 2] = wh_list[:, 0] 73 | default_boxes[:, :, :, 3] = wh_list[:, 1] 74 | # clip overflow default boxes 75 | if clip_boxes: 76 | default_boxes = center_to_corner(default_boxes) 77 | x_coords = default_boxes[:, :, :, [0, 2]] 78 | x_coords[x_coords >= image_size] = image_size - 1 79 | x_coords[x_coords < 0] = 0 80 | default_boxes[:, :, :, [0, 2]] = x_coords 81 | y_coords = default_boxes[:, :, :, [1, 3]] 82 | y_coords[y_coords >= image_size] = image_size - 1 83 | y_coords[y_coords < 0] = 0 84 | default_boxes[:, :, :, [1, 3]] = y_coords 85 | default_boxes = corner_to_center(default_boxes) 86 | # normalize box coordinates to [0, 1] relative to the image size 87 | default_boxes[:, :, :, [0, 2]] /= image_size 88 | default_boxes[:, :, :, [1, 3]] /= image_size 89 | # append the variances to each default box 90 | variances_tensor = np.zeros_like(default_boxes) 91 | variances_tensor += variances 92 | default_boxes = np.concatenate([default_boxes, variances_tensor], axis=-1) 93 | return default_boxes 94 | -------------------------------------------------------------------------------- /utils/ssd_utils/get_number_default_boxes.py: -------------------------------------------------------------------------------- 1 | def get_number_default_boxes(aspect_ratios, extra_box_for_ar_1=True): 2 | """ Get the number of default boxes for each grid cell based on the number of aspect ratios 3 | and whether to add an extra box for aspect ratio 1 4 | 5 | Args: 6 | - aspect_ratios: A list containing the different aspect ratios of default boxes. 7 | - extra_box_for_ar_1: Whether to add an extra box for aspect ratio 1. 8 | 9 | Returns: 10 | - An integer for the number of default boxes.
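Example (illustrative): get_number_default_boxes([1.0, 2.0, 0.5], extra_box_for_ar_1=True) returns 4, i.e. one box per aspect ratio plus the extra scale-interpolated box for ar == 1.0.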
11 | """ 12 | num_aspect_ratios = len(aspect_ratios) 13 | return num_aspect_ratios + 1 if (1.0 in aspect_ratios) and extra_box_for_ar_1 else num_aspect_ratios 14 | -------------------------------------------------------------------------------- /utils/ssd_utils/match_gt_boxes_to_default_boxes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils.bbox_utils import iou, center_to_corner 3 | 4 | 5 | def match_gt_boxes_to_default_boxes( 6 | gt_boxes, 7 | default_boxes, 8 | match_threshold=0.5, 9 | neutral_threshold=0.3 10 | ): 11 | """ Matches ground truth bounding boxes to default boxes based on the SSD paper. 12 | 13 | 'We begin by matching each ground truth box to the default box with the best jaccard overlap (as in MultiBox [7]). 14 | Unlike MultiBox, we then match default boxes to any ground truth with jaccard overlap higher than a threshold (0.5)' 15 | 16 | Args: 17 | - gt_boxes: A numpy array or tensor of shape (num_gt_boxes, 4). Structure [cx, cy, w, h] 18 | - default_boxes: A numpy array of tensor of shape (num_default_boxes, 4). Structure [cx, cy, w, h] 19 | - threshold: A float representing a target to decide whether the box is matched 20 | - default_boxes: A numpy array of tensor of shape (num_default_boxes, 4). Structure [cx, cy, w, h] 21 | 22 | Returns: 23 | - matches: A numpy array of shape (num_matches, 2). The first index in the last dimension is the index 24 | of the ground truth box and the last index is the default box index. 25 | - neutral_boxes: A numpy array of shape (num_neutral_boxes, 2). The first index in the last dimension is the index 26 | of the ground truth box and the last index is the default box index. 27 | 28 | Raises: 29 | - Either the shape of ground truth's boxes array or the default boxes array is not 2 30 | 31 | Code References: 32 | - https://github.com/pierluigiferrari/ssd_keras/blob/master/ssd_encoder_decoder/matching_utils.py 33 | 34 | Paper References: 35 | - Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C.Y., & Berg, A. C. (2016). 36 | SSD: Single Shot MultiBox Detector. 
https://arxiv.org/abs/1512.02325 37 | """ 38 | 39 | assert len(gt_boxes.shape) == 2, "Shape of ground truth boxes array must be 2" 40 | assert len(default_boxes.shape) == 2, "Shape of default boxes array must be 2" 41 | 42 | # convert gt_boxes and default_boxes to [xmin, ymin, xmax, ymax] 43 | gt_boxes = center_to_corner(gt_boxes) 44 | default_boxes = center_to_corner(default_boxes) 45 | 46 | num_gt_boxes = gt_boxes.shape[0] 47 | num_default_boxes = default_boxes.shape[0] 48 | 49 | matches = np.zeros((num_gt_boxes, 2), dtype=int) 50 | 51 | # match each ground truth to the default box with the highest iou 52 | for i in range(num_gt_boxes): 53 | gt_box = gt_boxes[i] 54 | gt_box = np.tile( 55 | np.expand_dims(gt_box, axis=0), 56 | (num_default_boxes, 1) 57 | ) 58 | ious = iou(gt_box, default_boxes) 59 | matches[i] = [i, np.argmax(ious)] 60 | 61 | # match default boxes to ground truths with overlap higher than threshold 62 | gt_boxes = np.tile(np.expand_dims(gt_boxes, axis=1), (1, num_default_boxes, 1)) 63 | default_boxes = np.tile(np.expand_dims(default_boxes, axis=0), (num_gt_boxes, 1, 1)) 64 | ious = iou(gt_boxes, default_boxes) 65 | ious[:, matches[:, 1]] = 0 # zero out default boxes already claimed in the best-match step so they cannot be matched twice 66 | 67 | matched_gt_boxes_idxs = np.argmax(ious, axis=0) # for each default box, select the ground truth box that has the highest iou 68 | matched_ious = ious[matched_gt_boxes_idxs, list(range(num_default_boxes))] # get iou scores between gt and default box that were selected above 69 | matched_df_boxes_idxs = np.nonzero(matched_ious >= match_threshold)[0] # select only matched default boxes that have an iou of at least match_threshold 70 | matched_gt_boxes_idxs = matched_gt_boxes_idxs[matched_df_boxes_idxs] 71 | 72 | # concat the results of the two matching processes together 73 | matches = np.concatenate([ 74 | matches, 75 | np.concatenate([ 76 | np.expand_dims(matched_gt_boxes_idxs, axis=-1), 77 | np.expand_dims(matched_df_boxes_idxs, axis=-1) 78 | ], axis=-1), 79 | ], axis=0) 80 | ious[:, matches[:, 1]] = 0 # zero out everything matched so far before searching for neutral boxes 81 | 82 | # find neutral boxes (ious that are higher than neutral_threshold but below match_threshold) 83 | # these boxes are neither background nor have a high enough iou to qualify as a match. 84 | background_gt_boxes_idxs = np.argmax(ious, axis=0) 85 | background_gt_boxes_ious = ious[background_gt_boxes_idxs, list(range(num_default_boxes))] 86 | neutral_df_boxes_idxs = np.nonzero(background_gt_boxes_ious >= neutral_threshold)[0] 87 | neutral_gt_boxes_idxs = background_gt_boxes_idxs[neutral_df_boxes_idxs] 88 | neutral_boxes = np.concatenate([ 89 | np.expand_dims(neutral_gt_boxes_idxs, axis=-1), 90 | np.expand_dims(neutral_df_boxes_idxs, axis=-1) 91 | ], axis=-1) 92 | 93 | return matches, neutral_boxes 94 | -------------------------------------------------------------------------------- /utils/ssd_utils/read_sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import xml.etree.ElementTree as ET 4 | import numpy as np 5 | from utils import pascal_voc_utils 6 | 7 | 8 | def read_sample(image_path, label_path): 9 | """ Read an image and its label file in Pascal VOC xml format.
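For example (hypothetical paths): read_sample("images/000005.jpg", "labels/000005.xml") returns the image as a float array, an (n, 4) array of [xmin, ymin, xmax, ymax] boxes, and a list of n class name strings.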
10 | 11 | Args: 12 | - image_path: path to image file 13 | - label_path: path to label xml file 14 | 15 | Returns: 16 | - image: a numpy array with a data type of float 17 | - bboxes: a numpy array with a data type of float 18 | - classes: a list of strings 19 | 20 | Raises: 21 | - Image file does not exist 22 | - Label file does not exist 23 | """ 24 | image_path = image_path.strip("\n") 25 | label_path = label_path.strip("\n") 26 | assert os.path.exists(image_path), "Image file does not exist." 27 | bboxes, classes = pascal_voc_utils.read_label(label_path) 28 | image = cv2.imread(image_path) # read image in bgr format 29 | return np.array(image, dtype=float), np.array(bboxes, dtype=float), classes 30 | -------------------------------------------------------------------------------- /utils/textboxes_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .decode_predictions import decode_predictions 2 | from .get_bboxes_from_quads import get_bboxes_from_quads 3 | from .sort_quads_vertices import sort_quads_vertices 4 | from .read_sample import read_sample 5 | from .encode_textboxes import encode_textboxes 6 | from .get_samples import get_samples 7 | from .get_num_quads import get_num_quads 8 | -------------------------------------------------------------------------------- /utils/textboxes_utils/encode_textboxes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils import bbox_utils 3 | 4 | 5 | def encode_textboxes(y, epsilon=10e-5): 6 | """ Encode the label to a proper format suitable for training the TextBoxes++ network. 7 | 8 | Args: 9 | - y: A numpy array of shape (num_default_boxes, 2 + 12 + 8) representing a label sample. 10 | 11 | Returns: 12 | - A numpy array with the same shape as y but its gt box values have been encoded to the proper TextBoxes++ format. 13 | 14 | Paper References: 15 | - Liao, M., Shi, B., & Bai, X. (2018). TextBoxes++: A Single-Shot Oriented Scene Text Detector.
https://arxiv.org/abs/1801.02765 16 | """ 17 | gt_textboxes = y[:, -20:-8] 18 | df_boxes = y[:, -8:-4] 19 | df_boxes_vertices = bbox_utils.center_to_vertices(df_boxes) 20 | variances = y[:, -4:] 21 | encoded_gt_textboxes_cx = ((gt_textboxes[:, 0] - df_boxes[:, 0]) / (df_boxes[:, 2])) / np.sqrt(variances[:, 0]) 22 | encoded_gt_textboxes_cy = ((gt_textboxes[:, 1] - df_boxes[:, 1]) / (df_boxes[:, 3])) / np.sqrt(variances[:, 1]) 23 | encoded_gt_textboxes_w = np.log(epsilon + gt_textboxes[:, 2] / df_boxes[:, 2]) / np.sqrt(variances[:, 2]) 24 | encoded_gt_textboxes_h = np.log(epsilon + gt_textboxes[:, 3] / df_boxes[:, 3]) / np.sqrt(variances[:, 3]) 25 | encoded_gt_textboxes_x1 = ((gt_textboxes[:, 4] - df_boxes_vertices[:, 0, 0]) / df_boxes[:, 2]) / np.sqrt(variances[:, 0]) 26 | encoded_gt_textboxes_y1 = ((gt_textboxes[:, 5] - df_boxes_vertices[:, 0, 1]) / df_boxes[:, 3]) / np.sqrt(variances[:, 1]) 27 | encoded_gt_textboxes_x2 = ((gt_textboxes[:, 6] - df_boxes_vertices[:, 1, 0]) / df_boxes[:, 2]) / np.sqrt(variances[:, 0]) 28 | encoded_gt_textboxes_y2 = ((gt_textboxes[:, 7] - df_boxes_vertices[:, 1, 1]) / df_boxes[:, 3]) / np.sqrt(variances[:, 1]) 29 | encoded_gt_textboxes_x3 = ((gt_textboxes[:, 8] - df_boxes_vertices[:, 2, 0]) / df_boxes[:, 2]) / np.sqrt(variances[:, 0]) 30 | encoded_gt_textboxes_y3 = ((gt_textboxes[:, 9] - df_boxes_vertices[:, 2, 1]) / df_boxes[:, 3]) / np.sqrt(variances[:, 1]) 31 | encoded_gt_textboxes_x4 = ((gt_textboxes[:, 10] - df_boxes_vertices[:, 3, 0]) / df_boxes[:, 2]) / np.sqrt(variances[:, 0]) 32 | encoded_gt_textboxes_y4 = ((gt_textboxes[:, 11] - df_boxes_vertices[:, 3, 1]) / df_boxes[:, 3]) / np.sqrt(variances[:, 1]) 33 | y[:, -20] = encoded_gt_textboxes_cx 34 | y[:, -19] = encoded_gt_textboxes_cy 35 | y[:, -18] = encoded_gt_textboxes_w 36 | y[:, -17] = encoded_gt_textboxes_h 37 | y[:, -16] = encoded_gt_textboxes_x1 38 | y[:, -15] = encoded_gt_textboxes_y1 39 | y[:, -14] = encoded_gt_textboxes_x2 40 | y[:, -13] = encoded_gt_textboxes_y2 41 | y[:, -12] = encoded_gt_textboxes_x3 42 | y[:, -11] = encoded_gt_textboxes_y3 43 | y[:, -10] = encoded_gt_textboxes_x4 44 | y[:, -9] = encoded_gt_textboxes_y4 45 | return y 46 | -------------------------------------------------------------------------------- /utils/textboxes_utils/get_bboxes_from_quads.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def get_bboxes_from_quads(quads): 5 | """ Extracts the minimum axis-aligned bounding rectangle from each quadrilateral. 6 | 7 | Args: 8 | - quads: A numpy array of shape (n, 4, 2) representing the vertices of the quadrilaterals.
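For example (illustrative values): a quad with vertices (0, 0), (10, 0), (10, 4), (0, 4) yields [5., 2., 10., 4.], i.e. center (5, 2) with width 10 and height 4.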
--------------------------------------------------------------------------------
/utils/textboxes_utils/get_num_quads.py:
--------------------------------------------------------------------------------
import os


def get_num_quads(label_file):
    """ Count the valid quadrilateral labels in a label file.

    Args:
        - label_file: Path to the label file. A valid line holds 8 comma-separated
          vertex coordinates followed by a text transcription.

    Returns:
        - The number of valid quadrilateral labels found in the file.
    """
    label_path = label_file.strip("\n")
    assert os.path.exists(label_path), "Label file does not exist."

    with open(label_path, "r") as label_file:
        temp_labels = label_file.readlines()

    num_labels = 0

    for label in temp_labels:
        label = label.strip("\ufeff").strip("\n")
        label = label.split(",")

        if len(label[:-1]) != 8:
            continue

        num_labels += 1

    return num_labels
--------------------------------------------------------------------------------
/utils/textboxes_utils/get_samples.py:
--------------------------------------------------------------------------------
import os
from glob import glob
from utils import textboxes_utils


def get_samples(images_dir, labels_dir):
    """ Create a list of samples that can be fed to a data generator.

    Args:
        - images_dir: Path to images directory.
        - labels_dir: Path to labels directory.

    Returns:
        - A list of samples. Each sample is a string containing the paths to both the image file and its corresponding label file, separated by a space.

    Raises:
        - images_dir is not a directory.
        - labels_dir is not a directory.
    """
    assert os.path.isdir(images_dir), "images_dir is not a directory."
    assert os.path.isdir(labels_dir), "labels_dir is not a directory."

    images = sorted(list(glob(os.path.join(images_dir, "*.jpg"))))
    labels = sorted(list(glob(os.path.join(labels_dir, "*.txt"))))

    assert len(images) == len(labels), "the number of images and the number of labels do not match"

    samples = []

    all_samples = list(zip(images, labels))
    num_samples = len(all_samples)

    for i, (image_path, label_path) in enumerate(all_samples):

        if (i % 100 == 0):
            print(f"{i+1}/{num_samples}")

        # skip samples whose label file contains no valid quadrilaterals
        num_quads = textboxes_utils.get_num_quads(label_path)
        if num_quads == 0:
            continue

        samples.append(f"{image_path} {label_path}")

    return samples
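# Illustrative sketch (not part of the repository): the label files consumed by
# get_num_quads and read_sample are assumed to be ICDAR-style, one quadrilateral
# per line as "x1,y1,x2,y2,x3,y3,x4,y4,text". Writing and counting a toy file:
# from utils.textboxes_utils import get_num_quads  # repo import

with open("/tmp/toy_label.txt", "w") as f:
    f.write("10,10,90,12,88,40,8,38,HELLO\n")  # 8 coordinates + transcription
    f.write("malformed line\n")                # skipped by both readers

print(get_num_quads("/tmp/toy_label.txt"))  # 1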
--------------------------------------------------------------------------------
/utils/textboxes_utils/read_sample.py:
--------------------------------------------------------------------------------
import os
import cv2
import numpy as np


def read_sample(image_path, label_path):
    """ Read an image and its label file (one comma-separated quadrilateral per line).

    Args:
        - image_path: path to image file
        - label_path: path to label text file

    Returns:
        - image: a numpy array with a data type of float
        - quads: a numpy array with a data type of float

    Raises:
        - Image file does not exist
        - Label file does not exist
    """
    image_path = image_path.strip("\n")
    label_path = label_path.strip("\n")
    assert os.path.exists(image_path), "Image file does not exist."
    assert os.path.exists(label_path), "Label file does not exist."

    image = cv2.imread(image_path)  # read image in bgr format

    with open(label_path, "r") as label_file:
        temp_labels = label_file.readlines()

    labels = []

    for label in temp_labels:
        label = label.strip("\ufeff").strip("\n")
        label = label.split(",")

        if len(label) != 9:
            continue

        label = [float(i) for i in label[:8]]
        labels.append(label)

    labels = np.array(labels)
    quads = np.reshape(labels, (labels.shape[0], 4, 2))

    # np.float is a deprecated alias for the builtin float (float64)
    return np.array(image, dtype=np.float64), np.array(quads, dtype=np.float64)
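# Illustrative sketch (not part of the repository): each parsed label line
# becomes one (4, 2) array of vertices, so n lines yield quads of shape (n, 4, 2).
import numpy as np

flat = np.array([[10, 10, 90, 12, 88, 40, 8, 38]], dtype=np.float64)  # one label row
quads = np.reshape(flat, (flat.shape[0], 4, 2))
print(quads.shape)  # (1, 4, 2)
print(quads[0, 0])  # first vertex: [10. 10.]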
--------------------------------------------------------------------------------
/utils/textboxes_utils/sort_quads_vertices.py:
--------------------------------------------------------------------------------
import numpy as np
from .get_bboxes_from_quads import get_bboxes_from_quads
from utils import bbox_utils


def sort_quads_vertices(quads_prime):
    """ Sort quadrilateral vertices.

    Args:
        - quads_prime: A numpy array of shape (n, 4, 2) representing the quadrilaterals.

    Returns:
        - A numpy array with the same shape as quads_prime but the vertices of each quadrilateral are sorted based on the logic from Liao, Shi & Bai (2018).

    Paper References:
        - Liao, M., Shi, B., & Bai, X. (2018). TextBoxes++: A Single-Shot Oriented Scene Text Detector.
          https://arxiv.org/abs/1801.02765
    """
    num_quads = quads_prime.shape[0]
    quads = quads_prime.copy()
    # the vertices of each quad's minimum bounding rectangle act as reference points
    bboxes = get_bboxes_from_quads(quads_prime)
    bboxes = bbox_utils.center_to_vertices(bboxes)

    # enumerate all four cyclic shifts (delta) of the quad vertices against the
    # four bounding-rectangle vertices (i)
    deltas = np.reshape(np.tile(np.reshape(np.expand_dims(np.array([0, 1, 2, 3]), axis=0), (4, 1)), (1, 4)), (16, 1))
    i = np.reshape(np.tile(np.expand_dims(np.array([1, 2, 3, 4]), axis=0), (1, 4)), (16, 1))
    q_indexes = (i + deltas - 1) % 4 + 1
    indexes = np.concatenate([i, q_indexes], axis=-1)

    # summed point-to-point distance for each cyclic shift
    pts_b = bboxes[:, indexes[:, 0] - 1]
    pts_q = quads[:, indexes[:, 1] - 1]
    distance = np.sqrt((pts_b[..., 0] - pts_q[..., 0]) ** 2 + (pts_b[..., 1] - pts_q[..., 1]) ** 2)
    distance = np.reshape(distance, (num_quads, 4, 4))
    distance = np.sum(distance, axis=-1)

    # pick the shift (delta_m) that minimizes the summed distance
    delta_ms = np.argmin(distance, axis=-1)
    delta_ms = np.expand_dims(delta_ms, axis=-1)
    delta_ms = np.tile(delta_ms, (1, 4))
    delta_ms = np.reshape(delta_ms, (num_quads, 4, 1))

    i_prime = np.array([1, 2, 3, 4])
    i_prime = np.expand_dims(i_prime, axis=-1)
    i_prime = np.expand_dims(i_prime, axis=0)
    i_prime = np.tile(i_prime, (num_quads, 1, 1))
    q_idx_prime = (i_prime + delta_ms - 1) % 4 + 1
    i_prime = np.reshape(i_prime, (num_quads, 4)) - 1
    q_idx_prime = np.reshape(q_idx_prime, (num_quads, 4)) - 1

    # apply the chosen cyclic shift to every quad
    for i in range(num_quads):
        quads[i, i_prime[i]] = quads_prime[i, q_idx_prime[i]]

    return quads
--------------------------------------------------------------------------------
/utils/training_utils/__init__.py:
--------------------------------------------------------------------------------
from .ssd_mobilenetv2 import ssd_mobilenetv2
from .ssd_mobilenetv1 import ssd_mobilenetv1
from .ssd_vgg16 import ssd_vgg16
from .tbpp_vgg16 import tbpp_vgg16
--------------------------------------------------------------------------------
/utils/training_utils/ssd_mobilenetv1.py:
--------------------------------------------------------------------------------
import os
from losses import SSD_LOSS
from utils import data_utils
from networks import SSD_MOBILENET
from tensorflow.keras.optimizers import SGD
from data_generators import SSD_DATA_GENERATOR
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.applications.mobilenet import preprocess_input


def ssd_mobilenetv1(config, args):
    training_config = config["training"]
    with open(args.label_maps, "r") as label_map_file:
        label_maps = [i.strip("\n") for i in label_map_file.readlines()]

    training_samples = data_utils.get_samples_from_split(
        split_file=args.training_split,
        images_dir=args.images_dir,
        labels_dir=args.labels_dir
    )

    if args.validation_split is not None:
        validation_samples = data_utils.get_samples_from_split(
            split_file=args.validation_split,
            images_dir=args.images_dir,
            labels_dir=args.labels_dir
        )

    training_data_generator = SSD_DATA_GENERATOR(
        samples=training_samples,
        config=config,
        label_maps=label_maps,
        shuffle=args.shuffle,
        batch_size=args.batch_size,
        augment=args.augment,
        process_input_fn=preprocess_input
    )

    if args.validation_split is not None:
        print("-- validation split specified")
        validation_data_generator = SSD_DATA_GENERATOR(
            samples=validation_samples,
            config=config,
            label_maps=label_maps,
            shuffle=args.shuffle,
            batch_size=args.batch_size,
            augment=False,  # no augmentation on the validation set
            process_input_fn=preprocess_input
        )

    loss = SSD_LOSS(
        alpha=training_config["alpha"],
        min_negative_boxes=training_config["min_negative_boxes"],
        negative_boxes_ratio=training_config["negative_boxes_ratio"]
    )

    model = SSD_MOBILENET(
        config=config,
        label_maps=label_maps,
        is_training=True
    )

    optimizer = SGD(
        learning_rate=args.learning_rate,  # `lr` is a deprecated alias
        momentum=0.9,
        decay=0.0005,
        nesterov=False
    )

    model.compile(
        optimizer=optimizer,
        loss=loss.compute
    )

    if args.checkpoint is not None:
        assert os.path.exists(args.checkpoint), "checkpoint does not exist"
        model.load_weights(args.checkpoint, by_name=True)

    model.fit(
        x=training_data_generator,
        validation_data=validation_data_generator if args.validation_split is not None else None,
        batch_size=args.batch_size,
        validation_batch_size=args.batch_size,
        epochs=args.epochs,
        callbacks=[
            ModelCheckpoint(
                filepath=os.path.join(
                    args.output_dir,
                    "cp_{epoch:02d}_loss-{loss:.2f}.h5" if args.validation_split is None else "cp_{epoch:02d}_loss-{loss:.2f}_valloss-{val_loss:.2f}.h5"
                ),
                save_weights_only=True,
                monitor='loss' if args.validation_split is None else 'val_loss',
                mode='min'
            )
        ]
    )

    model.save_weights(os.path.join(args.output_dir, "model.h5"))
--------------------------------------------------------------------------------
/utils/training_utils/ssd_mobilenetv2.py:
--------------------------------------------------------------------------------
import os
from losses import SSD_LOSS
from utils import data_utils
from networks import SSD_MOBILENETV2
from tensorflow.keras.optimizers import SGD
from data_generators import SSD_DATA_GENERATOR
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input


def ssd_mobilenetv2(config, args):
    training_config = config["training"]
    with open(args.label_maps, "r") as label_map_file:
        label_maps = [i.strip("\n") for i in label_map_file.readlines()]

    training_samples = data_utils.get_samples_from_split(
        split_file=args.training_split,
        images_dir=args.images_dir,
        labels_dir=args.labels_dir
    )

    if args.validation_split is not None:
        validation_samples = data_utils.get_samples_from_split(
            split_file=args.validation_split,
            images_dir=args.images_dir,
            labels_dir=args.labels_dir
        )

    training_data_generator = SSD_DATA_GENERATOR(
        samples=training_samples,
        config=config,
        label_maps=label_maps,
        shuffle=args.shuffle,
        batch_size=args.batch_size,
        augment=args.augment,
        process_input_fn=preprocess_input
    )

    if args.validation_split is not None:
        print("-- validation split specified")
        validation_data_generator = SSD_DATA_GENERATOR(
            samples=validation_samples,
            config=config,
            label_maps=label_maps,
            shuffle=args.shuffle,
            batch_size=args.batch_size,
            augment=False,  # no augmentation on the validation set
            process_input_fn=preprocess_input
        )

    loss = SSD_LOSS(
        alpha=training_config["alpha"],
        min_negative_boxes=training_config["min_negative_boxes"],
        negative_boxes_ratio=training_config["negative_boxes_ratio"]
    )

    model = SSD_MOBILENETV2(
        config=config,
        label_maps=label_maps,
        is_training=True
    )
    optimizer = SGD(
        learning_rate=args.learning_rate,  # `lr` is a deprecated alias
        momentum=0.9,
        decay=0.0005,
        nesterov=False
    )

    model.compile(
        optimizer=optimizer,
        loss=loss.compute
    )

    if args.checkpoint is not None:
        assert os.path.exists(args.checkpoint), "checkpoint does not exist"
        model.load_weights(args.checkpoint, by_name=True)

    model.fit(
        x=training_data_generator,
        validation_data=validation_data_generator if args.validation_split is not None else None,
        batch_size=args.batch_size,
        validation_batch_size=args.batch_size,
        epochs=args.epochs,
        callbacks=[
            ModelCheckpoint(
                filepath=os.path.join(
                    args.output_dir,
                    "cp_{epoch:02d}_loss-{loss:.2f}.h5" if args.validation_split is None else "cp_{epoch:02d}_loss-{loss:.2f}_valloss-{val_loss:.2f}.h5"
                ),
                save_weights_only=True,
                monitor='loss' if args.validation_split is None else 'val_loss',
                mode='min'
            )
        ]
    )

    model.save_weights(os.path.join(args.output_dir, "model.h5"))
--------------------------------------------------------------------------------
/utils/training_utils/ssd_vgg16.py:
--------------------------------------------------------------------------------
import os
from losses import SSD_LOSS
from utils import data_utils
from networks import SSD_VGG16
from tensorflow.keras.optimizers import SGD, Adam
from data_generators import SSD_DATA_GENERATOR
from tensorflow.keras.applications.vgg16 import preprocess_input


def ssd_vgg16(config, args, callbacks):
    training_config = config["training"]
    with open(args.label_maps, "r") as label_map_file:
        label_maps = [i.strip("\n") for i in label_map_file.readlines()]

    training_samples = data_utils.get_samples_from_split(
        split_file=args.training_split,
        images_dir=args.images_dir,
        labels_dir=args.labels_dir
    )

    if args.validation_split is not None:
        validation_samples = data_utils.get_samples_from_split(
            split_file=args.validation_split,
            images_dir=args.images_dir,
            labels_dir=args.labels_dir
        )

    training_data_generator = SSD_DATA_GENERATOR(
        samples=training_samples,
        config=config,
        label_maps=label_maps,
        shuffle=args.shuffle,
        batch_size=args.batch_size,
        augment=args.augment,
        process_input_fn=preprocess_input
    )

    if args.validation_split is not None:
        print("-- validation split specified")
        validation_data_generator = SSD_DATA_GENERATOR(
            samples=validation_samples,
            config=config,
            label_maps=label_maps,
            shuffle=args.shuffle,
            batch_size=args.batch_size,
            augment=False,  # no augmentation on the validation set
            process_input_fn=preprocess_input
        )

    loss = SSD_LOSS(
        alpha=training_config["alpha"],
        min_negative_boxes=training_config["min_negative_boxes"],
        negative_boxes_ratio=training_config["negative_boxes_ratio"]
    )

    if training_config["optimizer"]["name"] == "adam":
        optimizer = Adam(
            learning_rate=args.learning_rate,
            beta_1=training_config["optimizer"]["beta_1"],
            beta_2=training_config["optimizer"]["beta_2"],
            epsilon=training_config["optimizer"]["epsilon"],
            decay=training_config["optimizer"]["decay"]
        )
    elif training_config["optimizer"]["name"] == "sgd":
        optimizer = SGD(
            learning_rate=args.learning_rate,
            momentum=training_config["optimizer"]["momentum"],
            decay=training_config["optimizer"]["decay"],
            nesterov=training_config["optimizer"]["nesterov"]
        )
    else:
        # unknown optimizer name: fall back to Adam with the tf.keras defaults
        optimizer = Adam(
            learning_rate=args.learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-08,
            decay=0.0
        )

    model = SSD_VGG16(
        config=config,
        label_maps=label_maps,
        is_training=True
    )

    if args.show_network_structure:
        model.summary()

    model.compile(
        optimizer=optimizer,
        loss=loss.compute
    )

    if args.checkpoint is not None:
        assert os.path.exists(args.checkpoint), "checkpoint does not exist"
        model.load_weights(args.checkpoint, by_name=True)

    model.fit(
        x=training_data_generator,
        validation_data=validation_data_generator if args.validation_split is not None else None,
        batch_size=args.batch_size,
        validation_batch_size=args.batch_size,
        epochs=args.epochs,
        initial_epoch=args.initial_epoch,
        callbacks=callbacks,
    )

    model.save_weights(os.path.join(args.output_dir, "model.h5"))
--------------------------------------------------------------------------------
/utils/training_utils/tbpp_vgg16.py:
--------------------------------------------------------------------------------
import os
from utils import data_utils
from losses import TBPP_LOSS
from networks import TBPP_VGG16
from tensorflow.keras.optimizers import Adam
from data_generators import TBPP_DATA_GENERATOR
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.applications.vgg16 import preprocess_input


def tbpp_vgg16(config, args):
    training_config = config["training"]

    training_samples = data_utils.get_samples_from_split(
        split_file=args.training_split,
        images_dir=args.images_dir,
        labels_dir=args.labels_dir
    )

    if args.validation_split is not None:
        validation_samples = data_utils.get_samples_from_split(
            split_file=args.validation_split,
            images_dir=args.images_dir,
            labels_dir=args.labels_dir
        )

    print("creating data generator for tbpp_vgg16")
    training_data_generator = TBPP_DATA_GENERATOR(
        samples=training_samples,
        config=config,
        shuffle=args.shuffle,
        batch_size=args.batch_size,
        augment=args.augment,
        process_input_fn=preprocess_input
    )

    if args.validation_split is not None:
        print("-- validation split specified")
        validation_data_generator = TBPP_DATA_GENERATOR(
            samples=validation_samples,
            config=config,
            shuffle=args.shuffle,
            batch_size=args.batch_size,
            augment=False,  # no augmentation on the validation set
            process_input_fn=preprocess_input
        )

    loss = TBPP_LOSS(
        alpha=training_config["alpha"],
        min_negative_boxes=training_config["min_negative_boxes"],
        negative_boxes_ratio=training_config["negative_boxes_ratio"]
    )

    model = TBPP_VGG16(
        config=config,
        is_training=True
    )

    optimizer = Adam(
        learning_rate=args.learning_rate,  # `lr` is a deprecated alias
        beta_1=0.9,
        beta_2=0.999,
        epsilon=0.001,
        decay=0.0
    )

    model.compile(
        optimizer=optimizer,
        loss=loss.compute
    )

    if args.checkpoint is not None:
        assert os.path.exists(args.checkpoint), "checkpoint does not exist"
        model.load_weights(args.checkpoint, by_name=True)

    model.fit(
        x=training_data_generator,
        validation_data=validation_data_generator if args.validation_split is not None else None,
        batch_size=args.batch_size,
        validation_batch_size=args.batch_size,
        epochs=args.epochs,
        callbacks=[
            ModelCheckpoint(
                filepath=os.path.join(
                    args.output_dir,
                    "cp_{epoch:02d}_loss-{loss:.2f}.h5" if args.validation_split is None else "cp_{epoch:02d}_loss-{loss:.2f}_valloss-{val_loss:.2f}.h5"
                ),
                save_weights_only=True,
                monitor='loss' if args.validation_split is None else 'val_loss',
                mode='min'
            )
        ]
    )

    model.save_weights(os.path.join(args.output_dir, "model.h5"))
--------------------------------------------------------------------------------
/utils/visualize_training_metrics.py:
--------------------------------------------------------------------------------
import os
import argparse
import pandas as pd
import matplotlib.pyplot as plt

parser = argparse.ArgumentParser(
    description='Visualize training metrics.')
parser.add_argument('logfile', type=str, help='path to the training log csv file.')
args = parser.parse_args()

assert os.path.exists(args.logfile), "logfile does not exist"

data = pd.read_csv(args.logfile)

plt.plot(data["epoch"], data["loss"], label="loss")
if "val_loss" in data.columns:  # only present when a validation split was used
    plt.plot(data["epoch"], data["val_loss"], label="val_loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()
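# Usage note (not part of the repository): the script expects a CSV in the
# format written by tf.keras.callbacks.CSVLogger, e.g.:
#
#   epoch,loss,val_loss
#   0,12.31,11.90
#   1,10.02,9.87
#
# and would then be invoked with a path such as (hypothetical):
#   python utils/visualize_training_metrics.py output/training_log.csv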
--------------------------------------------------------------------------------
/webcam.py:
--------------------------------------------------------------------------------
import os
import cv2
import json
import argparse
import numpy as np
from networks import SSD_VGG16
from tensorflow.keras.applications import vgg16, mobilenet_v2
from utils import bbox_utils
from networks import SSD_MOBILENETV2

parser = argparse.ArgumentParser(
    description='run inference from images on webcam.')
parser.add_argument('config', type=str, help='path to config file.')
parser.add_argument('weights', type=str, help='path to the weight file.')
parser.add_argument('--label_maps', type=str, help='path to label maps file.')
parser.add_argument('--confidence_threshold', type=float,
                    help='the confidence score a detection should match in order to be counted.', default=0.9)
parser.add_argument('--num_predictions', type=int,
                    help='the number of detections to be output as final detections', default=10)
args = parser.parse_args()

with open(args.config, "r") as config_file:
    config = json.load(config_file)

input_size = config["model"]["input_size"]
model_config = config["model"]

if model_config["name"] == "ssd_mobilenetv2":
    with open(args.label_maps, "r") as file:
        label_maps = [line.strip("\n") for line in file.readlines()]
    model = SSD_MOBILENETV2(
        config,
        label_maps,
        is_training=False,
        num_predictions=args.num_predictions)
    process_input_fn = mobilenet_v2.preprocess_input
else:
    print("model has not been implemented")
    exit()

model.load_weights(args.weights)

webcam = cv2.VideoCapture(0)

while True:
    check, image = webcam.read()
    if not check:  # stop if a webcam frame could not be read
        break
    display_image = image.copy()
    image_height, image_width, _ = image.shape
    height_scale, width_scale = input_size/image_height, input_size/image_width

    image = cv2.resize(image, (input_size, input_size))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = process_input_fn(image)

    image = np.expand_dims(image, axis=0)
    y_pred = model.predict(image)

    for i, pred in enumerate(y_pred[0]):
        classname = label_maps[int(pred[0]) - 1].upper()
        confidence_score = pred[1]

        score = f"{'%.2f' % (confidence_score * 100)}%"
        print(f"-- {classname}: {score}")

        if confidence_score <= 1 and confidence_score > args.confidence_threshold:
            # map the detection from network input coordinates back to the original frame
            xmin = max(int(pred[2] / width_scale), 1)
            ymin = max(int(pred[3] / height_scale), 1)
            xmax = min(int(pred[4] / width_scale), image_width-1)
            ymax = min(int(pred[5] / height_scale), image_height-1)
            x1 = max(min(int(pred[6] / width_scale), image_width), 0)
            y1 = max(min(int(pred[7] / height_scale), image_height), 0)
            x2 = max(min(int(pred[8] / width_scale), image_width), 0)
            y2 = max(min(int(pred[9] / height_scale), image_height), 0)
            x3 = max(min(int(pred[10] / width_scale), image_width), 0)
            y3 = max(min(int(pred[11] / height_scale), image_height), 0)
            x4 = max(min(int(pred[12] / width_scale), image_width), 0)
            y4 = max(min(int(pred[13] / height_scale), image_height), 0)

            # cv2.polylines expects int32 vertices (np.int is deprecated)
            quad = np.array(
                [[x1, y1], [x2, y2], [x3, y3], [x4, y4]], dtype=np.int32)

            cv2.putText(
                display_image,
                classname,
                (int(xmin), int(ymin)),
                cv2.FONT_HERSHEY_PLAIN,
                1,
                (100, 100, 255),
                1, 1)

            cv2.polylines(
                display_image,
                [quad],
                True,
                (0, 255, 0),
                2
            )

            cv2.rectangle(
                display_image,
                (xmin, ymin),
                (xmax, ymax),
                (255, 0, 0),
                1
            )

    cv2.imshow('video', display_image)

    if cv2.waitKey(1) == ord('q'):
        break

webcam.release()
cv2.destroyAllWindows()
--------------------------------------------------------------------------------
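# Illustrative sketch (not part of the repository): webcam.py assumes each
# decoded prediction row is laid out as
#   [class_id, confidence, xmin, ymin, xmax, ymax, x1, y1, x2, y2, x3, y3, x4, y4]
# so the axis-aligned box is pred[2:6] and the quadrilateral vertices pred[6:14].
# A toy row illustrating that slicing:
import numpy as np

pred = np.array([1, 0.95, 10, 10, 90, 40, 10, 10, 90, 12, 88, 40, 8, 38])
xmin, ymin, xmax, ymax = pred[2:6]
quad = pred[6:14].reshape(4, 2).astype(np.int32)
print(xmin, ymin, xmax, ymax, quad.shape)  # 10.0 10.0 90.0 40.0 (4, 2)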