├── .gitignore
├── README.md
├── object_detection
│   ├── config
│   │   ├── __init__.py
│   │   ├── config_factory.py
│   │   ├── faster_rcnn_config.py
│   │   └── fpn_config.py
│   ├── dataset
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── coco_tf_dataset_generator.py
│   │   ├── dataset_factory.py
│   │   ├── eval_pascal_tf_dataset.py
│   │   ├── pascal_tf_dataset_generator.py
│   │   ├── pascal_tf_dataset_local_file.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── label_map_utils.py
│   │       ├── tf_dataset_utils.py
│   │       └── tf_record_utils.py
│   ├── evaluation
│   │   ├── detectron_pascal_evaluation_utils.py
│   │   ├── pascal_eval_files_utils.py
│   │   └── pascal_voc_map_utils.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── anchor_target.py
│   │   ├── faster_rcnn
│   │   │   ├── __init__.py
│   │   │   ├── base_faster_rcnn_model.py
│   │   │   ├── resnet_faster_rcnn.py
│   │   │   └── vgg16_faster_rcnn.py
│   │   ├── fpn
│   │   │   ├── __init__.py
│   │   │   ├── base_fpn_model.py
│   │   │   └── resnet_fpn.py
│   │   ├── losses.py
│   │   ├── model_factory.py
│   │   ├── prediction.py
│   │   ├── proposal_target.py
│   │   ├── region_proposal.py
│   │   └── roi_pooling.py
│   ├── protos
│   │   ├── __init__.py
│   │   └── string_int_label_map.proto
│   └── utils
│       ├── __init__.py
│       ├── anchor_generator.py
│       ├── bbox_np.py
│       ├── bbox_tf.py
│       ├── bbox_transform.py
│       ├── pytorch_to_tf.py
│       └── visual_utils.py
└── scripts
    ├── eval_coco.py
    ├── eval_pascal.py
    ├── generate_pascal_tf_records.py
    ├── label_map_src
    │   └── pascal_label_map.pbtxt
    └── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *pb2.py
3 | .ipynb_checkpoints/
4 | logs*
5 | ./pycocotools
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TF EAGER OBJECT DETECTION
2 | 
3 | ## 0. Targets
4 | + TensorFlow Eager Mode.
5 | + Object detection models.
6 | 
7 | ## 1. Architecture
8 | + `scripts`:
9 |   + `generate_pascal_tf_records.py`: generate tfrecords files from pascal source files.
10 |   + `train.py`: train on the coco or pascal dataset.
11 |   + `eval_pascal.py`: evaluate on the pascal dataset.
12 |   + `label_map_src`: copied from the TensorFlow Object Detection API.
13 | + `object_detection/dataset`:
14 |   + `utils`:
15 |     + `label_map_utils.py`: copied from the TensorFlow Object Detection API.
16 |     + `tf_record_utils.py`: utils to generate tfrecords files.
17 |     + `tf_dataset_utils.py`: utils to generate `tf.data.Dataset` objects.
18 |   + `pascal_tf_dataset_generator.py`: get the training pascal `tf.data.Dataset` object from tfrecords files.
19 |   + `pascal_tf_dataset_local_file.py`: get the training pascal `tf.data.Dataset` object from local files.
20 |   + `coco_tf_dataset_generator.py`: get the training coco `tf.data.Dataset` object.
21 |   + `eval_pascal_tf_dataset.py`: get the eval pascal `tf.data.Dataset` object.
22 | + `object_detection/evaluation`:
23 |   + `detectron_pascal_evaluation_utils.py`: copied from `Detectron`; evaluates pascal with local detection result files.
24 |   + `pascal_eval_files_utils.py`: generate local detection result files.
25 |   + `pascal_voc_map_utils.py`: compute pascal mAP results.
26 | + `object_detection/model`:
27 |   + `faster_rcnn`:
28 |     + `base_faster_rcnn_model.py`: base class for faster rcnn.
29 |     + `vgg16_faster_rcnn.py`: vgg16 faster rcnn model.
30 |     + `resnet_faster_rcnn.py`: resnet faster rcnn model.
31 |   + `fpn`:
32 |     + `base_fpn_model.py`: base class for fpn.
33 |     + `resnet_fpn.py`: resnet fpn model.
34 |   + `model_factory.py`: factory for model creation.
35 |   + `anchor_target.py`: generate anchor targets for rpn training.
36 |   + `losses.py`: smooth l1 loss & cross entropy loss.
37 |   + `prediction.py`: generate predictions after the roi head.
38 |   + `proposal_target.py`: generate proposal targets for roi training.
39 |   + `region_proposal.py`: generate region proposals for both training & testing.
40 |   + `roi_pooling.py`: roi pooling.
41 | + `object_detection/protos`: protobuf source files.
42 |   + `protoc ./object_detection/protos/*.proto --python_out=./object_detection/protos/`
43 | + `object_detection/utils`:
44 |   + `anchor_generator.py`: generate anchors.
45 |   + `bbox_np.py`: IoU computation, bbox range filter and bbox clip filter in numpy.
46 |   + `bbox_tf.py`: IoU computation, bbox range filter and bbox clip filter in tensorflow.
47 |   + `bbox_transform.py`: convert between bbox `(xmin, ymin, xmax, ymax)` and pred `(tx, ty, tw, th)`.
48 |   + `visual_utils.py`: draw bboxes in an image.
49 |   + `pytorch_to_tf.py`: convert a pytorch model to a pickle map.
50 | 
51 | 
52 | ---
53 | 
54 | 
55 | ## 2. TODO
56 | 
57 | ### 2.1. dataset
58 | + [x] pascal training dataset.
59 | + [x] pascal evaluating dataset.
60 | + [x] coco training dataset.
61 | + [x] coco evaluating dataset.
62 | 
63 | ### 2.2. model
64 | + [x] faster rcnn
65 | + [x] fpn
66 | + [ ] mask rcnn
67 | 
68 | ### 2.3. training & evaluating
69 | + [ ] use `defun` in all components.
70 | + [ ] multi gpu support.
71 | 
72 | ### 2.4. others
73 | + [x] BUG: after a few epochs, GPU memory usage suddenly doubles (see tensorflow issue #27288).
74 | + [ ] jupyter samples.
75 | + [ ] include the global step when restoring variables.
76 | 
77 | ---
78 | 
79 | ## 3. training records
80 | 
81 | ### 3.1. VOC Pascal 2007 trainval & test
82 | | Models | mAP |
83 | |:------:|:-----:|
84 | |vgg16 tf-faster-rcnn(source)|0.708|
85 | |vgg16 tf-faster-rcnn(load pre-trained model)|0.7106|
86 | |**vgg16 faster rcnn typical configs**|0.6935/0.6869/0.6751|
87 | |**resnet50 faster rcnn typical configs**|0.7294/0.7304|
88 | |resnet101 faster rcnn tf-faster-rcnn(source)|0.757|
89 | |resnet101 faster rcnn tf-faster-rcnn(load pre-trained model)|0.7578|
90 | |**resnet101 faster rcnn typical configs**|0.7456/0.7303/0.7247/0.7261|
91 | |resnet50 fpn FPN_Tensorflow(source)|0.7426|
92 | |resnet50 fpn FPN_Tensorflow(load pre-trained model)|0.7430|
93 | |**resnet50 fpn typical configs**|0.7465/0.7377/0.7392|
94 | |resnet101 fpn FPN_Tensorflow(source)|0.7614|
95 | |**resnet101 fpn typical configs**|0.7604/0.7618/0.7599|
96 | 
97 | ### 3.2. COCO 2014 minival
98 | | Models | mAP |
99 | |:------:|:-----:|
100 | |vgg16 tf-faster-rcnn(source)|0.302|
101 | |vgg16 tf-faster-rcnn(load pre-trained model)|0.302|
102 | |resnet50 tf-faster-rcnn(source)|0.324|
103 | |resnet50 tf-faster-rcnn(load pre-trained model)|0.324|
104 | 
105 | 
106 | ---
107 | 
108 | ## 4. Tutorial (optional)
109 | + training on the pascal voc 2007 trainval set, evaluating on the pascal voc 2007 test set.
110 |   + Step 0: generate the python protos by `protoc ./object_detection/protos/*.proto --python_out=./object_detection/protos/`.
111 |   + Step 1: set the configs and run `python scripts/generate_pascal_tf_records.py` to generate the trainval tfrecords.
112 |   + Step 2: train with `python scripts/train.py`; logs are written to `/path/to/logs_dir/`.
113 |   + Step 3: evaluate with `python scripts/eval_pascal.py /path/to/logs_dir/ckpt`.
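114 | + (optional) Step 4: sanity-check the resolved config in a python shell. The snippet below is only a minimal sketch (it is not one of the scripts above); the function and the key values are taken from `config_factory.py` and `faster_rcnn_config.py` in this repo:
115 | 
116 | ```python
117 | from object_detection.config.config_factory import config_factory
118 | 
119 | # default faster rcnn config for pascal voc
120 | config = config_factory('pascal', 'faster_rcnn')
121 | 
122 | print(config['num_classes'])     # 21 (20 voc classes + background)
123 | print(config['image_min_size'])  # 600
124 | print(config['image_max_size'])  # 1000
125 | print(config['ratios'])          # [0.5, 1.0, 2.0]
126 | ```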
--------------------------------------------------------------------------------
/object_detection/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/irvingzhang0512/tf_eager_object_detection/e977217cb9d5fc2975292d55b1e5cae484f9e0dd/object_detection/config/__init__.py
--------------------------------------------------------------------------------
/object_detection/config/config_factory.py:
--------------------------------------------------------------------------------
1 | 
2 | def config_factory(data_type, model_type):
3 |     if model_type == 'faster_rcnn':
4 |         if data_type == 'pascal':
5 |             from object_detection.config.faster_rcnn_config import PASCAL_CONFIG
6 |             return PASCAL_CONFIG
7 |         elif data_type == 'coco':
8 |             from object_detection.config.faster_rcnn_config import COCO_CONFIG
9 |             return COCO_CONFIG
10 |     elif model_type == 'fpn':
11 |         if data_type == 'pascal':
12 |             from object_detection.config.fpn_config import PASCAL_CONFIG
13 |             return PASCAL_CONFIG
14 | 
15 |     raise ValueError('config for dataset type {} and model type {} doesn\'t exist'.format(data_type, model_type))
--------------------------------------------------------------------------------
/object_detection/config/faster_rcnn_config.py:
--------------------------------------------------------------------------------
1 | def get_default_pascal_faster_rcnn_config():
2 |     return {
3 |         # vgg16
4 |         'vgg16_roi_feature_size': (7, 7, 512),
5 |         'roi_head_keep_dropout_rate': 0.5,
6 |         'vgg16_roi_pooling_max_pooling_flag': True,
7 | 
8 |         # resnet
9 |         'resnet_roi_feature_size': (7, 7, 1024),
10 |         'resnet_roi_pooling_max_pooling_flag': False,
11 | 
12 |         # base configs
13 |         'num_classes': 21,
14 |         'weight_decay': 0.0001,
15 | 
16 |         # anchors configs
17 |         'ratios': [0.5, 1.0, 2.0],
18 |         'scales': [8, 16, 32],
19 |         'extractor_stride': 16,
20 | 
21 |         # training configs
22 |         'learning_rate_multi_decay_steps': [80000],  # 50000 for pascal 2007, 80000 for pascal 0712
23 |         'learning_rate_multi_lrs': [1e-3, 1e-4],
24 |         'learning_rate_bias_double': True,
25 |         'optimizer_momentum': 0.9,
26 |         'epochs': 8,  # 14 for pascal 2007, 8 for pascal 0712
27 | 
28 |         # preprocessing configs
29 |         'image_max_size': 1000,
30 |         'image_min_size': 600,
31 |         'bgr_pixel_means': [103.939, 116.779, 123.68],
32 |         # 'bgr_pixel_means': [102.9801, 115.9465, 122.7717],  # for tf-faster-rcnn
33 | 
34 |         # predict & evaluate configs
35 |         'evaluate_iou_threshold': 0.5,  # used when computing mAP: a pred is a TP if its IoU with a gt box exceeds this threshold, otherwise an FP
36 |         'max_objects_per_class_per_image': 50,
37 |         'max_objects_per_image': 50,
38 |         'prediction_nms_iou_threshold': 0.3,
39 |         'prediction_score_threshold': 0.0,
40 |         'show_image_score_threshold': 0.3,  # for image visualization
41 | 
42 |         # anchor target & region proposal
43 |         'rpn_proposal_means': [0, 0, 0, 0],
44 |         'rpn_proposal_stds': [1.0, 1.0, 1.0, 1.0],
45 | 
46 |         # anchor target
47 |         'rpn_sigma': 3.0,
48 |         'rpn_pos_iou_threshold': 0.7,
49 |         'rpn_neg_iou_threshold': 0.3,
50 |         'rpn_total_sample_number': 256,
51 |         'rpn_pos_sample_max_number': 128,
52 | 
53 |         # region proposal
54 |         'rpn_proposal_train_pre_nms_sample_number': 12000,
55 |         'rpn_proposal_train_after_nms_sample_number': 2000,
56 |         'rpn_proposal_test_pre_nms_sample_number': 6000,
57 |         'rpn_proposal_test_after_nms_sample_number': 300,
58 |         'rpn_proposal_nms_iou_threshold': 0.7,
59 | 
60 |         # proposal target & prediction
61 |         'roi_proposal_means': [0, 0, 0, 0],
62 |         'roi_proposal_stds': [0.1, 0.1, 0.2, 0.2],
63 | 
64 |         # roi pooling
65 |         'roi_pooling_size': 7,
66 | 
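        # note: 'rpn_sigma' above and 'roi_sigma' below set the sigma of the smooth l1
        # loss (as in py-faster-rcnn): 0.5 * (sigma * x)^2 if |x| < 1 / sigma^2,
        # otherwise |x| - 0.5 / sigma^2; see `losses.py`.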
67 |         # proposal target
68 |         'roi_sigma': 1.0,
69 |         'roi_pos_iou_threshold': 0.5,
70 |         'roi_neg_iou_threshold': 0.,
71 |         'roi_total_sample_number': 128,
72 |         'roi_pos_sample_max_number': 32,
73 | 
74 |     }
75 | 
76 | 
77 | def get_default_coco_faster_rcnn_config():
78 |     return {
79 |         # vgg16
80 |         'vgg16_roi_feature_size': (7, 7, 512),
81 |         'roi_head_keep_dropout_rate': 0.5,
82 |         'vgg16_roi_pooling_max_pooling_flag': True,
83 | 
84 |         # resnet
85 |         'resnet_roi_feature_size': (7, 7, 1024),
86 |         'resnet_roi_pooling_max_pooling_flag': False,
87 | 
88 |         # base configs
89 |         'num_classes': 81,
90 |         'weight_decay': 0.0001,
91 | 
92 |         # anchors configs
93 |         'ratios': [0.5, 1.0, 2.0],
94 |         'scales': [4, 8, 16, 32],
95 |         'extractor_stride': 16,
96 | 
97 |         # training configs
98 |         'learning_rate_multi_decay_steps': [350000],
99 |         'learning_rate_multi_lrs': [1e-3, 1e-4],
100 |         'learning_rate_bias_double': True,
101 |         'optimizer_momentum': 0.9,
102 |         'epochs': 6,
103 | 
104 |         # preprocessing configs
105 |         'image_max_size': 1000,
106 |         'image_min_size': 600,
107 |         # 'bgr_pixel_means': [103.939, 116.779, 123.68],
108 |         'bgr_pixel_means': [102.9801, 115.9465, 122.7717],  # for tf-faster-rcnn
109 | 
110 |         # predict & evaluate configs
111 |         'evaluate_iou_threshold': 0.5,  # used when computing mAP: a pred is a TP if its IoU with a gt box exceeds this threshold, otherwise an FP
112 |         'max_objects_per_class_per_image': 100,
113 |         'max_objects_per_image': 100,
114 |         'prediction_nms_iou_threshold': 0.3,
115 |         'prediction_score_threshold': 0.0,
116 |         'show_image_score_threshold': 0.3,  # for image visualization
117 | 
118 |         # anchor target & region proposal
119 |         'rpn_proposal_means': [0, 0, 0, 0],
120 |         'rpn_proposal_stds': [1.0, 1.0, 1.0, 1.0],
121 | 
122 |         # anchor target
123 |         'rpn_sigma': 3.0,
124 |         'rpn_pos_iou_threshold': 0.7,
125 |         'rpn_neg_iou_threshold': 0.3,
126 |         'rpn_total_sample_number': 256,
127 |         'rpn_pos_sample_max_number': 128,
128 | 
129 |         # region proposal
130 |         'rpn_proposal_train_pre_nms_sample_number': 12000,
131 |         'rpn_proposal_train_after_nms_sample_number': 2000,
132 |         'rpn_proposal_test_pre_nms_sample_number': 6000,
133 |         'rpn_proposal_test_after_nms_sample_number': 300,
134 |         'rpn_proposal_nms_iou_threshold': 0.7,
135 | 
136 |         # proposal target & prediction
137 |         'roi_proposal_means': [0, 0, 0, 0],
138 |         'roi_proposal_stds': [0.1, 0.1, 0.2, 0.2],
139 | 
140 |         # roi pooling
141 |         'roi_pooling_size': 7,
142 | 
143 |         # proposal target
144 |         'roi_sigma': 1.0,
145 |         'roi_pos_iou_threshold': 0.5,
146 |         'roi_neg_iou_threshold': 0.,
147 |         'roi_total_sample_number': 128,
148 |         'roi_pos_sample_max_number': 32,
149 |     }
150 | 
151 | 
152 | PASCAL_CONFIG = get_default_pascal_faster_rcnn_config()
153 | COCO_CONFIG = get_default_coco_faster_rcnn_config()
154 | 
--------------------------------------------------------------------------------
/object_detection/config/fpn_config.py:
--------------------------------------------------------------------------------
1 | def get_default_pascal_fpn_config():
2 |     return {
3 |         # backbone-specific configs
4 |         'resnet_roi_feature_size': [7, 7, 256],
5 |         'roi_head_keep_dropout_rate': 0.5,
6 | 
7 |         # base configs
8 |         'num_classes': 21,
9 | 
10 |         # fpn-specific configs
11 |         'level_name_list': ['p2', 'p3', 'p4', 'p5', 'p6'],
12 |         'min_level': 2,
13 |         'max_level': 5,
14 |         'top_down_dims': 256,
15 | 
16 |         # preprocessing configs
17 |         'image_max_size': 1000,
18 |         'image_min_size': 600,
19 |         'bgr_pixel_means': [103.939, 116.779, 123.68],
20 | 
21 |         # predict & evaluate configs
22 |         'evaluate_iou_threshold': 0.5,  # used when computing mAP: a pred is a TP if its IoU with a gt box exceeds this threshold, otherwise an FP
23 |         'max_objects_per_class_per_image': 50,
24 |         'max_objects_per_image': 50,
25 |         'prediction_nms_iou_threshold': 0.3,
26 |         'prediction_score_threshold': 0.0,
27 |         'show_image_score_threshold': 0.3,  # for image visualization
28 | 
29 |         # anchors configs
30 |         'ratios': [0.5, 1.0, 2.0],
31 |         'scales': [1.],
32 |         'anchor_stride_list': [4, 8, 16, 32, 64],
33 |         'base_anchor_size_list': [32, 64, 128, 256, 512],
34 | 
35 |         # training configs
36 |         'learning_rate_multi_decay_steps': [60000, 80000],
37 |         'learning_rate_multi_lrs': [1e-3, 1e-4, 1e-5],
38 |         'optimizer_momentum': 0.9,
39 |         'learning_rate_bias_double': False,
40 |         'weight_decay': 0.0001,
41 |         'epochs': 30,
42 | 
43 |         # rpn net configs
44 |         'rpn_proposal_means': [0, 0, 0, 0],
45 |         'rpn_proposal_stds': [1.0, 1.0, 1.0, 1.0],
46 |         'rpn_sigma': 3.0,
47 |         'rpn_pos_iou_threshold': 0.7,
48 |         'rpn_neg_iou_threshold': 0.3,
49 |         'rpn_total_sample_number': 256,
50 |         'rpn_pos_sample_max_number': 128,
51 |         'rpn_proposal_train_pre_nms_sample_number': 12000,
52 |         'rpn_proposal_train_after_nms_sample_number': 2000,
53 |         'rpn_proposal_test_pre_nms_sample_number': 6000,
54 |         'rpn_proposal_test_after_nms_sample_number': 1000,
55 |         'rpn_proposal_nms_iou_threshold': 0.7,
56 | 
57 |         'roi_pooling_size': 7,
58 |         'roi_pooling_max_pooling_flag': True,
59 | 
60 |         # roi net configs
61 |         'roi_proposal_means': [0, 0, 0, 0],
62 |         'roi_proposal_stds': [0.1, 0.1, 0.2, 0.2],
63 |         'roi_sigma': 1.0,
64 |         'roi_pos_iou_threshold': 0.5,
65 |         'roi_neg_iou_threshold': 0.,
66 |         'roi_total_sample_number': 256,
67 |         'roi_pos_sample_max_number': 64,
68 | 
69 |     }
70 | 
71 | 
72 | PASCAL_CONFIG = get_default_pascal_fpn_config()
--------------------------------------------------------------------------------
/object_detection/dataset/README.md:
--------------------------------------------------------------------------------
1 | # Dataset Module
2 | + target: generate `tf.data.Dataset` objects for object detection tasks.
3 | + structure:
4 |   + training dataset
5 |   + eval dataset
6 |   + utils
7 | 
8 | ---
9 | 
10 | ## 1. training dataset
11 | 
12 | ### 1.1. iter features
13 | + every iteration of a training set yields 3 features: preprocessed image, bboxes and labels.
14 | + preprocessed image:
15 |   + dtype: `tf.float32`
16 |   + shape: `[1, None, None, 3]`
17 |   + PS: `bgr` format.
18 | + bboxes:
19 |   + dtype: `tf.float32`
20 |   + shape: `[1, None, 4]`
21 |   + format: `ymin, xmin, ymax, xmax`
22 |   + range: `[0, image_height - 1]` or `[0, image_width - 1]`
23 | + labels:
24 |   + dtype: `tf.int32` or `tf.int64`
25 |   + shape: `[1, None,]`
26 | 
27 | ### 1.2. data flow
28 | + input: rgb uint8 raw image.
29 | + data augmentation:
30 |   + random flip left/right.
31 |   + resize image with min_edge and max_edge.
32 | + preprocessing (one of the following methods):
33 |   + method 1 (caffe): convert `rgb` to `bgr`, then subtract the imagenet means.
34 |   + method 2 (tf): convert `[0, 255]` to `[-1, 1]`
35 | 
36 | ---
37 | 
38 | ## 2. eval dataset
39 | 
40 | ### 2.1. iter features
41 | + every iteration of an eval set yields the following features: preprocessed image, image scale, image raw height, image raw width, and (for coco) image_id.
42 | + preprocessed image:
43 |   + dtype: `tf.float32`
44 |   + shape: `[1, None, None, 3]`
45 |   + PS: `bgr` or `rgb`
46 | + image scale:
47 |   + dtype: `tf.float64`
48 |   + shape: `[1,]`
49 | + image height:
50 |   + dtype: `tf.int32` or `tf.int64`
51 |   + shape: `[1,]`
52 | + image width:
53 |   + dtype: `tf.int32` or `tf.int64`
54 |   + shape: `[1,]`
55 | + image_id:
56 |   + dtype: `tf.int32`
57 |   + shape: `[1,]`
58 |   + PS: for the COCO dataset only; used by the coco eval tools.
59 | 
60 | ### 2.2. data flow
61 | + input: rgb uint8 raw image.
62 | + resize image with min_edge and max_edge.
63 | + preprocessing (one of the following methods):
64 |   + method 1 (caffe): convert `rgb` to `bgr`, then subtract the imagenet means.
65 |   + method 2 (tf): convert `[0, 255]` to `[-1, 1]`
66 | + convert `rgb` to `bgr` if necessary.
--------------------------------------------------------------------------------
/object_detection/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/irvingzhang0512/tf_eager_object_detection/e977217cb9d5fc2975292d55b1e5cae484f9e0dd/object_detection/dataset/__init__.py
--------------------------------------------------------------------------------
/object_detection/dataset/coco_tf_dataset_generator.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import tensorflow as tf
4 | from functools import partial
5 | from pycocotools.coco import COCO
6 | 
7 | from object_detection.dataset.utils.tf_dataset_utils import image_argument_with_imgaug, preprocessing_training_func, \
8 |     preprocessing_eval_func
9 | 
10 | _COCO_TRAIN_DATASET = None
11 | _COCO_VAL_DATASET = None
12 | _COCO_TEST_DATASET = None
13 | 
14 | 
15 | def _get_global_dataset(mode, year, root_dir):
16 |     global _COCO_TRAIN_DATASET, _COCO_VAL_DATASET, _COCO_TEST_DATASET
17 |     if mode not in ['train', 'val', 'test', 'minival']:
18 |         raise ValueError('unknown mode {}'.format(mode))
19 |     if mode == 'train':
20 |         if _COCO_TRAIN_DATASET is None:
21 |             _COCO_TRAIN_DATASET = CocoDataset(root_dir=root_dir, sub_dir=mode, year=year)
22 |         coco_dataset = _COCO_TRAIN_DATASET
23 |     elif mode == 'val':
24 |         if _COCO_VAL_DATASET is None:
25 |             _COCO_VAL_DATASET = CocoDataset(root_dir=root_dir, sub_dir=mode, year=year)
26 |         coco_dataset = _COCO_VAL_DATASET
27 |     else:
28 |         if _COCO_TEST_DATASET is None:
29 |             _COCO_TEST_DATASET = CocoDataset(root_dir=root_dir, sub_dir=mode, year=year)
30 |         coco_dataset = _COCO_TEST_DATASET
31 |     return coco_dataset
32 | 
33 | 
34 | class CocoDataset:
35 |     def __init__(self, root_dir='/ssd/zhangyiyang/COCO2017', sub_dir='train', year="2017",
36 |                  min_edge=32, ):
37 |         if sub_dir not in ['train', 'val', 'minival']:
38 |             raise ValueError('unknown sub dir {}'.format(sub_dir))
39 |         if year not in ['2014', '2017']:
40 |             raise ValueError('unknown year dir {}'.format(year))
41 | 
42 |         annotation_file_path = os.path.join(root_dir, 'annotations', 'instances_{}{}.json'.format(sub_dir, year))
43 |         if sub_dir == 'minival':
44 |             sub_dir = 'val'
45 |         self._image_dir = os.path.join(root_dir, sub_dir + year)
46 | 
47 |         self._coco = COCO(annotation_file=annotation_file_path)
48 |         self._get_cat_id_name_dict()
49 |         self._img_ids, self._img_info_dict = self._filter_images(min_edge=min_edge)
50 | 
51 |     @property
52 |     def img_ids(self):
53 |         return self._img_ids
54 | 
55 |     @property
56 |     def img_info_dict(self):
57 |         return self._img_info_dict
58 | 
59 |     @property
60 |     def cat_id_to_name_dict(self):
61 |         return self._cat_id_to_name_dict
62 | 
63 |     @property
64 |     def name_to_cat_id_dict(self):
65 |         return self._name_to_cat_id_dict
66 | 
67 |     @property
68 |     def cat_id_to_raw_id(self):
69 |         return self._cat_id_to_raw_id
70 | 
71 |     @property
72 |     def raw_id_to_cat_id(self):
73 |         return self._raw_id_to_cat_id
74 | 
75 |     def _get_cat_id_name_dict(self):
76 |         cat_ids = self._coco.getCatIds()
77 |         cat_id_to_name = {0: 'background'}
78 |         name_to_cat_id = {'background': 0}
79 |         cat_id_to_raw_id = {}
80 |         raw_id_to_cat_id = {}
81 |         for idx, cat_id in enumerate(cat_ids):
82 |             cat_name = self._coco.loadCats(cat_id)[0]['name']
83 |             cat_id_to_name[cat_id] = cat_name
84 |             name_to_cat_id[cat_name] = cat_id
85 |             cat_id_to_raw_id[cat_id] = idx + 1
86 |             raw_id_to_cat_id[idx + 1] = cat_id
87 |         self._cat_id_to_name_dict = cat_id_to_name
88 |         self._name_to_cat_id_dict = name_to_cat_id
89 |         self._cat_id_to_raw_id = cat_id_to_raw_id
90 |         self._raw_id_to_cat_id = raw_id_to_cat_id
91 | 
92 |     def _filter_images(self, min_edge):
93 |         all_img_ids = list(set([_['image_id'] for _ in self._coco.anns.values()]))
94 |         img_ids = []
95 |         img_info_dict = {}
96 |         for i in all_img_ids:
97 |             info = self._coco.loadImgs(i)[0]
98 | 
99 |             ann_ids = self._coco.getAnnIds(imgIds=i)
100 |             ann_info = self._coco.loadAnns(ann_ids)
101 |             _, labels, _ = self._parse_ann_info(ann_info)
102 | 
103 |             if min(info['width'], info['height']) >= min_edge and labels.shape[0] != 0:
104 |                 img_ids.append(i)
105 |                 img_info_dict[i] = info
106 |         return img_ids, img_info_dict
107 | 
108 |     def _parse_ann_info(self, ann_infos):
109 |         """Parse the bbox annotations of one image.
110 | 
111 |         Args
112 |         ---
113 |         ann_infos (list[dict]): annotation infos of an image.
114 | 
115 |         Returns
116 |         ---
117 |         tuple: (gt_bboxes, gt_labels, gt_labels_text), where gt_bboxes is a
118 |             float32 array of shape [num_boxes, 4] in (ymin, xmin, ymax, xmax)
119 |             order, gt_labels holds int64 class ids and gt_labels_text the names.
120 |         """
121 |         gt_bboxes = []
122 |         gt_labels = []
123 |         gt_labels_text = []
124 | 
125 |         for i, ann in enumerate(ann_infos):
126 |             if ann.get('ignore', False):
127 |                 continue
128 |             x1, y1, w, h = ann['bbox']
129 |             if ann['area'] <= 0 or w < 1 or h < 1:
130 |                 continue
131 |             bbox = [y1, x1, y1 + h - 1., x1 + w - 1.]
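            # (the coco annotation stores a bbox as [x, y, w, h]; it is converted
            # above to the inclusive [ymin, xmin, ymax, xmax] order used throughout
            # this repo)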
132 |             gt_bboxes.append(bbox)
133 |             gt_labels.append(self._cat_id_to_raw_id[ann['category_id']])
134 |             gt_labels_text.append(self._cat_id_to_name_dict[ann['category_id']])
135 | 
136 |         if gt_bboxes:
137 |             gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
138 |             gt_labels = np.array(gt_labels, dtype=np.int64)
139 |             gt_labels_text = np.array(gt_labels_text, dtype=np.string_)
140 |         else:
141 |             gt_bboxes = np.zeros((0, 4), dtype=np.float32)
142 |             gt_labels = np.array([], dtype=np.int64)
143 |             gt_labels_text = np.array([], dtype=np.string_)
144 | 
145 |         return gt_bboxes, gt_labels, gt_labels_text
146 | 
147 |     def __getitem__(self, img_id):
148 |         # fetch the annotation info dicts of this image
149 |         ann_ids = self._coco.getAnnIds(imgIds=img_id)
150 |         ann_infos = self._coco.loadAnns(ann_ids)
151 |         gt_bboxes, gt_labels, _ = self._parse_ann_info(ann_infos)
152 | 
153 |         # normalize bboxes to the range [0, 1]
154 |         image_height, image_width = self._img_info_dict[img_id]['height'], self._img_info_dict[img_id]['width']
155 |         gt_bboxes[:, ::2] = gt_bboxes[:, ::2] / image_height
156 |         gt_bboxes[:, 1::2] = gt_bboxes[:, 1::2] / image_width
157 | 
158 |         file_path = os.path.join(self._image_dir, self._img_info_dict[img_id]['file_name'])
159 |         return file_path, gt_bboxes, image_height, image_width, gt_labels
160 | 
161 | 
162 | def get_training_dataset(root_dir='D:\\data\\COCO2017',
163 |                          mode='train', year="2017",
164 |                          min_size=600, max_size=1000,
165 |                          preprocessing_type='caffe', caffe_pixel_means=None,
166 |                          batch_size=1,
167 |                          repeat=1,
168 |                          shuffle=False, shuffle_buffer_size=1000,
169 |                          prefetch=False, prefetch_buffer_size=1000,
170 |                          argument=True, iaa_sequence=None):
171 |     coco_dataset = _get_global_dataset(mode, year, root_dir)
172 | 
173 |     def _parse_coco_data_py(img_id):
174 |         file_path, gt_bboxes, image_height, image_width, gt_labels = coco_dataset[img_id]
175 |         return file_path, gt_bboxes, image_height, image_width, gt_labels
176 | 
177 |     tf_dataset = tf.data.Dataset.from_tensor_slices(coco_dataset.img_ids).map(
178 |         lambda img_id: tuple([*tf.py_func(_parse_coco_data_py, [img_id],
179 |                                           [tf.string, tf.float32, tf.int64, tf.int64, tf.int64])])
180 |     )
181 |     tf_dataset = tf_dataset.map(
182 |         lambda file_path, gt_bboxes, image_height, image_width, gt_labels: tuple([
183 |             tf.image.decode_jpeg(tf.io.read_file(file_path), channels=3),
184 |             gt_bboxes, image_height, image_width, gt_labels
185 |         ])
186 |     )
187 | 
188 |     if argument:
189 |         image_argument_partial = partial(image_argument_with_imgaug, iaa_sequence=iaa_sequence)
190 |         tf_dataset = tf_dataset.map(
191 |             lambda image, bboxes, image_height, image_width, labels: tuple([
192 |                 *tf.py_func(image_argument_partial, [image, bboxes], [image.dtype, bboxes.dtype]),
193 |                 image_height, image_width, labels]),
194 |             num_parallel_calls=5
195 |         )
196 | 
197 |     preprocessing_partial_func = partial(preprocessing_training_func,
198 |                                          min_size=min_size, max_size=max_size,
199 |                                          preprocessing_type=preprocessing_type, caffe_pixel_means=caffe_pixel_means)
200 | 
201 |     tf_dataset = tf_dataset.batch(batch_size=batch_size).map(preprocessing_partial_func, num_parallel_calls=5)
202 | 
203 |     if shuffle:
204 |         tf_dataset = tf_dataset.shuffle(buffer_size=shuffle_buffer_size)
205 |     if prefetch:
206 |         tf_dataset = tf_dataset.prefetch(buffer_size=prefetch_buffer_size)
207 | 
208 |     return tf_dataset.repeat(repeat)
209 | 
210 | 
211 | def get_eval_dataset(root_dir='D:\\data\\COCO2017',
212 |                      mode='train', year='2017',
213 |                      min_size=600, max_size=1000,
214 |                      preprocessing_type='caffe', caffe_pixel_means=None,
215 |                      batch_size=1,
216 |                      repeat=1, ):
217 |     coco_dataset = _get_global_dataset(mode, year, root_dir)
218 | 
219 |     preprocessing_partial_func = partial(preprocessing_eval_func,
220 |                                          min_size=min_size, max_size=max_size,
221 |                                          preprocessing_type=preprocessing_type, caffe_pixel_means=caffe_pixel_means)
222 | 
223 |     def _parse_coco_data_py(img_id):
224 |         file_path, _, img_height, img_width, _ = coco_dataset[img_id]
225 |         img = tf.image.decode_jpeg(tf.io.read_file(file_path), channels=3)
226 |         return img, img_height, img_width, img_id
227 | 
228 |     def _preprocessing_after_batch(img, img_height, img_width, img_id):
229 |         img, img_scale, img_height, img_width = preprocessing_partial_func(img, img_height, img_width)
230 |         return img, img_scale, img_height, img_width, img_id[0]
231 | 
232 |     tf_dataset = tf.data.Dataset.from_tensor_slices(coco_dataset.img_ids).map(
233 |         lambda img_id: tuple([*tf.py_func(_parse_coco_data_py, [img_id],
234 |                                           [tf.uint8, tf.int64, tf.int64, tf.int32])])
235 |     ).batch(batch_size).map(_preprocessing_after_batch)
236 | 
237 |     return tf_dataset.repeat(repeat)
--------------------------------------------------------------------------------
/object_detection/dataset/dataset_factory.py:
--------------------------------------------------------------------------------
1 | from object_detection.dataset.coco_tf_dataset_generator import get_training_dataset as get_coco_train_dataset
2 | from object_detection.dataset.coco_tf_dataset_generator import get_eval_dataset as get_coco_eval_dataset
3 | from object_detection.dataset.pascal_tf_dataset_generator import get_dataset as get_pascal_train_dataset
4 | from object_detection.dataset.eval_pascal_tf_dataset import get_dataset_by_local_file as get_pascal_eval_dataset
5 | 
6 | 
7 | def dataset_factory(dataset_type, mode, configs):
8 |     if dataset_type == 'pascal':
9 |         if mode == 'train':
10 |             return get_pascal_train_dataset(**configs)
11 |         elif mode == 'test':
12 |             return get_pascal_eval_dataset('test', **configs)
13 |         raise ValueError('unknown mode {} for dataset type {}'.format(mode, dataset_type))
14 | 
15 |     if dataset_type == 'coco':
16 |         if mode == 'train':
17 |             return get_coco_train_dataset(**configs)
18 |         elif mode == 'val':
19 |             return get_coco_eval_dataset(**configs)
20 |         raise ValueError('unknown mode {} for dataset type {}'.format(mode, dataset_type))
21 | 
22 |     raise ValueError('unknown dataset type {}'.format(dataset_type))
--------------------------------------------------------------------------------
/object_detection/dataset/eval_pascal_tf_dataset.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import cv2
4 | import os
5 | from functools import partial
6 | 
7 | 
8 | __all__ = ['get_dataset_by_tf_records', 'get_dataset_by_local_file']
9 | 
10 | 
11 | def get_dataset_by_local_file(mode, root_path, image_format='bgr',
12 |                               preprocessing_type='caffe', caffe_pixel_means=None,
13 |                               min_edge=600, max_edge=1000):
14 |     """
15 |     Read the image id list from /path/to/VOC2007 or VOC2012/ImageSets/Main/{}.txt, then load the images with cv2.
16 |     :param mode:
17 |     :param root_path:
18 |     :param image_format:
19 |     :param caffe_pixel_means:
20 |     :param preprocessing_type:
21 |     :param min_edge:
22 |     :param max_edge:
23 |     :return:
24 |     """
25 |     if image_format not in ['rgb', 'bgr']:
26 |         raise ValueError('unknown image format {}'.format(image_format))
27 |     with open(os.path.join(root_path, 'ImageSets', 'Main', '%s.txt' % mode), 'r') as f:
28 |         lines = f.readlines()
29 |     examples_list = [line.strip() for line in lines]
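    # each line of ImageSets/Main/<mode>.txt is a bare image id (e.g. '000005');
    # the corresponding image lives at JPEGImages/<id>.jpg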
30 |     img_dir = os.path.join(root_path, 'JPEGImages')
31 | 
32 |     def _map_from_cv2(example):
33 |         example = example.decode()
34 |         img_file_path = os.path.join(img_dir, example + '.jpg')
35 |         img = cv2.imread(img_file_path).astype(np.float32)
36 |         if preprocessing_type == 'caffe':
37 |             img -= np.array([[caffe_pixel_means]])
38 |         elif preprocessing_type == 'tf':
39 |             img = img / 255.0 * 2.0 - 1.0
40 |         else:
41 |             raise ValueError('unknown preprocessing type {}'.format(preprocessing_type))
42 |         h, w, _ = img.shape
43 |         scale1 = min_edge / min(h, w)
44 |         scale2 = max_edge / max(h, w)
45 |         scale = min(scale1, scale2)
46 |         new_h = int(scale * h)
47 |         new_w = int(scale * w)
48 | 
49 |         img = cv2.resize(img, (new_w, new_h))
50 |         if image_format == 'rgb':
51 |             img = img[..., ::-1]
52 |         return img, float(scale), h, w
53 | 
54 |     dataset = tf.data.Dataset.from_tensor_slices(examples_list).map(
55 |         lambda example: tf.py_func(_map_from_cv2,
56 |                                    [example],
57 |                                    [tf.float32, tf.float64, tf.int64, tf.int64]  # linux
58 |                                    # [tf.float32, tf.float64, tf.int32, tf.int32]  # windows
59 |                                    )
60 |     ).batch(1)
61 | 
62 |     return dataset, examples_list
63 | 
64 | 
65 | def _caffe_preprocessing(image, pixel_means):
66 |     """
67 |     Take a uint8 RGB image, convert it to tf.float32 BGR and subtract the imagenet means.
68 |     :param image:
69 |     :return:
70 |     """
71 |     image = tf.to_float(image)
72 |     image = tf.reverse(image, axis=[-1])
73 |     channels = tf.split(axis=-1, num_or_size_splits=3, value=image)
74 |     for i in range(3):
75 |         channels[i] -= pixel_means[i]
76 |     return tf.concat(axis=-1, values=channels)
77 | 
78 | 
79 | def _tf_preprocessing(image):
80 |     """
81 |     Take a uint8 RGB image and convert it to tf.float32 RGB with values in [-1, 1].
82 |     :param image:
83 |     :return:
84 |     """
85 |     return tf.image.convert_image_dtype(image, dtype=tf.float32) * 2.0 - 1.0
86 | 
87 | 
88 | def get_dataset_by_tf_records(mode, root_path,
89 |                               preprocessing_type='caffe', caffe_pixel_means=None,
90 |                               min_edge=600, max_edge=1000):
91 |     with open(os.path.join(root_path, 'ImageSets', 'Main', '%s.txt' % mode), 'r') as f:
92 |         lines = f.readlines()
93 |     examples_list = [line.strip() for line in lines]
94 |     img_dir = os.path.join(root_path, 'JPEGImages')
95 |     example_path_list = [os.path.join(img_dir, example + '.jpg') for example in examples_list]
96 | 
97 |     def _map_from_tf_image(example_path):
98 |         img = tf.image.decode_jpeg(tf.io.read_file(example_path), channels=3)
99 |         if preprocessing_type == 'caffe':
100 |             preprocessing_fn = partial(_caffe_preprocessing, pixel_means=caffe_pixel_means)
101 |         elif preprocessing_type == 'tf':
102 |             preprocessing_fn = _tf_preprocessing
103 |         else:
104 |             raise ValueError('unknown preprocessing type {}'.format(preprocessing_type))
105 |         img = preprocessing_fn(img)
106 | 
107 |         # the static shape of a decoded jpeg is unknown, so use the dynamic shape here
108 |         img_shape = tf.shape(img)
109 |         h = tf.to_float(img_shape[0])
110 |         w = tf.to_float(img_shape[1])
111 |         scale1 = min_edge / tf.minimum(h, w)
112 |         scale2 = max_edge / tf.maximum(h, w)
113 |         scale = tf.minimum(scale1, scale2)
114 |         # resize_bilinear expects a 4-d batch, so add and then strip a batch dim
115 |         img = tf.image.resize_bilinear(img[tf.newaxis, ...],
116 |                                        [tf.to_int32(scale * h), tf.to_int32(scale * w)])[0]
117 |         return img, tf.to_double(scale), tf.to_int64(h), tf.to_int64(w)
118 | 
119 |     dataset = tf.data.Dataset.from_tensor_slices(example_path_list).map(_map_from_tf_image).batch(1)
120 | 
121 |     return dataset, examples_list
--------------------------------------------------------------------------------
/object_detection/dataset/pascal_tf_dataset_generator.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from functools import partial
3 | 
4 | from object_detection.dataset.utils.tf_dataset_utils import image_argument_with_imgaug, preprocessing_training_func
5 | 
6 | __all__ = ['get_dataset']
7 | 
8 | 
9 | def _parse_tf_records(serialized_example):
10 |     features = tf.parse_single_example(serialized_example,
11 |                                        features={'image/height': tf.FixedLenFeature([1], tf.int64),
12 |                                                  'image/width': tf.FixedLenFeature([1], tf.int64),
13 |                                                  'image/filename': tf.FixedLenFeature([1], tf.string),
14 |                                                  'image/encoded': tf.FixedLenFeature([1], tf.string),
15 |                                                  'image/object/bbox/xmin': tf.VarLenFeature(tf.float32),
16 |                                                  'image/object/bbox/xmax': tf.VarLenFeature(tf.float32),
17 |                                                  'image/object/bbox/ymin': tf.VarLenFeature(tf.float32),
18 |                                                  'image/object/bbox/ymax': tf.VarLenFeature(tf.float32),
19 |                                                  'image/object/class/label': tf.VarLenFeature(tf.int64),
20 |                                                  'image/object/class/text': tf.VarLenFeature(tf.string),
21 |                                                  }
22 |                                        )
23 |     features['image/object/bbox/xmin'] = tf.sparse_tensor_to_dense(features['image/object/bbox/xmin'])
24 |     features['image/object/bbox/xmax'] = tf.sparse_tensor_to_dense(features['image/object/bbox/xmax'])
25 |     features['image/object/bbox/ymin'] = tf.sparse_tensor_to_dense(features['image/object/bbox/ymin'])
26 |     features['image/object/bbox/ymax'] = tf.sparse_tensor_to_dense(features['image/object/bbox/ymax'])
27 |     features['image/object/class/label'] = tf.sparse_tensor_to_dense(features['image/object/class/label'])
28 |     image = tf.image.decode_jpeg(features['image/encoded'][0])
29 |     bboxes = tf.transpose(tf.stack((features['image/object/bbox/ymin'],
30 |                                     features['image/object/bbox/xmin'],
31 |                                     features['image/object/bbox/ymax'],
32 |                                     features['image/object/bbox/xmax'])), name='bboxes')
33 |     return image, bboxes, features['image/height'][0], features['image/width'][0], features['image/object/class/label']
34 | 
35 | 
36 | def get_dataset(tf_records_list,
37 |                 min_size=600, max_size=1000,
38 |                 preprocessing_type='caffe', caffe_pixel_means=None,
39 |                 batch_size=1, repeat=1,
40 |                 shuffle=False, shuffle_buffer_size=1000,
41 |                 prefetch=False, prefetch_buffer_size=1000,
42 |                 argument=True, iaa_sequence=None):
43 |     """
44 |     Build the training dataset. The pipeline is:
45 | 
46 |     1) read the raw examples from the tfrecords files;
47 |     2) apply the given iaa_sequence if augmentation is enabled;
48 |     3) normalize the data: convert uint8 to float, either into [0, 1] or by subtracting the pixel means;
49 |     4) shuffle;
50 |     5) prefetch;
51 |     6) batch;
52 |     7) repeat.
53 | 
54 |     The default augmentation sequence is:
55 |     ```
56 |     iaa_sequence = [
57 |         iaa.Fliplr(0.5),
58 |     ]
59 |     ```
60 |     i.e. 1) a random horizontal flip.
61 | 
62 |     When iterated, the dataset yields:
63 |     image, bboxes, labels
64 |     with dtypes tf.float32 (in [0, 1]), tf.float32 (in [0, edge length]), tf.int32 (in [0, num_classes])
65 |     and shapes [1, height, width, 3], [1, num_bboxes, 4], [num_bboxes].
66 | 
67 |     :param tf_records_list:
68 |     :param min_size:
69 |     :param max_size:
70 |     :param preprocessing_type:
71 |     :param caffe_pixel_means:
72 |     :param repeat:
73 |     :param batch_size:
74 |     :param shuffle:
75 |     :param shuffle_buffer_size:
76 |     :param prefetch:
77 |     :param prefetch_buffer_size:
78 |     :param argument:
79 |     :param iaa_sequence:
80 |     :return:
81 |     """
82 | 
83 |     dataset = tf.data.TFRecordDataset(tf_records_list).map(_parse_tf_records)
84 | 
85 |     if argument:
86 |         image_argument_partial = partial(image_argument_with_imgaug, iaa_sequence=iaa_sequence)
87 |         dataset = dataset.map(
88 |             lambda image, bboxes, image_height, image_width, labels: tuple([
89 |                 *tf.py_func(image_argument_partial, [image, bboxes], [image.dtype, bboxes.dtype]),
90 |                 image_height, image_width, labels])
91 |         )
92 | 
93 |     preprocessing_partial_func = partial(preprocessing_training_func,
94 |                                          min_size=min_size, max_size=max_size,
95 |
preprocessing_type=preprocessing_type, 96 | caffe_pixel_means=caffe_pixel_means) 97 | 98 | dataset = dataset.batch(batch_size=batch_size).map(preprocessing_partial_func) 99 | 100 | if shuffle: 101 | dataset = dataset.shuffle(buffer_size=shuffle_buffer_size) 102 | if prefetch: 103 | dataset = dataset.prefetch(buffer_size=prefetch_buffer_size) 104 | 105 | return dataset.repeat(repeat) 106 | -------------------------------------------------------------------------------- /object_detection/dataset/pascal_tf_dataset_local_file.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | import os 5 | import object_detection.dataset.utils.label_map_utils as label_map_utils 6 | import object_detection.dataset.utils.tf_record_utils as dataset_utils 7 | from lxml import etree 8 | 9 | 10 | def _read_image(file_path): 11 | img = cv2.imread(file_path).astype(np.float32) 12 | img -= np.array([[[102.9801, 115.9465, 122.7717]]]) 13 | h, w, _ = img.shape 14 | min_edge = 600 15 | max_edge = 1000 16 | scale1 = min_edge / min(h, w) 17 | scale2 = max_edge / max(h, w) 18 | scale = min(scale1, scale2) 19 | img = cv2.resize(img, None, None, fx=scale, fy=scale, 20 | interpolation=cv2.INTER_LINEAR) 21 | return img, scale 22 | 23 | 24 | def get_dataset(mode, root_path, label_map_file_path): 25 | label_map_dict = label_map_utils.get_label_map_dict(label_map_file_path) 26 | with open(os.path.join(root_path, 'ImageSets', 'Main', 'aeroplane_%s.txt' % mode), 'r') as f: 27 | lines = f.readlines() 28 | examples_list = [line.strip().split(' ')[0] for line in lines] 29 | annotations_dir = os.path.join(root_path, 'Annotations') 30 | img_dir = os.path.join(root_path, 'JPEGImages') 31 | 32 | def _map_from_xml_and_cv2(example): 33 | example = example.decode() 34 | with open(os.path.join(annotations_dir, str(example) + '.xml'), 'r') as f: 35 | xml_str = f.read() 36 | xml_dict = dataset_utils.recursive_parse_xml_to_dict(etree.fromstring(xml_str))['annotation'] 37 | img_file_path = os.path.join(img_dir, xml_dict['filename']) 38 | img, img_scale = _read_image(img_file_path) 39 | xmin = [] 40 | ymin = [] 41 | xmax = [] 42 | ymax = [] 43 | classes = [] 44 | if 'object' in xml_dict: 45 | for obj in xml_dict['object']: 46 | xmin.append((float(obj['bndbox']['xmin']) - 1) * img_scale) 47 | ymin.append((float(obj['bndbox']['ymin']) - 1) * img_scale) 48 | xmax.append((float(obj['bndbox']['xmax']) - 1) * img_scale) 49 | ymax.append((float(obj['bndbox']['ymax']) - 1) * img_scale) 50 | classes.append(label_map_dict[obj['name']]) 51 | 52 | return img, np.stack([ymin, xmin, ymax, xmax], axis=0).transpose().astype(np.float32), np.array(classes).astype( 53 | np.int32) 54 | 55 | dataset = tf.data.Dataset.from_tensor_slices(examples_list).map( 56 | lambda example: tf.py_func(_map_from_xml_and_cv2, 57 | [example], 58 | [tf.float32, tf.float32, tf.int32]) 59 | ).batch(1) 60 | 61 | return dataset 62 | -------------------------------------------------------------------------------- /object_detection/dataset/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irvingzhang0512/tf_eager_object_detection/e977217cb9d5fc2975292d55b1e5cae484f9e0dd/object_detection/dataset/utils/__init__.py -------------------------------------------------------------------------------- /object_detection/dataset/utils/label_map_utils.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import tensorflow as tf 4 | from google.protobuf import text_format 5 | from object_detection.protos import string_int_label_map_pb2 6 | 7 | 8 | def _validate_label_map(label_map): 9 | """Checks if a label map is valid. 10 | 11 | Args: 12 | label_map: StringIntLabelMap to validate. 13 | 14 | Raises: 15 | ValueError: if label map is invalid. 16 | """ 17 | for item in label_map.item: 18 | if item.id < 0: 19 | raise ValueError('Label map ids should be >= 0.') 20 | if (item.id == 0 and item.name != 'background' and 21 | item.display_name != 'background'): 22 | raise ValueError('Label map id 0 is reserved for the background label') 23 | 24 | 25 | def create_category_index(categories): 26 | """Creates dictionary of COCO compatible categories keyed by category id. 27 | 28 | Args: 29 | categories: a list of dicts, each of which has the following keys: 30 | 'id': (required) an integer id uniquely identifying this category. 31 | 'name': (required) string representing category name 32 | e.g., 'cat', 'dog', 'pizza'. 33 | 34 | Returns: 35 | category_index: a dict containing the same entries as categories, but keyed 36 | by the 'id' field of each category. 37 | """ 38 | category_index = {} 39 | for cat in categories: 40 | category_index[cat['id']] = cat 41 | return category_index 42 | 43 | 44 | def get_max_label_map_index(label_map): 45 | """Get maximum index in label map. 46 | 47 | Args: 48 | label_map: a StringIntLabelMapProto 49 | 50 | Returns: 51 | an integer 52 | """ 53 | return max([item.id for item in label_map.item]) 54 | 55 | 56 | def convert_label_map_to_categories(label_map, 57 | max_num_classes, 58 | use_display_name=True): 59 | """Given label map proto returns categories list compatible with eval. 60 | 61 | This function converts label map proto and returns a list of dicts, each of 62 | which has the following keys: 63 | 'id': (required) an integer id uniquely identifying this category. 64 | 'name': (required) string representing category name 65 | e.g., 'cat', 'dog', 'pizza'. 66 | We only allow class into the list if its id-label_id_offset is 67 | between 0 (inclusive) and max_num_classes (exclusive). 68 | If there are several items mapping to the same id in the label map, 69 | we will only keep the first one in the categories list. 70 | 71 | Args: 72 | label_map: a StringIntLabelMapProto or None. If None, a default categories 73 | list is created with max_num_classes categories. 74 | max_num_classes: maximum number of (consecutive) label indices to include. 75 | use_display_name: (boolean) choose whether to load 'display_name' field as 76 | category name. If False or if the display_name field does not exist, uses 77 | 'name' field as category names instead. 78 | 79 | Returns: 80 | categories: a list of dictionaries representing all possible categories. 
81 | """ 82 | categories = [] 83 | list_of_ids_already_added = [] 84 | if not label_map: 85 | label_id_offset = 1 86 | for class_id in range(max_num_classes): 87 | categories.append({ 88 | 'id': class_id + label_id_offset, 89 | 'name': 'category_{}'.format(class_id + label_id_offset) 90 | }) 91 | return categories 92 | for item in label_map.item: 93 | if not 0 < item.id <= max_num_classes: 94 | logging.info( 95 | 'Ignore item %d since it falls outside of requested ' 96 | 'label range.', item.id) 97 | continue 98 | if use_display_name and item.HasField('display_name'): 99 | name = item.display_name 100 | else: 101 | name = item.name 102 | if item.id not in list_of_ids_already_added: 103 | list_of_ids_already_added.append(item.id) 104 | categories.append({'id': item.id, 'name': name}) 105 | return categories 106 | 107 | 108 | def load_labelmap(path): 109 | """Loads label map proto. 110 | 111 | Args: 112 | path: path to StringIntLabelMap proto text file. 113 | Returns: 114 | a StringIntLabelMapProto 115 | """ 116 | with tf.gfile.GFile(path, 'r') as fid: 117 | label_map_string = fid.read() 118 | label_map = string_int_label_map_pb2.StringIntLabelMap() 119 | try: 120 | text_format.Merge(label_map_string, label_map) 121 | except text_format.ParseError: 122 | label_map.ParseFromString(label_map_string) 123 | _validate_label_map(label_map) 124 | return label_map 125 | 126 | 127 | def get_label_map_dict(label_map_path, 128 | use_display_name=False, 129 | fill_in_gaps_and_background=False): 130 | """Reads a label map and returns a dictionary of label names to id. 131 | 132 | Args: 133 | label_map_path: path to StringIntLabelMap proto text file. 134 | use_display_name: whether to use the label map items' display names as keys. 135 | fill_in_gaps_and_background: whether to fill in gaps and background with 136 | respect to the id field in the proto. The id: 0 is reserved for the 137 | 'background' class and will be added if it is missing. All other missing 138 | ids in range(1, max(id)) will be added with a dummy class name 139 | ("class_") if they are missing. 140 | 141 | Returns: 142 | A dictionary mapping label names to id. 143 | 144 | Raises: 145 | ValueError: if fill_in_gaps_and_background and label_map has non-integer or 146 | negative values. 147 | """ 148 | label_map = load_labelmap(label_map_path) 149 | label_map_dict = {} 150 | for item in label_map.item: 151 | if use_display_name: 152 | label_map_dict[item.display_name] = item.id 153 | else: 154 | label_map_dict[item.name] = item.id 155 | 156 | if fill_in_gaps_and_background: 157 | values = set(label_map_dict.values()) 158 | 159 | if 0 not in values: 160 | label_map_dict['background'] = 0 161 | if not all(isinstance(value, int) for value in values): 162 | raise ValueError('The values in label map must be integers in order to' 163 | 'fill_in_gaps_and_background.') 164 | if not all(value >= 0 for value in values): 165 | raise ValueError('The values in the label map must be positive.') 166 | 167 | if len(values) != max(values) + 1: 168 | # there are gaps in the labels, fill in gaps. 169 | for value in range(1, max(values)): 170 | if value not in values: 171 | label_map_dict['class_' + str(value)] = value 172 | 173 | return label_map_dict 174 | 175 | 176 | def create_categories_from_labelmap(label_map_path, use_display_name=True): 177 | """Reads a label map and returns categories list compatible with eval. 
178 | 
179 |     This function converts label map proto and returns a list of dicts, each of
180 |     which has the following keys:
181 |         'id': an integer id uniquely identifying this category.
182 |         'name': string representing category name e.g., 'cat', 'dog'.
183 | 
184 |     Args:
185 |         label_map_path: Path to `StringIntLabelMap` proto text file.
186 |         use_display_name: (boolean) choose whether to load 'display_name' field
187 |             as category name. If False or if the display_name field does not exist,
188 |             uses 'name' field as category names instead.
189 | 
190 |     Returns:
191 |         categories: a list of dictionaries representing all possible categories.
192 |     """
193 |     label_map = load_labelmap(label_map_path)
194 |     max_num_classes = max(item.id for item in label_map.item)
195 |     return convert_label_map_to_categories(label_map, max_num_classes,
196 |                                            use_display_name)
197 | 
198 | 
199 | def create_category_index_from_labelmap(label_map_path, use_display_name=True):
200 |     """Reads a label map and returns a category index.
201 | 
202 |     Args:
203 |         label_map_path: Path to `StringIntLabelMap` proto text file.
204 |         use_display_name: (boolean) choose whether to load 'display_name' field
205 |             as category name. If False or if the display_name field does not exist,
206 |             uses 'name' field as category names instead.
207 | 
208 |     Returns:
209 |         A category index, which is a dictionary that maps integer ids to dicts
210 |         containing categories, e.g.
211 |         {1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...}
212 |     """
213 |     categories = create_categories_from_labelmap(label_map_path, use_display_name)
214 |     return create_category_index(categories)
215 | 
216 | 
217 | def create_class_agnostic_category_index():
218 |     """Creates a category index with a single `object` class."""
219 |     return {1: {'id': 1, 'name': 'object'}}
220 | 
--------------------------------------------------------------------------------
/object_detection/dataset/utils/tf_dataset_utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import imgaug as ia
3 | from imgaug import augmenters as iaa
4 | import numpy as np
5 | from functools import partial
6 | 
7 | __all__ = ['image_argument_with_imgaug', 'preprocessing_training_func', 'preprocessing_eval_func']
8 | 
9 | 
10 | def _get_default_iaa_sequence():
11 |     return [
12 |         iaa.Fliplr(0.5),
13 |     ]
14 | 
15 | 
16 | def image_argument_with_imgaug(image, bboxes, iaa_sequence=None):
17 |     """
18 |     Augment a single image.
19 |     The input image is tf.uint8 with values in [0, 255].
20 |     The input bboxes are tf.float32 with values in [0, 1].
21 |     The outputs keep the same formats as the inputs.
22 |     :param image: one image, an ndarray of shape [None, None, 3]
23 |     :param bboxes: bounding boxes, shape [bbox_number, 4], in (ymin, xmin, ymax, xmax) order,
24 |                    float values in [0, 1]
25 |     :param iaa_sequence:
26 |     :return: the augmented image and bboxes, in the same formats as the inputs
27 |     """
28 |     bboxes_list = []
29 |     height, width, channels = image.shape
30 |     for bbox in bboxes:
31 |         ymin, xmin, ymax, xmax = int(bbox[0] * height), int(bbox[1] * width), int(bbox[2] * height), int(
32 |             bbox[3] * width)
33 |         bboxes_list.append(ia.BoundingBox(x1=xmin, y1=ymin, x2=xmax, y2=ymax))
34 |     bboxes_ia = ia.BoundingBoxesOnImage(bboxes_list, shape=image.shape)
35 | 
36 |     if iaa_sequence is None:
37 |         iaa_sequence = _get_default_iaa_sequence()
38 |     seq = iaa.Sequential(iaa_sequence)
39 | 
40 |     seq_det = seq.to_deterministic()
41 | 
42 |     image_aug = seq_det.augment_images([image])[0]
43 |     bbs_aug = seq_det.augment_bounding_boxes([bboxes_ia])[0]
44 | 
45 |     bboxes_aug_list = []
46 |     height, width, channels = image_aug.shape
47 |     for iaa_bbox in bbs_aug.bounding_boxes:
48 |         bboxes_aug_list.append([iaa_bbox.y1 / height, iaa_bbox.x1 / width, iaa_bbox.y2 / height, iaa_bbox.x2 / width])
49 |     bboxes_aug_np = np.array(bboxes_aug_list)
50 |     bboxes_aug_np[bboxes_aug_np < 0] = 0
51 |     bboxes_aug_np[bboxes_aug_np > 1] = 1
52 |     return image_aug, bboxes_aug_np.astype(np.float32)
53 | 
54 | 
55 | def _caffe_preprocessing(image, pixel_means):
56 |     """
57 |     Take a uint8 RGB image, convert it to tf.float32 BGR and subtract the imagenet means.
58 |     :param image:
59 |     :return:
60 |     """
61 | 
62 |     # using the helper below hits a strange problem: building a second dataset raises
63 |     # AttributeError: 'Tensor' object has no attribute '_datatype_enum'
64 |     # return tf.keras.applications.vgg16.preprocess_input(image)
65 | 
66 |     image = tf.to_float(image)
67 |     image = tf.reverse(image, axis=[-1])
68 |     channels = tf.split(axis=-1, num_or_size_splits=3, value=image)
69 |     for i in range(3):
70 |         channels[i] -= pixel_means[i]
71 |     return tf.concat(axis=-1, values=channels)
72 | 
73 | 
74 | def _tf_preprocessing(image):
75 |     """
76 |     Take a uint8 RGB image and convert it to tf.float32 RGB with values in [-1, 1].
77 |     :param image:
78 |     :return:
79 |     """
80 |     return tf.image.convert_image_dtype(image, dtype=tf.float32) * 2.0 - 1.0
81 | 
82 | 
83 | def preprocessing_training_func(image, bboxes, height, width, labels,
84 |                                 min_size, max_size, preprocessing_type, caffe_pixel_means=None):
85 |     """
86 |     Take an rgb image and preprocess it as follows:
87 |     1) resize so the short edge is at least min_size and the long edge at most max_size; when both cannot hold, the long edge wins (e.g. with min_size=600 and max_size=1000, a 500x1200 image gets scale = min(600/500, 1000/1200) ~= 0.83 and is resized to roughly 417x1000)
88 |     2) pick the preprocessing function according to preprocessing_type
89 |     :param image:
90 |     :param bboxes:
91 |     :param width:
92 |     :param height:
93 |     :param labels:
94 |     :param max_size:
95 |     :param min_size:
96 |     :param preprocessing_type:
97 |     :param caffe_pixel_means:
98 |     :return:
99 |     """
100 | 
101 |     if preprocessing_type == 'caffe':
102 |         preprocessing_fn = partial(_caffe_preprocessing, pixel_means=caffe_pixel_means)
103 |     elif preprocessing_type == 'tf':
104 |         preprocessing_fn = _tf_preprocessing
105 |     else:
106 |         raise ValueError('unknown preprocessing type {}'.format(preprocessing_type))
107 |     image = preprocessing_fn(image)
108 | 
109 |     height = tf.to_float(height[0])
110 |     width = tf.to_float(width[0])
111 |     scale1 = min_size / tf.minimum(height, width)
112 |     scale2 = max_size / tf.maximum(height, width)
113 |     scale = tf.minimum(scale1, scale2)
114 |     n_height = tf.to_int32(scale * height)
115 |     n_width = tf.to_int32(scale * width)
116 | 
117 |     image = tf.image.resize_bilinear(image, (n_height, n_width))
118 | 
119 |     channels = tf.split(axis=-1, num_or_size_splits=4, value=bboxes)
120 |     channels[0] = channels[0] * tf.to_float(n_height - 1)
121 |     channels[1] = channels[1] * tf.to_float(n_width - 1)
122 |     channels[2] = channels[2] * tf.to_float(n_height - 1)
123 |     channels[3] = channels[3] * tf.to_float(n_width - 1)
124 |     bboxes = tf.concat(channels, axis=-1)
125 | 
126 |     return image, bboxes, labels
127 | 
128 | 
129 | def preprocessing_eval_func(image, height, width,
130 |                             min_size, max_size, preprocessing_type, caffe_pixel_means=None):
131 |     """
132 |     Take an rgb image and preprocess it as follows:
133 |     1) resize so the short edge is at least min_size and the long edge at most max_size; when both cannot hold, the long edge wins
134 |     2) pick the preprocessing function according to preprocessing_type
135 |     """
136 |     if preprocessing_type == 'caffe':
137 |         preprocessing_fn = partial(_caffe_preprocessing, pixel_means=caffe_pixel_means)
138 |     elif preprocessing_type == 'tf':
139 |         preprocessing_fn = _tf_preprocessing
140 |     else:
141 |         raise ValueError('unknown preprocessing type {}'.format(preprocessing_type))
142 |     image = preprocessing_fn(image)
143 | 
144 |     height = tf.to_float(height[0])
145 |     width =
tf.to_float(width[0]) 146 | scale1 = min_size / tf.minimum(height, width) 147 | scale2 = max_size / tf.maximum(height, width) 148 | scale = tf.minimum(scale1, scale2) 149 | n_height = tf.to_int32(scale * height) 150 | n_width = tf.to_int32(scale * width) 151 | 152 | image = tf.image.resize_bilinear(image, (n_height, n_width)) 153 | 154 | return image, scale, tf.to_int32(height), tf.to_int32(width) 155 | -------------------------------------------------------------------------------- /object_detection/dataset/utils/tf_record_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | 4 | 5 | def int64_feature(value): 6 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 7 | 8 | 9 | def int64_list_feature(value): 10 | return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 11 | 12 | 13 | def bytes_feature(value): 14 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 15 | 16 | 17 | def bytes_list_feature(value): 18 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) 19 | 20 | 21 | def float_list_feature(value): 22 | return tf.train.Feature(float_list=tf.train.FloatList(value=value)) 23 | 24 | 25 | def recursive_parse_xml_to_dict(xml): 26 | if not xml: 27 | return {xml.tag: xml.text} 28 | result = {} 29 | for child in xml: 30 | child_result = recursive_parse_xml_to_dict(child) 31 | if child.tag != 'object': 32 | result[child.tag] = child_result[child.tag] 33 | else: 34 | if child.tag not in result: 35 | result[child.tag] = [] 36 | result[child.tag].append(child_result[child.tag]) 37 | return {xml.tag: result} 38 | 39 | 40 | def get_multi_tf_record_writers(base_path, file_pattern, year, number, mode): 41 | writers_path = [os.path.join(base_path, file_pattern % (year, mode, i)) for i in range(number)] 42 | return [tf.python_io.TFRecordWriter(writer_path) for writer_path in writers_path] 43 | -------------------------------------------------------------------------------- /object_detection/evaluation/detectron_pascal_evaluation_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # Fast/er R-CNN 19 | # Licensed under The MIT License [see LICENSE for details] 20 | # Written by Bharath Hariharan 21 | # -------------------------------------------------------- 22 | 23 | """Python implementation of the PASCAL VOC devkit's AP evaluation code.""" 24 | 25 | import logging 26 | import numpy as np 27 | import os 28 | import xml.etree.ElementTree as ET 29 | import pickle 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | def parse_rec(filename): 35 | """Parse a PASCAL VOC xml file.""" 36 | tree = ET.parse(filename) 37 | objects = [] 38 | for obj in tree.findall('object'): 39 | obj_struct = {} 40 | obj_struct['name'] = obj.find('name').text 41 | obj_struct['pose'] = obj.find('pose').text 42 | obj_struct['truncated'] = int(obj.find('truncated').text) 43 | obj_struct['difficult'] = int(obj.find('difficult').text) 44 | bbox = obj.find('bndbox') 45 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 46 | int(bbox.find('ymin').text), 47 | int(bbox.find('xmax').text), 48 | int(bbox.find('ymax').text)] 49 | objects.append(obj_struct) 50 | 51 | return objects 52 | 53 | 54 | def voc_ap(rec, prec, use_07_metric=False): 55 | """Compute VOC AP given precision and recall. If use_07_metric is true, uses 56 | the VOC 07 11-point method (default:False). 57 | """ 58 | if use_07_metric: 59 | # 11 point metric 60 | ap = 0. 61 | for t in np.arange(0., 1.1, 0.1): 62 | if np.sum(rec >= t) == 0: 63 | p = 0 64 | else: 65 | p = np.max(prec[rec >= t]) 66 | ap = ap + p / 11. 67 | else: 68 | # correct AP calculation 69 | # first append sentinel values at the end 70 | mrec = np.concatenate(([0.], rec, [1.])) 71 | mpre = np.concatenate(([0.], prec, [0.])) 72 | 73 | # compute the precision envelope 74 | for i in range(mpre.size - 1, 0, -1): 75 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 76 | 77 | # to calculate area under PR curve, look for points 78 | # where X axis (recall) changes value 79 | i = np.where(mrec[1:] != mrec[:-1])[0] 80 | 81 | # and sum (\Delta recall) * prec 82 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 83 | return ap 84 | 85 | 86 | def voc_eval(detpath, 87 | annopath, 88 | imagesetfile, 89 | classname, 90 | cachedir, 91 | ovthresh=0.5, 92 | use_07_metric=True): 93 | """rec, prec, ap = voc_eval(detpath, 94 | annopath, 95 | imagesetfile, 96 | classname, 97 | [ovthresh], 98 | [use_07_metric]) 99 | 100 | Top level function that does the PASCAL VOC evaluation. 101 | 102 | detpath: Path to detections 103 | detpath.format(classname) should produce the detection results file. 104 | annopath: Path to annotations 105 | annopath.format(imagename) should be the xml annotations file. 106 | imagesetfile: Text file containing the list of images, one image per line. 
107 |     classname: Category name
108 |     cachedir: Directory for caching the annotations
109 |     [ovthresh]: Overlap threshold (default = 0.5)
110 |     [use_07_metric]: Whether to use VOC07's 11 point AP computation
111 |         (default True)
112 |     """
113 |     # assumes detections are in detpath.format(classname)
114 |     # assumes annotations are in annopath.format(imagename)
115 |     # assumes imagesetfile is a text file with each line an image name
116 |     # cachedir caches the annotations in a pickle file
117 |
118 |     # first load gt
119 |     if not os.path.isdir(cachedir):
120 |         os.mkdir(cachedir)
121 |     imageset = os.path.splitext(os.path.basename(imagesetfile))[0]
122 |     cachefile = os.path.join(cachedir, imageset + '_annots.pkl')
123 |     # read list of images
124 |     with open(imagesetfile, 'r') as f:
125 |         lines = f.readlines()
126 |     imagenames = [x.strip() for x in lines]
127 |
128 |     if not os.path.isfile(cachefile):
129 |         # load annots
130 |         recs = {}
131 |         for i, imagename in enumerate(imagenames):
132 |             recs[imagename] = parse_rec(annopath.format(imagename))
133 |             if i % 100 == 0:
134 |                 logger.info(
135 |                     'Reading annotation for {:d}/{:d}'.format(
136 |                         i + 1, len(imagenames)))
137 |         # save
138 |         logger.info('Saving cached annotations to {:s}'.format(cachefile))
139 |         with open(cachefile, 'wb') as f:
140 |             pickle.dump(recs, f)
141 |     else:
142 |         with open(cachefile, 'rb') as f:
143 |             recs = pickle.load(f)
144 |
145 |     # extract gt objects for this class
146 |     class_recs = {}
147 |     npos = 0
148 |     for imagename in imagenames:
149 |         R = [obj for obj in recs[imagename] if obj['name'] == classname]
150 |         bbox = np.array([x['bbox'] for x in R])
151 |         difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
152 |         det = [False] * len(R)
153 |         npos = npos + sum(~difficult)
154 |         class_recs[imagename] = {'bbox': bbox,
155 |                                  'difficult': difficult,
156 |                                  'det': det}
157 |
158 |     # read dets
159 |     detfile = detpath.format(classname)
160 |     with open(detfile, 'r') as f:
161 |         lines = f.readlines()
162 |
163 |     splitlines = [x.strip().split(' ') for x in lines]
164 |     image_ids = [x[0] for x in splitlines]
165 |     confidence = np.array([float(x[1]) for x in splitlines])
166 |     BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
167 |
168 |     # sort by confidence
169 |     sorted_ind = np.argsort(-confidence)
170 |     BB = BB[sorted_ind, :]
171 |     image_ids = [image_ids[x] for x in sorted_ind]
172 |
173 |     # go down dets and mark TPs and FPs
174 |     nd = len(image_ids)
175 |     tp = np.zeros(nd)
176 |     fp = np.zeros(nd)
177 |     for d in range(nd):
178 |         R = class_recs[image_ids[d]]
179 |         bb = BB[d, :].astype(float)
180 |         ovmax = -np.inf
181 |         BBGT = R['bbox'].astype(float)
182 |
183 |         if BBGT.size > 0:
184 |             # compute overlaps
185 |             # intersection
186 |             ixmin = np.maximum(BBGT[:, 0], bb[0])
187 |             iymin = np.maximum(BBGT[:, 1], bb[1])
188 |             ixmax = np.minimum(BBGT[:, 2], bb[2])
189 |             iymax = np.minimum(BBGT[:, 3], bb[3])
190 |             iw = np.maximum(ixmax - ixmin + 1., 0.)
191 |             ih = np.maximum(iymax - iymin + 1., 0.)
192 |             inters = iw * ih
193 |
194 |             # union
195 |             uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
196 |                    (BBGT[:, 2] - BBGT[:, 0] + 1.) *
197 |                    (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
198 |
199 |             overlaps = inters / uni
200 |             ovmax = np.max(overlaps)
201 |             jmax = np.argmax(overlaps)
202 |
203 |         if ovmax > ovthresh:
204 |             if not R['difficult'][jmax]:
205 |                 if not R['det'][jmax]:
206 |                     tp[d] = 1.
207 |                     R['det'][jmax] = 1
208 |                 else:
209 |                     fp[d] = 1.
210 |         else:
211 |             fp[d] = 1.
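    # Added illustrative note: suppose npos = 2 and, after sorting by
    # confidence, tp = [1, 0, 1] and fp = [0, 1, 0]; the cumulative sums below
    # give tp = [1, 1, 2], fp = [0, 1, 1], hence rec = [0.5, 0.5, 1.0] and
    # prec = [1.0, 0.5, 0.667], which voc_ap integrates into a single AP value.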
212 |
213 |     # compute precision recall
214 |     fp = np.cumsum(fp)
215 |     tp = np.cumsum(tp)
216 |     rec = tp / float(npos)
217 |     # avoid divide by zero in case the first detection matches a difficult
218 |     # ground truth
219 |     prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
220 |     ap = voc_ap(rec, prec, use_07_metric)
221 |
222 |     return rec, prec, ap
223 |
--------------------------------------------------------------------------------
/object_detection/evaluation/pascal_eval_files_utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from tqdm import tqdm
4 | from object_detection.dataset.eval_pascal_tf_dataset import get_dataset_by_local_file, get_dataset_by_tf_records
5 | from object_detection.utils.bbox_transform import decode_bbox_with_mean_and_std
6 | from object_detection.utils.bbox_tf import bboxes_clip_filter
7 |
8 | num_classes = 21
9 | class_list = ('__background__',  # always index 0
10 |               'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair',
11 |               'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant',
12 |               'sheep', 'sofa', 'train', 'tvmonitor')
13 | class_name_to_id_dict = dict(list(zip(class_list, list(range(num_classes)))))
14 | class_id_to_name_dict = dict(list(zip(list(range(num_classes)), class_list)))
15 |
16 | __all__ = ['get_prediction_files']
17 |
18 |
19 | def get_prediction_files(cur_model,
20 |                          dataset_type='tf', image_format='bgr',
21 |                          preprocessing_type='caffe', caffe_pixel_means=None,
22 |                          min_edge=600, max_edge=1000,
23 |                          data_root_path=None,
24 |                          mode='test',
25 |                          result_file_format='/path/to/results/{:s}.txt',
26 |                          score_threshold=0.0, iou_threshold=0.5,
27 |                          max_objects_per_class=50, max_objects_per_image=50,
28 |                          target_means=None, target_stds=None,
29 |                          min_size=10):
30 |     """
31 |     Generate per-class detection result files with the given model.
32 |     :param cur_model: model with pre-trained weights already loaded
33 |     :param dataset_type: evaluation dataset type, either 'cv2' or 'tf'
34 |     :param image_format:
35 |     :param caffe_pixel_means:
36 |     :param preprocessing_type:
37 |     :param min_edge:
38 |     :param max_edge:
39 |     :param data_root_path: root directory of the dataset
40 |     :param mode: dataset split to predict on: train, val, trainval or test
41 |     :param result_file_format: `result_file_format.format(class_name)` is the output file path for that class
42 |     :param score_threshold: minimum score of a prediction
43 |     :param iou_threshold: iou threshold used during nms
44 |     :param max_objects_per_class: maximum number of predictions per class in one image
45 |     :param max_objects_per_image: maximum number of predictions in one image
46 |     :param target_means: parameter of decode_bbox_with_mean_and_std
47 |     :param target_stds: parameter of decode_bbox_with_mean_and_std
48 |     :param min_size: minimum edge length (in pixels) of final boxes
49 |     :return:
50 |     """
51 |     if image_format not in ['bgr', 'rgb']:
52 |         raise ValueError('unknown image format {}'.format(image_format))
53 |
54 |     if dataset_type == 'cv2':
55 |         eval_dataset, image_sets = get_dataset_by_local_file(mode, data_root_path,
56 |                                                              image_format=image_format,
57 |                                                              preprocessing_type=preprocessing_type,
58 |                                                              caffe_pixel_means=caffe_pixel_means,
59 |                                                              min_edge=min_edge, max_edge=max_edge)
60 |     elif dataset_type == 'tf':
61 |         eval_dataset, image_sets = get_dataset_by_tf_records(mode, data_root_path,
62 |                                                              preprocessing_type=preprocessing_type,
63 |                                                              caffe_pixel_means=caffe_pixel_means,
64 |                                                              min_edge=min_edge, max_edge=max_edge)
65 |     else:
66 |         raise ValueError('unknown dataset type {}'.format(dataset_type))
67 |
68 |     if target_stds is None:
69 |         target_stds = [0.1, 0.1, 0.2, 0.2]
70 |     if target_means is None:
71 |         target_means = [0, 0, 0, 0]
72 |
73 |     all_boxes = [[[] for _ in range(len(image_sets))]
74 |                  for _ in range(num_classes)]
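    # Added note: all_boxes[class_id][image_index] ends up holding an (N, 5)
    # array of [xmin, ymin, xmax, ymax, score] rows, the same layout the
    # py-faster-rcnn evaluation code uses.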
75 |     i = 0
76 |     for img, img_scale, raw_h, raw_w in tqdm(eval_dataset):
77 |         raw_h = tf.to_float(raw_h)
78 |         raw_w = tf.to_float(raw_w)
79 |         scores, roi_txtytwth, rois = cur_model.im_detect(img, img_scale)
80 |         roi_txtytwth = tf.reshape(roi_txtytwth, [-1, num_classes, 4])
81 |         for j in range(1, num_classes):
82 |             inds = tf.where(scores[:, j] > score_threshold)[:, 0]
83 |             cls_scores = tf.gather(scores[:, j], inds)
84 |             cls_boxes = decode_bbox_with_mean_and_std(tf.gather(rois, inds),
85 |                                                       tf.gather(roi_txtytwth[:, j, :], inds),
86 |                                                       target_means=target_means, target_stds=target_stds)
87 |             cls_boxes, inds = bboxes_clip_filter(cls_boxes, 0, raw_h, raw_w, min_size)
88 |             cls_scores = tf.gather(cls_scores, inds)
89 |             keep = tf.image.non_max_suppression(cls_boxes, cls_scores, max_objects_per_class,
90 |                                                 iou_threshold=iou_threshold)
91 |
92 |             cls_scores = cls_scores.numpy()
93 |             cls_boxes = cls_boxes.numpy()
94 |             cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
95 |                 .astype(np.float32, copy=False)
96 |             cls_dets = cls_dets[keep.numpy(), :]
97 |             all_boxes[j][i] = cls_dets
98 |
99 |         if max_objects_per_image > 0:
100 |             image_scores = np.hstack([all_boxes[j][i][:, -1]
101 |                                       for j in range(1, num_classes)])
102 |             if len(image_scores) > max_objects_per_image:
103 |                 image_thresh = np.sort(image_scores)[-max_objects_per_image]
104 |                 for j in range(1, num_classes):
105 |                     keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
106 |                     all_boxes[j][i] = all_boxes[j][i][keep, :]
107 |         i += 1
108 |
109 |     for cls_ind, cls in enumerate(class_list):
110 |         if cls == '__background__':
111 |             continue
112 |         tf.logging.info('Writing {} VOC results file'.format(cls))
113 |         filename = result_file_format.format(cls)
114 |         with open(filename, 'wt') as f:
115 |             for im_ind, index in enumerate(image_sets):
116 |                 dets = np.array(all_boxes[cls_ind][im_ind])
117 |                 if dets.size == 0:
118 |                     continue
119 |                 # the VOCdevkit expects 1-based indices
120 |                 for k in range(dets.shape[0]):
121 |                     f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
122 |                             format(index, dets[k, -1], dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1))
123 |
--------------------------------------------------------------------------------
/object_detection/evaluation/pascal_voc_map_utils.py:
--------------------------------------------------------------------------------
1 | # copy from https://github.com/chenyuntc/simple-faster-rcnn-pytorch/blob/master/utils/eval_tool.py
2 | from __future__ import division
3 |
4 | from collections import defaultdict
5 | import itertools
6 | import numpy as np
7 | import six
8 | from object_detection.utils.bbox_tf import pairwise_iou
9 |
10 |
11 | def eval_detection_voc(
12 |         pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
13 |         gt_difficults=None,
14 |         iou_thresh=0.5, use_07_metric=False):
15 |     """Calculate average precisions based on evaluation code of PASCAL VOC.
16 |
17 |     This function evaluates predicted bounding boxes obtained from a dataset
18 |     which has :math:`N` images by using average precision for each class.
19 |     The code is based on the evaluation code used in PASCAL VOC Challenge.
20 |
21 |     Args:
22 |         pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
23 |             sets of bounding boxes.
24 |             Its index corresponds to an index for the base dataset.
25 |             Each element of :obj:`pred_bboxes` is a set of coordinates
26 |             of bounding boxes.
This is an array whose shape is :math:`(R, 4)`, 27 | where :math:`R` corresponds 28 | to the number of bounding boxes, which may vary among boxes. 29 | The second axis corresponds to 30 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 31 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 32 | Similar to :obj:`pred_bboxes`, its index corresponds to an 33 | index for the base dataset. Its length is :math:`N`. 34 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 35 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 36 | its index corresponds to an index for the base dataset. 37 | Its length is :math:`N`. 38 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 39 | bounding boxes 40 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 41 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 42 | bounding boxes in each image does not need to be same as the number 43 | of corresponding predicted boxes. 44 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 45 | labels which are organized similarly to :obj:`gt_bboxes`. 46 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 47 | arrays which is organized similarly to :obj:`gt_bboxes`. 48 | This tells whether the 49 | corresponding ground truth bounding box is difficult or not. 50 | By default, this is :obj:`None`. In that case, this function 51 | considers all bounding boxes to be not difficult. 52 | iou_thresh (float): A prediction is correct if its Intersection over 53 | Union with the ground truth is above this value. 54 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 55 | for calculating average precision. The default value is 56 | :obj:`False`. 57 | 58 | Returns: 59 | dict: 60 | 61 | The keys, value-types and the description of the values are listed 62 | below. 63 | 64 | * **ap** (*numpy.ndarray*): An array of average precisions. \ 65 | The :math:`l`-th value corresponds to the average precision \ 66 | for class :math:`l`. If class :math:`l` does not exist in \ 67 | either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \ 68 | value is set to :obj:`numpy.nan`. 69 | * **map** (*float*): The average of Average Precisions over classes. 70 | 71 | """ 72 | 73 | prec, rec = calc_detection_voc_prec_rec( 74 | pred_bboxes, pred_labels, pred_scores, 75 | gt_bboxes, gt_labels, gt_difficults, 76 | iou_thresh=iou_thresh) 77 | 78 | ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) 79 | 80 | return {'ap': ap, 'map': np.nanmean(ap)} 81 | 82 | 83 | def calc_detection_voc_prec_rec( 84 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 85 | gt_difficults=None, 86 | iou_thresh=0.5): 87 | """Calculate precision and recall based on evaluation code of PASCAL VOC. 88 | 89 | This function calculates precision and recall of 90 | predicted bounding boxes obtained from a dataset which has :math:`N` 91 | images. 92 | The code is based on the evaluation code used in PASCAL VOC Challenge. 93 | 94 | Args: 95 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 96 | sets of bounding boxes. 97 | Its index corresponds to an index for the base dataset. 98 | Each element of :obj:`pred_bboxes` is a set of coordinates 99 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 100 | where :math:`R` corresponds 101 | to the number of bounding boxes, which may vary among boxes. 
102 |             The second axis corresponds to
103 |             :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
104 |         pred_labels (iterable of numpy.ndarray): An iterable of labels.
105 |             Similar to :obj:`pred_bboxes`, its index corresponds to an
106 |             index for the base dataset. Its length is :math:`N`.
107 |         pred_scores (iterable of numpy.ndarray): An iterable of confidence
108 |             scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
109 |             its index corresponds to an index for the base dataset.
110 |             Its length is :math:`N`.
111 |         gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
112 |             bounding boxes
113 |             whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
114 |             bounding box whose shape is :math:`(R, 4)`. Note that the number of
115 |             bounding boxes in each image does not need to be same as the number
116 |             of corresponding predicted boxes.
117 |         gt_labels (iterable of numpy.ndarray): An iterable of ground truth
118 |             labels which are organized similarly to :obj:`gt_bboxes`.
119 |         gt_difficults (iterable of numpy.ndarray): An iterable of boolean
120 |             arrays which is organized similarly to :obj:`gt_bboxes`.
121 |             This tells whether the
122 |             corresponding ground truth bounding box is difficult or not.
123 |             By default, this is :obj:`None`. In that case, this function
124 |             considers all bounding boxes to be not difficult.
125 |         iou_thresh (float): A prediction is correct if its Intersection over
126 |             Union with the ground truth is above this value.
127 |
128 |     Returns:
129 |         tuple of two lists:
130 |         This function returns two lists: :obj:`prec` and :obj:`rec`.
131 |
132 |         * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \
133 |             for class :math:`l`. If class :math:`l` does not exist in \
134 |             either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \
135 |             set to :obj:`None`.
136 |         * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \
137 |             for class :math:`l`. If class :math:`l` that is not marked as \
138 |             difficult does not exist in \
139 |             :obj:`gt_labels`, :obj:`rec[l]` is \
140 |             set to :obj:`None`.
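        Example (editor's addition, illustrative only):
            >>> prec, rec = calc_detection_voc_prec_rec(
            ...     pred_bboxes, pred_labels, pred_scores,
            ...     gt_bboxes, gt_labels, gt_difficults=None, iou_thresh=0.5)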
141 | 142 | """ 143 | 144 | pred_bboxes = iter(pred_bboxes) 145 | pred_labels = iter(pred_labels) 146 | pred_scores = iter(pred_scores) 147 | gt_bboxes = iter(gt_bboxes) 148 | gt_labels = iter(gt_labels) 149 | if gt_difficults is None: 150 | gt_difficults = itertools.repeat(None) 151 | else: 152 | gt_difficults = iter(gt_difficults) 153 | 154 | n_pos = defaultdict(int) 155 | score = defaultdict(list) 156 | match = defaultdict(list) 157 | 158 | for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \ 159 | six.moves.zip( 160 | pred_bboxes, pred_labels, pred_scores, 161 | gt_bboxes, gt_labels, gt_difficults): 162 | 163 | if gt_difficult is None: 164 | gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool) 165 | 166 | for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): 167 | pred_mask_l = pred_label == l 168 | pred_bbox_l = pred_bbox[pred_mask_l] 169 | pred_score_l = pred_score[pred_mask_l] 170 | # sort by score 171 | order = pred_score_l.argsort()[::-1] 172 | pred_bbox_l = pred_bbox_l[order] 173 | pred_score_l = pred_score_l[order] 174 | 175 | gt_mask_l = gt_label == l 176 | gt_bbox_l = gt_bbox[gt_mask_l] 177 | gt_difficult_l = gt_difficult[gt_mask_l] 178 | 179 | n_pos[l] += np.logical_not(gt_difficult_l).sum() 180 | score[l].extend(pred_score_l) 181 | 182 | if len(pred_bbox_l) == 0: 183 | continue 184 | if len(gt_bbox_l) == 0: 185 | match[l].extend((0,) * pred_bbox_l.shape[0]) 186 | continue 187 | 188 | # VOC evaluation follows integer typed bounding boxes. 189 | pred_bbox_l = pred_bbox_l.copy() 190 | pred_bbox_l[:, 2:] += 1 191 | gt_bbox_l = gt_bbox_l.copy() 192 | gt_bbox_l[:, 2:] += 1 193 | 194 | iou = pairwise_iou(pred_bbox_l, gt_bbox_l).numpy() 195 | gt_index = iou.argmax(axis=1) 196 | # set -1 if there is no matching ground truth 197 | gt_index[iou.max(axis=1) < iou_thresh] = -1 198 | del iou 199 | 200 | selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) 201 | for gt_idx in gt_index: 202 | if gt_idx >= 0: 203 | if gt_difficult_l[gt_idx]: 204 | match[l].append(-1) 205 | else: 206 | if not selec[gt_idx]: 207 | match[l].append(1) 208 | else: 209 | match[l].append(0) 210 | selec[gt_idx] = True 211 | else: 212 | match[l].append(0) 213 | 214 | for iter_ in ( 215 | pred_bboxes, pred_labels, pred_scores, 216 | gt_bboxes, gt_labels, gt_difficults): 217 | if next(iter_, None) is not None: 218 | raise ValueError('Length of input iterables need to be same.') 219 | 220 | n_fg_class = max(n_pos.keys()) + 1 221 | prec = [None] * n_fg_class 222 | rec = [None] * n_fg_class 223 | 224 | for l in n_pos.keys(): 225 | score_l = np.array(score[l]) 226 | match_l = np.array(match[l], dtype=np.int8) 227 | 228 | order = score_l.argsort()[::-1] 229 | match_l = match_l[order] 230 | 231 | tp = np.cumsum(match_l == 1) 232 | fp = np.cumsum(match_l == 0) 233 | 234 | # If an element of fp + tp is 0, 235 | # the corresponding element of prec[l] is nan. 236 | prec[l] = tp / (fp + tp) 237 | # If n_pos[l] is 0, rec[l] is None. 238 | if n_pos[l] > 0: 239 | rec[l] = tp / n_pos[l] 240 | 241 | return prec, rec 242 | 243 | 244 | def calc_detection_voc_ap(prec, rec, use_07_metric=False): 245 | """Calculate average precisions based on evaluation code of PASCAL VOC. 246 | 247 | This function calculates average precisions 248 | from given precisions and recalls. 249 | The code is based on the evaluation code used in PASCAL VOC Challenge. 250 | 251 | Args: 252 | prec (list of numpy.array): A list of arrays. 253 | :obj:`prec[l]` indicates precision for class :math:`l`. 
254 |             If :obj:`prec[l]` is :obj:`None`, this function returns
255 |             :obj:`numpy.nan` for class :math:`l`.
256 |         rec (list of numpy.array): A list of arrays.
257 |             :obj:`rec[l]` indicates recall for class :math:`l`.
258 |             If :obj:`rec[l]` is :obj:`None`, this function returns
259 |             :obj:`numpy.nan` for class :math:`l`.
260 |         use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
261 |             for calculating average precision. The default value is
262 |             :obj:`False`.
263 |
264 |     Returns:
265 |         ~numpy.ndarray:
266 |         This function returns an array of average precisions.
267 |         The :math:`l`-th value corresponds to the average precision
268 |         for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
269 |         :obj:`None`, the corresponding value is set to :obj:`numpy.nan`.
270 |
271 |     """
272 |
273 |     n_fg_class = len(prec)
274 |     ap = np.empty(n_fg_class)
275 |     for l in six.moves.range(n_fg_class):
276 |         if prec[l] is None or rec[l] is None:
277 |             ap[l] = np.nan
278 |             continue
279 |
280 |         if use_07_metric:
281 |             # 11 point metric
282 |             ap[l] = 0
283 |             for t in np.arange(0., 1.1, 0.1):
284 |                 if np.sum(rec[l] >= t) == 0:
285 |                     p = 0
286 |                 else:
287 |                     p = np.max(np.nan_to_num(prec[l])[rec[l] >= t])
288 |                 ap[l] += p / 11
289 |         else:
290 |             # correct AP calculation
291 |             # first append sentinel values at the end
292 |             mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0]))
293 |             mrec = np.concatenate(([0], rec[l], [1]))
294 |
295 |             mpre = np.maximum.accumulate(mpre[::-1])[::-1]
296 |
297 |             # to calculate area under PR curve, look for points
298 |             # where X axis (recall) changes value
299 |             i = np.where(mrec[1:] != mrec[:-1])[0]
300 |
301 |             # and sum (\Delta recall) * prec
302 |             ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
303 |
304 |     return ap
305 |
--------------------------------------------------------------------------------
/object_detection/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/irvingzhang0512/tf_eager_object_detection/e977217cb9d5fc2975292d55b1e5cae484f9e0dd/object_detection/model/__init__.py
--------------------------------------------------------------------------------
/object_detection/model/anchor_target.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from object_detection.utils.bbox_transform import encode_bbox_with_mean_and_std
3 | from object_detection.utils.bbox_tf import pairwise_iou, bboxes_range_filter
4 | from tensorflow.python.platform import tf_logging
5 |
6 |
7 | class AnchorTarget(tf.keras.Model):
8 |     def __init__(self,
9 |                  pos_iou_threshold=0.7,
10 |                  neg_iou_threshold=0.3,
11 |                  total_num_samples=256,
12 |                  max_pos_samples=128,
13 |                  target_means=None,
14 |                  target_stds=None):
15 |         super().__init__()
16 |
17 |         self._pos_iou_threshold = pos_iou_threshold
18 |         self._neg_iou_threshold = neg_iou_threshold
19 |         self._total_num_samples = total_num_samples
20 |         self._max_pos_samples = max_pos_samples
21 |
22 |         if target_stds is None:
23 |             target_stds = [1, 1, 1, 1]
24 |         if target_means is None:
25 |             target_means = [0, 0, 0, 0]
26 |         self._target_means = target_means
27 |         self._target_stds = target_stds
28 |
29 |     def call(self, inputs, training=None, mask=None):
30 |         """
31 |         No trainable parameters.
32 |         Generates the training targets for the RPN.
33 |         Overall procedure:
34 |         1. Filter the anchors, keeping only those within the image bounds; all later steps use the filtered set.
35 |         2. Compute the iou between the anchors and gt_bboxes (the bboxes of the input data).
36 |         3. Mark anchors with max_iou > 0.7 against gt_bboxes as positives, and anchors with max_iou < 0.3 as negatives.
37 |         4. For each gt_bbox, mark the anchor with the highest iou as positive.
38 |         5. Limit the sample counts: at most max_pos_samples positives, and at most total_num_samples positives plus negatives.
39 |         6. Finally output 4 results:
40 |         1) labels of all anchors [all_anchors_num, ]: -1 means excluded from training, 0 negative, 1 positive
41 |         2) txtytwth targets of all anchors [all_anchors_num, 4]; only positives contribute to training
42 |         3) bbox_inside_weights for the smooth l1 loss [all_anchors_num, 4]
43 |         4) bbox_outside_weights for the smooth l1 loss [all_anchors_num, 4]
44 |         :param inputs:
45 |         :param training:
46 |         :param mask:
47 |         :return:
48 |         """
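        # Added illustrative note: with the defaults above (total_num_samples=256,
        # max_pos_samples=128), if only 20 anchors pass the 0.7 iou test then all
        # 20 stay positive and up to 236 negatives are kept for the rpn loss.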
49 |         gt_bboxes, image_shape, all_anchors = inputs
50 |         total_anchors = all_anchors.get_shape().as_list()[0]
51 |
52 |         # 1. filter anchors, keeping only those within the image bounds; all later steps use the filtered set
53 |         tf_logging.debug('anchor target, before filter has %d anchors' % all_anchors.shape[0])
54 |         selected_anchor_idx = bboxes_range_filter(all_anchors, image_shape[0], image_shape[1])
55 |         anchors = tf.gather(all_anchors, selected_anchor_idx)
56 |         tf_logging.debug('anchor target, after filter has %d anchors' % anchors.shape[0])
57 |
58 |         # preparations
59 |         labels = -tf.ones((anchors.shape[0],), tf.int32)
60 |         overlaps = pairwise_iou(anchors, gt_bboxes)  # [anchors_size, gt_bboxes_size]
61 |         argmax_overlaps = tf.argmax(overlaps, axis=1, output_type=tf.int32)
62 |         max_overlaps = tf.reduce_max(overlaps, axis=1)
63 |         gt_max_overlaps = tf.reduce_max(overlaps, axis=0)
64 |         gt_argmax_overlaps = tf.where(tf.equal(overlaps, gt_max_overlaps))[:, 0]
65 |
66 |         # set labels
67 |         labels = tf.where(max_overlaps < self._neg_iou_threshold, tf.zeros_like(labels), labels)
68 |         labels = tf.scatter_update(tf.Variable(labels), gt_argmax_overlaps, 1)
69 |         labels = tf.where(max_overlaps >= self._pos_iou_threshold, tf.ones_like(labels), labels)
70 |
71 |         # sample positives and negatives
72 |         fg_inds = tf.where(tf.equal(labels, 1))[:, 0]
73 |         if tf.size(fg_inds) > self._max_pos_samples:
74 |             fg_inds = tf.random_shuffle(fg_inds)
75 |             disable_inds = fg_inds[self._max_pos_samples:]
76 |             fg_inds = fg_inds[:self._max_pos_samples]
77 |             labels = tf.scatter_update(tf.Variable(labels), disable_inds, -1)
78 |         num_bg = self._total_num_samples - tf.reduce_sum(tf.to_int32(tf.equal(labels, 1)))
79 |         bg_inds = tf.where(tf.equal(labels, 0))[:, 0]
80 |         if tf.size(bg_inds) > num_bg:
81 |             bg_inds = tf.random_shuffle(bg_inds)
82 |             disable_inds = bg_inds[num_bg:]
83 |             bg_inds = bg_inds[:num_bg]
84 |             labels = tf.scatter_update(tf.Variable(labels), disable_inds, -1)
85 |         tf.logging.debug('anchor target generate %d fgs and %d bgs.' % (tf.size(fg_inds), tf.size(bg_inds)))
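        # Added note: every in-bounds anchor is regressed towards its highest-iou
        # gt box below; bbox_inside_weights then zeroes the reg loss for all
        # anchors that were not labelled positive.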
86 |
87 |         # compute bbox targets, used as the ground truth of the rpn reg loss
88 |         bboxes_targets = encode_bbox_with_mean_and_std(anchors, tf.gather(gt_bboxes, argmax_overlaps),
89 |                                                        target_means=self._target_means,
90 |                                                        target_stds=self._target_stds)
91 |
92 |         # only positives contribute to the reg loss
93 |         bbox_inside_weights = tf.zeros((anchors.shape[0], 4), dtype=tf.float32)
94 |         bbox_inside_weights = tf.scatter_update(tf.Variable(bbox_inside_weights),
95 |                                                 tf.where(tf.equal(labels, 1))[:, 0], 1)
96 |
97 |         # effectively divides the reg loss by num_rpn_samples
98 |         bbox_outside_weights = tf.zeros((anchors.shape[0], 4), dtype=tf.float32)
99 |         num_examples = tf.reduce_sum(tf.to_float(labels >= 0))
100 |         bbox_outside_weights = tf.scatter_update(tf.Variable(bbox_outside_weights),
101 |                                                  tf.where(labels >= 0)[:, 0], 1.0 / num_examples)
102 |
103 |         # generate the final outputs
104 |         return tf.stop_gradient(_unmap(labels, total_anchors, selected_anchor_idx, -1)), \
105 |             tf.stop_gradient(_unmap(bboxes_targets, total_anchors, selected_anchor_idx, 0)), \
106 |             tf.stop_gradient(_unmap(bbox_inside_weights, total_anchors, selected_anchor_idx, 0)), \
107 |             tf.stop_gradient(_unmap(bbox_outside_weights, total_anchors, selected_anchor_idx, 0))
108 |
109 |
110 | def _unmap(data, count, inds, fill=0):
111 |     """
112 |     Maps results computed on the filtered anchors back to the full anchor set; essentially an index conversion.
113 |     :param data:
114 |     :param count:
115 |     :param inds:
116 |     :param fill:
117 |     :return:
118 |     """
119 |     if len(data.shape) == 1:
120 |         ret = tf.ones([count], dtype=tf.float32) * fill
121 |         ret = tf.scatter_update(tf.Variable(ret), inds, tf.to_float(data))
122 |     else:
123 |         ret = tf.ones([count, ] + data.get_shape().as_list()[1:], dtype=tf.float32) * fill
124 |         ret = tf.scatter_update(tf.Variable(ret), inds, tf.to_float(data))
125 |     return ret
126 |
--------------------------------------------------------------------------------
/object_detection/model/faster_rcnn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/irvingzhang0512/tf_eager_object_detection/e977217cb9d5fc2975292d55b1e5cae484f9e0dd/object_detection/model/faster_rcnn/__init__.py
--------------------------------------------------------------------------------
/object_detection/model/faster_rcnn/vgg16_faster_rcnn.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from object_detection.model.faster_rcnn.base_faster_rcnn_model import BaseFasterRcnn
3 |
4 | __all__ = ['Vgg16FasterRcnn']
5 | layers = tf.keras.layers
6 | VGG_16_WEIGHTS_PATH = ('https://github.com/fchollet/deep-learning-models/'
7 |                        'releases/download/v0.1/'
8 |                        'vgg16_weights_tf_dim_ordering_tf_kernels.h5')
9 |
10 |
11 | class Vgg16FasterRcnn(BaseFasterRcnn):
12 |     def __init__(self,
13 |                  # parameters specific to Vgg16FasterRcnn
14 |                  slim_ckpt_file_path=None,
15 |                  roi_head_keep_dropout_rate=0.5,
16 |                  roi_feature_size=(7, 7, 512),
17 |
18 |                  # common parameters
19 |                  num_classes=21,
20 |                  weight_decay=0.0001,
21 |                  ratios=(0.5, 1.0, 2.0),
22 |                  scales=(8, 16, 32),
23 |                  extractor_stride=16,
24 |
25 |                  # parameters shared by region proposal & anchor target
26 |                  rpn_proposal_means=(0, 0, 0, 0),
27 |                  rpn_proposal_stds=(1.0, 1.0, 1.0, 1.0),
28 |
29 |                  # region proposal parameters
30 |                  rpn_proposal_num_pre_nms_train=12000,
31 |                  rpn_proposal_num_post_nms_train=2000,
32 |                  rpn_proposal_num_pre_nms_test=6000,
33 |                  rpn_proposal_num_post_nms_test=300,
34 |                  rpn_proposal_nms_iou_threshold=0.7,
35 |
36 |                  # anchor target and related loss parameters
37 |                  rpn_sigma=3.0,
38 |                  rpn_training_pos_iou_threshold=0.7,
39 |                  rpn_training_neg_iou_threshold=0.3,
40 |                  rpn_training_total_num_samples=256,
41 |                  rpn_training_max_pos_samples=128,
42 |
43 |                  # roi head & proposal target parameters
44 |                  roi_proposal_means=(0, 0, 0, 0),
45 |                  roi_proposal_stds=(0.1, 0.1, 0.2, 0.2),
46 |
47 |                  # roi pooling parameters
48 |                  roi_pool_size=7,
49 |                  roi_pooling_max_pooling_flag=True,
50 |
51 |                  # proposal target and related loss parameters
52 |                  roi_sigma=1,
53 |                  roi_training_pos_iou_threshold=0.5,
54 |                  roi_training_neg_iou_threshold=0.1,
55 |                  roi_training_total_num_samples=128,
56 |                  roi_training_max_pos_samples=32,
57 |
58 |                  # prediction parameters
59 |                  prediction_max_objects_per_image=50,
60 |                  prediction_max_objects_per_class=50,
61 |                  prediction_nms_iou_threshold=0.3,
62 |                  prediction_score_threshold=0.3, ):
63 |         self._slim_ckpt_file_path = slim_ckpt_file_path
64 |         self._roi_feature_size = roi_feature_size
65 |         self._roi_head_keep_dropout_rate = roi_head_keep_dropout_rate
66 |         super().__init__(num_classes=num_classes,
67 |                          weight_decay=weight_decay,
68 |
69 |                          ratios=ratios,
70 |                          scales=scales,
71 |                          extractor_stride=extractor_stride,
72 |
73 |                          rpn_proposal_means=rpn_proposal_means,
74 |                          rpn_proposal_stds=rpn_proposal_stds,
75 |
76 |                          rpn_proposal_num_pre_nms_train=rpn_proposal_num_pre_nms_train,
77 |                          rpn_proposal_num_post_nms_train=rpn_proposal_num_post_nms_train,
78 |                          rpn_proposal_num_pre_nms_test=rpn_proposal_num_pre_nms_test,
79 |                          rpn_proposal_num_post_nms_test=rpn_proposal_num_post_nms_test,
80 |                          rpn_proposal_nms_iou_threshold=rpn_proposal_nms_iou_threshold,
81 |
82 |                          rpn_sigma=rpn_sigma,
83 |                          rpn_training_pos_iou_threshold=rpn_training_pos_iou_threshold,
84 |                          rpn_training_neg_iou_threshold=rpn_training_neg_iou_threshold,
85 |                          rpn_training_total_num_samples=rpn_training_total_num_samples,
86 |                          rpn_training_max_pos_samples=rpn_training_max_pos_samples,
87 |
88 |                          roi_proposal_means=roi_proposal_means,
89 |                          roi_proposal_stds=roi_proposal_stds,
90 |
91 |                          roi_pool_size=roi_pool_size,
92 |                          roi_pooling_max_pooling_flag=roi_pooling_max_pooling_flag,
93 |
94 |                          roi_sigma=roi_sigma,
95 |                          roi_training_pos_iou_threshold=roi_training_pos_iou_threshold,
96 |                          roi_training_neg_iou_threshold=roi_training_neg_iou_threshold,
97 |                          roi_training_total_num_samples=roi_training_total_num_samples,
98 |                          roi_training_max_pos_samples=roi_training_max_pos_samples,
99 |
100 |                          prediction_max_objects_per_image=prediction_max_objects_per_image,
101 |                          prediction_max_objects_per_class=prediction_max_objects_per_class,
102 |                          prediction_nms_iou_threshold=prediction_nms_iou_threshold,
103 |                          prediction_score_threshold=prediction_score_threshold,
104 |                          )
105 |
106 |     def _get_roi_head(self):
107 |         return Vgg16RoiHead(self.num_classes,
108 |                             roi_feature_size=self._roi_feature_size,
109 |                             keep_rate=self._roi_head_keep_dropout_rate,
110 |                             weight_decay=self.weight_decay,
111 |                             slim_ckpt_file_path=self._slim_ckpt_file_path)
112 |
113 |     def _get_extractor(self):
114 |         return Vgg16Extractor(weight_decay=self.weight_decay,
115 |                               slim_ckpt_file_path=self._slim_ckpt_file_path)
116 |
117 |     def load_tf_faster_rcnn_tf_weights(self, ckpt_file_path):
118 |         reader = tf.train.load_checkpoint(ckpt_file_path)
119 |         extractor = self.get_layer('vgg16')
120 |         extractor_dict = {
121 |             "vgg_16/conv1/conv1_1/": "block1_conv1",
122 |             "vgg_16/conv1/conv1_2/": "block1_conv2",
123 |
124 |             "vgg_16/conv2/conv2_1/": "block2_conv1",
125 |             "vgg_16/conv2/conv2_2/": "block2_conv2",
126 |
127 |             "vgg_16/conv3/conv3_1/": "block3_conv1",
128 |             "vgg_16/conv3/conv3_2/": "block3_conv2",
129 |             "vgg_16/conv3/conv3_3/": "block3_conv3",
130 |
"vgg_16/conv4/conv4_1/": "block4_conv1", 132 | "vgg_16/conv4/conv4_2/": "block4_conv2", 133 | "vgg_16/conv4/conv4_3/": "block4_conv3", 134 | 135 | "vgg_16/conv5/conv5_1/": "block5_conv1", 136 | "vgg_16/conv5/conv5_2/": "block5_conv2", 137 | "vgg_16/conv5/conv5_3/": "block5_conv3", 138 | } 139 | for slim_tensor_name_pre in extractor_dict.keys(): 140 | extractor.get_layer(name=extractor_dict[slim_tensor_name_pre]).set_weights([ 141 | reader.get_tensor(slim_tensor_name_pre + 'weights'), 142 | reader.get_tensor(slim_tensor_name_pre + 'biases'), 143 | ]) 144 | tf.logging.info('successfully loaded weights for {}'.format(extractor_dict[slim_tensor_name_pre])) 145 | 146 | rpn_head = self.get_layer('rpn_head') 147 | rpn_head_dict = { 148 | 'vgg_16/rpn_conv/3x3/': 'rpn_first_conv', 149 | 'vgg_16/rpn_cls_score/': 'rpn_score_conv', 150 | 'vgg_16/rpn_bbox_pred/': 'rpn_bbox_conv', 151 | } 152 | for slim_tensor_name_pre in rpn_head_dict.keys(): 153 | rpn_head.get_layer(rpn_head_dict[slim_tensor_name_pre]).set_weights([ 154 | reader.get_tensor(slim_tensor_name_pre + 'weights'), 155 | reader.get_tensor(slim_tensor_name_pre + 'biases') 156 | ]) 157 | tf.logging.info('successfully loaded weights for {}'.format(rpn_head_dict[slim_tensor_name_pre])) 158 | 159 | roi_head = self.get_layer('vgg16_roi_head') 160 | roi_head_dict = { 161 | 'vgg_16/fc6/': 'fc1', 162 | 'vgg_16/fc7/': 'fc2', 163 | 'vgg_16/bbox_pred/': 'roi_head_bboxes', 164 | 'vgg_16/cls_score/': 'roi_head_score' 165 | } 166 | for slim_tensor_name_pre in roi_head_dict.keys(): 167 | roi_head.get_layer(roi_head_dict[slim_tensor_name_pre]).set_weights([ 168 | reader.get_tensor(slim_tensor_name_pre + 'weights'), 169 | reader.get_tensor(slim_tensor_name_pre + 'biases') 170 | ]) 171 | tf.logging.info('successfully loaded weights for {}'.format(roi_head_dict[slim_tensor_name_pre])) 172 | 173 | def disable_biases(self): 174 | # vgg16 doesn't need to diable biases 175 | pass 176 | 177 | 178 | class Vgg16RoiHead(tf.keras.Model): 179 | def __init__(self, num_classes, 180 | roi_feature_size=(7, 7, 512), 181 | keep_rate=0.5, weight_decay=0.0005, 182 | slim_ckpt_file_path=None, ): 183 | super().__init__() 184 | self._num_classes = num_classes 185 | 186 | self._fc1 = layers.Dense(4096, name='fc1', activation='relu', 187 | kernel_initializer=tf.random_normal_initializer(0, 0.01), 188 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay), 189 | input_shape=[roi_feature_size] 190 | ) 191 | self._dropout1 = layers.Dropout(rate=1 - keep_rate) 192 | 193 | self._fc2 = layers.Dense(4096, name='fc2', activation='relu', 194 | kernel_initializer=tf.random_normal_initializer(0, 0.01), 195 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay), 196 | ) 197 | self._dropout2 = layers.Dropout(rate=1 - keep_rate) 198 | 199 | self._score_layer = layers.Dense(num_classes, name='roi_head_score', activation=None, 200 | kernel_initializer=tf.random_normal_initializer(0, 0.01), 201 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay)) 202 | self._roi_bboxes_layer = layers.Dense(4 * num_classes, name='roi_head_bboxes', activation=None, 203 | kernel_initializer=tf.random_normal_initializer(0, 0.001), 204 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay)) 205 | self._flatten_layer = layers.Flatten() 206 | 207 | self.build((None, *roi_feature_size)) 208 | 209 | if slim_ckpt_file_path is None: 210 | self._load_keras_weights() 211 | else: 212 | self._load_slim_weights(slim_ckpt_file_path) 213 | 214 | def _load_slim_weights(self, ckpt_file_path): 215 | reader = 
216 |         slim_to_keras = {
217 |             "vgg_16/fc6/": "fc1",
218 |             "vgg_16/fc7/": "fc2",
219 |         }
220 |
221 |         for slim_tensor_name_pre in slim_to_keras.keys():
222 |             cur_layer = self.get_layer(name=slim_to_keras[slim_tensor_name_pre])
223 |             cur_layer.set_weights([
224 |                 reader.get_tensor(slim_tensor_name_pre + 'weights').reshape(
225 |                     cur_layer.variables[0].get_shape().as_list()),
226 |                 reader.get_tensor(slim_tensor_name_pre + 'biases').reshape(
227 |                     cur_layer.variables[1].get_shape().as_list()),
228 |             ])
229 |         tf.logging.info('successfully loaded slim vgg weights for roi head.')
230 |
231 |     def _load_keras_weights(self):
232 |         weights_path = tf.keras.utils.get_file(
233 |             'vgg16_weights_tf_dim_ordering_tf_kernels.h5',
234 |             VGG_16_WEIGHTS_PATH,
235 |             cache_subdir='models',
236 |             file_hash='64373286793e3c8b2b4e3219cbf3544b')
237 |         self.load_weights(weights_path, by_name=True)
238 |         tf.logging.info('successfully loaded pretrained weights for roi head.')
239 |
240 |     def call(self, inputs, training=None):
241 |         """
242 |         Takes the roi pooling results as input
243 |         and predicts scores and bboxes for each pooled roi.
244 |         :param inputs: roi_features, [num_rois, pool_size, pool_size, num_channels]
245 |         :param training:
246 |         :param mask:
247 |         :return:
248 |         """
249 |         x = self._flatten_layer(inputs)
250 |         x = self._fc1(x)
251 |         x = self._dropout1(x, training)
252 |         x = self._fc2(x)
253 |         x = self._dropout2(x, training)
254 |         score = self._score_layer(x)
255 |         bboxes = self._roi_bboxes_layer(x)
256 |
257 |         return score, bboxes
258 |
259 |
260 | class Vgg16Extractor(tf.keras.Sequential):
261 |     def __init__(self, weight_decay=0.0001,
262 |                  slim_ckpt_file_path=None):
263 |         super().__init__(name='vgg16')
264 |         # Block 1
265 |         self.add(layers.Conv2D(64, (3, 3),
266 |                                activation='relu',
267 |                                padding='same',
268 |                                name='block1_conv1', trainable=False,
269 |                                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
270 |                                input_shape=(None, None, 3)))
271 |         self.add(layers.Conv2D(64, (3, 3),
272 |                                activation='relu',
273 |                                padding='same',
274 |                                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
275 |                                name='block1_conv2', trainable=False))
276 |         self.add(layers.MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool', padding='same'))
277 |
278 |         # Block 2
279 |         self.add(layers.Conv2D(128, (3, 3),
280 |                                activation='relu',
281 |                                padding='same',
282 |                                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
283 |                                name='block2_conv1', trainable=False))
284 |         self.add(layers.Conv2D(128, (3, 3),
285 |                                activation='relu',
286 |                                padding='same',
287 |                                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
288 |                                name='block2_conv2', trainable=False))
289 |         self.add(layers.MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool', padding='same'))
290 |
291 |         # Block 3
292 |         self.add(layers.Conv2D(256, (3, 3),
293 |                                activation='relu',
294 |                                padding='same',
295 |                                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
296 |                                name='block3_conv1'))
297 |         self.add(layers.Conv2D(256, (3, 3),
298 |                                activation='relu',
299 |                                padding='same',
300 |                                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
301 |                                name='block3_conv2'))
302 |         self.add(layers.Conv2D(256, (3, 3),
303 |                                activation='relu',
304 |                                padding='same',
305 |                                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
306 |                                name='block3_conv3'))
307 |         self.add(layers.MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool', padding='same'))
308 |
309 |         # Block 4
310 |         self.add(layers.Conv2D(512, (3, 3),
311 |                                activation='relu',
312 |
padding='same', 313 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay), 314 | name='block4_conv1')) 315 | self.add(layers.Conv2D(512, (3, 3), 316 | activation='relu', 317 | padding='same', 318 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay), 319 | name='block4_conv2')) 320 | self.add(layers.Conv2D(512, (3, 3), 321 | activation='relu', 322 | padding='same', 323 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay), 324 | name='block4_conv3')) 325 | self.add(layers.MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool', padding='same')) 326 | 327 | # Block 5 328 | self.add(layers.Conv2D(512, (3, 3), 329 | activation='relu', 330 | padding='same', 331 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay), 332 | name='block5_conv1')) 333 | self.add(layers.Conv2D(512, (3, 3), 334 | activation='relu', 335 | padding='same', 336 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay), 337 | name='block5_conv2')) 338 | self.add(layers.Conv2D(512, (3, 3), 339 | activation='relu', 340 | padding='same', 341 | kernel_regularizer=tf.keras.regularizers.l2(weight_decay), 342 | name='block5_conv3')) 343 | if slim_ckpt_file_path: 344 | self.load_slim_weights(slim_ckpt_file_path) 345 | else: 346 | self._load_keras_weights() 347 | 348 | def _load_keras_weights(self): 349 | weights_path = tf.keras.utils.get_file( 350 | 'vgg16_weights_tf_dim_ordering_tf_kernels.h5', 351 | VGG_16_WEIGHTS_PATH, 352 | cache_subdir='models', 353 | file_hash='64373286793e3c8b2b4e3219cbf3544b') 354 | self.load_weights(weights_path, by_name=True) 355 | tf.logging.info('successfully loaded keras vgg weights for vgg16 extractor.') 356 | 357 | def load_slim_weights(self, slim_ckpt_file_path): 358 | reader = tf.train.NewCheckpointReader(slim_ckpt_file_path) 359 | slim_to_keras = { 360 | "vgg_16/conv1/conv1_1/": "block1_conv1", 361 | "vgg_16/conv1/conv1_2/": "block1_conv2", 362 | 363 | "vgg_16/conv2/conv2_1/": "block2_conv1", 364 | "vgg_16/conv2/conv2_2/": "block2_conv2", 365 | 366 | "vgg_16/conv3/conv3_1/": "block3_conv1", 367 | "vgg_16/conv3/conv3_2/": "block3_conv2", 368 | "vgg_16/conv3/conv3_3/": "block3_conv3", 369 | 370 | "vgg_16/conv4/conv4_1/": "block4_conv1", 371 | "vgg_16/conv4/conv4_2/": "block4_conv2", 372 | "vgg_16/conv4/conv4_3/": "block4_conv3", 373 | 374 | "vgg_16/conv5/conv5_1/": "block5_conv1", 375 | "vgg_16/conv5/conv5_2/": "block5_conv2", 376 | "vgg_16/conv5/conv5_3/": "block5_conv3", 377 | } 378 | for slim_tensor_name_pre in slim_to_keras.keys(): 379 | if slim_tensor_name_pre == 'vgg_16/conv1/conv1_1/': 380 | weights = reader.get_tensor(slim_tensor_name_pre + 'weights')[:, :, ::-1, :] 381 | self.get_layer(name=slim_to_keras[slim_tensor_name_pre]).set_weights([ 382 | weights, 383 | reader.get_tensor(slim_tensor_name_pre + 'biases'), 384 | ]) 385 | else: 386 | self.get_layer(name=slim_to_keras[slim_tensor_name_pre]).set_weights([ 387 | reader.get_tensor(slim_tensor_name_pre + 'weights'), 388 | reader.get_tensor(slim_tensor_name_pre + 'biases'), 389 | ]) 390 | tf.logging.info('successfully loaded slim vgg weights for vgg16 extractor.') 391 | -------------------------------------------------------------------------------- /object_detection/model/fpn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irvingzhang0512/tf_eager_object_detection/e977217cb9d5fc2975292d55b1e5cae484f9e0dd/object_detection/model/fpn/__init__.py -------------------------------------------------------------------------------- 
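Editor's note: `losses.py` below implements the smooth L1 loss shared by the RPN and RoI heads: 0.5 * sigma^2 * x^2 where |x| < 1/sigma^2, and |x| - 0.5/sigma^2 elsewhere. The following NumPy sketch is our own illustration (`smooth_l1_np` is not part of this repo) and just reproduces the piecewise form so the constants can be sanity-checked:

```python
import numpy as np


def smooth_l1_np(x, sigma=1.0):
    # Same sigma convention as smooth_l1_loss in losses.py below.
    sigma_2 = sigma ** 2
    abs_x = np.abs(x)
    quadratic = 0.5 * sigma_2 * x ** 2   # used where |x| < 1 / sigma_2
    linear = abs_x - 0.5 / sigma_2       # used everywhere else
    return np.where(abs_x < 1.0 / sigma_2, quadratic, linear)


# With sigma=3.0 (the rpn_sigma default), the quadratic zone is |x| < 1/9:
print(smooth_l1_np(np.array([0.05, 0.5, 2.0]), sigma=3.0))
# -> [0.01125    0.44444444 1.94444444]
```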
/object_detection/model/losses.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def cls_loss(logits, labels, weight=1):
5 |     """
6 |     Cross entropy classification loss.
7 |     :param weight:
8 |     :param logits: [num_anchors, 2]
9 |     :param labels: [num_anchors, ], values in [0, num_classes) (roi training) or [0, 1] (rpn training)
10 |     :return:
11 |     """
12 |     return tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=tf.to_int32(labels),
13 |                                                   weights=weight)
14 |
15 |
16 | def smooth_l1_loss(bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=1.0, dim=[1]):
17 |     sigma_2 = sigma ** 2
18 |     box_diff = bbox_pred - bbox_targets
19 |     in_box_diff = bbox_inside_weights * box_diff
20 |     abs_in_box_diff = tf.abs(in_box_diff)
21 |     sign = tf.stop_gradient(tf.to_float(tf.less(abs_in_box_diff, 1. / sigma_2)))
22 |     in_loss_box = tf.pow(in_box_diff, 2) * (sigma_2 / 2.) * sign + (abs_in_box_diff - (0.5 / sigma_2)) * (1. - sign)
23 |     out_loss_box = bbox_outside_weights * in_loss_box
24 |     loss_box = tf.reduce_mean(tf.reduce_sum(
25 |         out_loss_box,
26 |         axis=dim
27 |     ))
28 |     return loss_box
--------------------------------------------------------------------------------
/object_detection/model/model_factory.py:
--------------------------------------------------------------------------------
1 | from object_detection.model.fpn.resnet_fpn import ResnetV1Fpn
2 | from object_detection.model.faster_rcnn.resnet_faster_rcnn import ResNetFasterRcnn
3 | from object_detection.model.faster_rcnn.vgg16_faster_rcnn import Vgg16FasterRcnn
4 |
5 | __all__ = ['model_factory']
6 |
7 |
8 | def model_factory(model_type, backbone, config):
9 |     if model_type == 'faster_rcnn':
10 |         if backbone == 'vgg16':
11 |             return _get_faster_rcnn_vgg16_model(None, config)
12 |         elif backbone == 'resnet50':
13 |             return _get_faster_rcnn_resnet_model(50, config)
14 |         elif backbone == 'resnet101':
15 |             return _get_faster_rcnn_resnet_model(101, config)
16 |         elif backbone == 'resnet152':
17 |             return _get_faster_rcnn_resnet_model(152, config)
18 |         else:
19 |             raise ValueError('unknown backbone {}'.format(backbone))
20 |     elif model_type == 'fpn':
21 |         if backbone == 'resnet50':
22 |             return _get_fpn_resnet_model(50, config)
23 |         elif backbone == 'resnet101':
24 |             return _get_fpn_resnet_model(101, config)
25 |         elif backbone == 'resnet152':
26 |             return _get_fpn_resnet_model(152, config)
27 |         else:
28 |             raise ValueError('unknown backbone {}'.format(backbone))
29 |     else:
30 |         raise ValueError('unknown model type {}'.format(model_type))
31 |
32 |
33 | def _get_fpn_resnet_model(depth, config):
34 |     return ResnetV1Fpn(
35 |         depth=depth,
36 |         roi_head_keep_dropout_rate=config['roi_head_keep_dropout_rate'],
37 |
38 |         roi_feature_size=config['resnet_roi_feature_size'],
39 |         num_classes=config['num_classes'],
40 |         weight_decay=config['weight_decay'],
41 |
42 |         level_name_list=config['level_name_list'],
43 |         min_level=config['min_level'],
44 |         max_level=config['max_level'],
45 |         top_down_dims=config['top_down_dims'],
46 |
47 |         anchor_stride_list=config['anchor_stride_list'],
48 |         base_anchor_size_list=config['base_anchor_size_list'],
49 |         ratios=config['ratios'],
50 |         scales=config['scales'],
51 |
52 |         rpn_proposal_means=config['rpn_proposal_means'],
53 |         rpn_proposal_stds=config['rpn_proposal_stds'],
54 |
55 |         rpn_proposal_num_pre_nms_train=config['rpn_proposal_train_pre_nms_sample_number'],
56 |         rpn_proposal_num_post_nms_train=config['rpn_proposal_train_after_nms_sample_number'],
57 |
rpn_proposal_num_pre_nms_test=config['rpn_proposal_test_pre_nms_sample_number'], 58 | rpn_proposal_num_post_nms_test=config['rpn_proposal_test_after_nms_sample_number'], 59 | rpn_proposal_nms_iou_threshold=config['rpn_proposal_nms_iou_threshold'], 60 | 61 | rpn_sigma=config['rpn_sigma'], 62 | rpn_training_pos_iou_threshold=config['rpn_pos_iou_threshold'], 63 | rpn_training_neg_iou_threshold=config['rpn_neg_iou_threshold'], 64 | rpn_training_total_num_samples=config['rpn_total_sample_number'], 65 | rpn_training_max_pos_samples=config['rpn_pos_sample_max_number'], 66 | 67 | roi_proposal_means=config['roi_proposal_means'], 68 | roi_proposal_stds=config['roi_proposal_stds'], 69 | 70 | roi_pool_size=config['roi_pooling_size'], 71 | roi_pooling_max_pooling_flag=config['roi_pooling_max_pooling_flag'], 72 | 73 | roi_sigma=config['roi_sigma'], 74 | roi_training_pos_iou_threshold=config['roi_pos_iou_threshold'], 75 | roi_training_neg_iou_threshold=config['roi_neg_iou_threshold'], 76 | roi_training_total_num_samples=config['roi_total_sample_number'], 77 | roi_training_max_pos_samples=config['roi_pos_sample_max_number'], 78 | 79 | prediction_max_objects_per_image=config['max_objects_per_image'], 80 | prediction_max_objects_per_class=config['max_objects_per_class_per_image'], 81 | prediction_nms_iou_threshold=config['prediction_nms_iou_threshold'], 82 | prediction_score_threshold=config['prediction_score_threshold'], 83 | ) 84 | 85 | 86 | def _get_faster_rcnn_resnet_model(depth, config): 87 | return ResNetFasterRcnn( 88 | depth=depth, 89 | roi_feature_size=config['resnet_roi_feature_size'], 90 | 91 | num_classes=config['num_classes'], 92 | weight_decay=config['weight_decay'], 93 | 94 | ratios=config['ratios'], 95 | scales=config['scales'], 96 | extractor_stride=config['extractor_stride'], 97 | 98 | rpn_proposal_means=config['rpn_proposal_means'], 99 | rpn_proposal_stds=config['rpn_proposal_stds'], 100 | 101 | rpn_proposal_num_pre_nms_train=config['rpn_proposal_train_pre_nms_sample_number'], 102 | rpn_proposal_num_post_nms_train=config['rpn_proposal_train_after_nms_sample_number'], 103 | rpn_proposal_num_pre_nms_test=config['rpn_proposal_test_pre_nms_sample_number'], 104 | rpn_proposal_num_post_nms_test=config['rpn_proposal_test_after_nms_sample_number'], 105 | rpn_proposal_nms_iou_threshold=config['rpn_proposal_nms_iou_threshold'], 106 | 107 | rpn_sigma=config['rpn_sigma'], 108 | rpn_training_pos_iou_threshold=config['rpn_pos_iou_threshold'], 109 | rpn_training_neg_iou_threshold=config['rpn_neg_iou_threshold'], 110 | rpn_training_total_num_samples=config['rpn_total_sample_number'], 111 | rpn_training_max_pos_samples=config['rpn_pos_sample_max_number'], 112 | 113 | roi_proposal_means=config['roi_proposal_means'], 114 | roi_proposal_stds=config['roi_proposal_stds'], 115 | 116 | roi_pool_size=config['roi_pooling_size'], 117 | roi_pooling_max_pooling_flag=config['resnet_roi_pooling_max_pooling_flag'], 118 | 119 | roi_sigma=config['roi_sigma'], 120 | roi_training_pos_iou_threshold=config['roi_pos_iou_threshold'], 121 | roi_training_neg_iou_threshold=config['roi_neg_iou_threshold'], 122 | roi_training_total_num_samples=config['roi_total_sample_number'], 123 | roi_training_max_pos_samples=config['roi_pos_sample_max_number'], 124 | 125 | prediction_max_objects_per_image=config['max_objects_per_image'], 126 | prediction_max_objects_per_class=config['max_objects_per_class_per_image'], 127 | prediction_nms_iou_threshold=config['prediction_nms_iou_threshold'], 128 | 
prediction_score_threshold=config['prediction_score_threshold'], 129 | ) 130 | 131 | 132 | def _get_faster_rcnn_vgg16_model(slim_ckpt_file_path, config): 133 | return Vgg16FasterRcnn( 134 | slim_ckpt_file_path=slim_ckpt_file_path, 135 | roi_head_keep_dropout_rate=config['roi_head_keep_dropout_rate'], 136 | roi_feature_size=config['vgg16_roi_feature_size'], 137 | 138 | num_classes=config['num_classes'], 139 | weight_decay=config['weight_decay'], 140 | 141 | ratios=config['ratios'], 142 | scales=config['scales'], 143 | extractor_stride=config['extractor_stride'], 144 | 145 | rpn_proposal_means=config['rpn_proposal_means'], 146 | rpn_proposal_stds=config['rpn_proposal_stds'], 147 | 148 | rpn_proposal_num_pre_nms_train=config['rpn_proposal_train_pre_nms_sample_number'], 149 | rpn_proposal_num_post_nms_train=config['rpn_proposal_train_after_nms_sample_number'], 150 | rpn_proposal_num_pre_nms_test=config['rpn_proposal_test_pre_nms_sample_number'], 151 | rpn_proposal_num_post_nms_test=config['rpn_proposal_test_after_nms_sample_number'], 152 | rpn_proposal_nms_iou_threshold=config['rpn_proposal_nms_iou_threshold'], 153 | 154 | rpn_sigma=config['rpn_sigma'], 155 | rpn_training_pos_iou_threshold=config['rpn_pos_iou_threshold'], 156 | rpn_training_neg_iou_threshold=config['rpn_neg_iou_threshold'], 157 | rpn_training_total_num_samples=config['rpn_total_sample_number'], 158 | rpn_training_max_pos_samples=config['rpn_pos_sample_max_number'], 159 | 160 | roi_proposal_means=config['roi_proposal_means'], 161 | roi_proposal_stds=config['roi_proposal_stds'], 162 | 163 | roi_pool_size=config['roi_pooling_size'], 164 | roi_pooling_max_pooling_flag=config['vgg16_roi_pooling_max_pooling_flag'], 165 | 166 | roi_sigma=config['roi_sigma'], 167 | roi_training_pos_iou_threshold=config['roi_pos_iou_threshold'], 168 | roi_training_neg_iou_threshold=config['roi_neg_iou_threshold'], 169 | roi_training_total_num_samples=config['roi_total_sample_number'], 170 | roi_training_max_pos_samples=config['roi_pos_sample_max_number'], 171 | 172 | prediction_max_objects_per_image=config['max_objects_per_image'], 173 | prediction_max_objects_per_class=config['max_objects_per_class_per_image'], 174 | prediction_nms_iou_threshold=config['prediction_nms_iou_threshold'], 175 | prediction_score_threshold=config['prediction_score_threshold'], 176 | ) 177 | -------------------------------------------------------------------------------- /object_detection/model/prediction.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from object_detection.utils.bbox_transform import decode_bbox_with_mean_and_std 4 | from object_detection.utils.bbox_tf import bboxes_clip_filter as bboxes_clip_filter_tf 5 | 6 | 7 | __all__ = ['post_ops_prediction'] 8 | 9 | 10 | def predict_after_roi(roi_scores_softmax, roi_txtytwth, rois, image_shape, 11 | target_means, target_stds, 12 | max_num_per_class=5, 13 | max_num_per_image=5, 14 | nms_iou_threshold=0.3, 15 | score_threshold=0.3, 16 | extractor_stride=16, 17 | ): 18 | """ 19 | copy from https://github.com/Viredery/tf-eager-fasterrcnn/blob/master/detection/models/bbox_heads/bbox_head.py 20 | :param roi_scores_softmax: 21 | :param roi_txtytwth: 22 | :param rois: 23 | :param image_shape: 24 | :param target_means: 25 | :param target_stds: 26 | :param max_num_per_class: 27 | :param max_num_per_image: 28 | :param nms_iou_threshold: 29 | :param score_threshold: 30 | :param extractor_stride: 31 | :return: 32 | """ 33 | 34 | # Class IDs per ROI 35 | 
class_ids = tf.argmax(roi_scores_softmax, axis=1, output_type=tf.int32) 36 | 37 | # Class probability of the top class of each ROI 38 | indices = tf.stack([tf.range(roi_scores_softmax.shape[0]), class_ids], axis=1) 39 | class_scores = tf.gather_nd(roi_scores_softmax, indices) 40 | # Class-specific bounding box deltas 41 | deltas_specific = tf.gather_nd(roi_txtytwth, indices) 42 | # Apply bounding box deltas 43 | # Shape: [num_rois, (y1, x1, y2, x2)] in normalized coordinates 44 | refined_rois = decode_bbox_with_mean_and_std(rois, deltas_specific, 45 | target_means, target_stds) 46 | refined_rois, refined_rois_idx = bboxes_clip_filter_tf(refined_rois, 0, image_shape[0], image_shape[1], 47 | min_edge=None) 48 | # TODO: remove min edge 49 | 50 | # Filter out background boxes 51 | keep = tf.where(class_ids > 0)[:, 0] 52 | 53 | # Filter out low confidence boxes 54 | score_keep = tf.where(class_scores >= score_threshold)[:, 0] 55 | keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), 56 | tf.expand_dims(score_keep, 0)) 57 | keep = tf.sparse_tensor_to_dense(keep)[0] 58 | 59 | # Apply per-class NMS 60 | # 1. Prepare variables 61 | pre_nms_class_ids = tf.gather(class_ids, keep) 62 | pre_nms_scores = tf.gather(class_scores, keep) 63 | pre_nms_rois = tf.gather(refined_rois, keep) 64 | unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0] 65 | 66 | def nms_keep_map(class_id): 67 | # Indices of ROIs of the given class 68 | ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0] 69 | # Apply NMS 70 | class_keep = tf.image.non_max_suppression( 71 | tf.gather(pre_nms_rois, ixs), 72 | tf.gather(pre_nms_scores, ixs), 73 | max_output_size=max_num_per_class, 74 | iou_threshold=nms_iou_threshold) 75 | # Map indices 76 | class_keep = tf.gather(keep, tf.gather(ixs, class_keep)) 77 | tf.logging.debug('nms keep map is {}'.format(class_keep)) 78 | return class_keep 79 | 80 | # 2. Map over class IDs 81 | nms_keep = [] 82 | for i in range(unique_pre_nms_class_ids.shape[0]): 83 | nms_keep.append(nms_keep_map(unique_pre_nms_class_ids[i])) 84 | 85 | if len(nms_keep) == 0: 86 | return None, None, None 87 | nms_keep = tf.concat(nms_keep, axis=0) 88 | 89 | # 3. 
Compute intersection between keep and nms_keep 90 | keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), 91 | tf.expand_dims(nms_keep, 0)) 92 | keep = tf.sparse_tensor_to_dense(keep)[0] 93 | # Keep top detections 94 | roi_count = max_num_per_image 95 | class_scores_keep = tf.gather(class_scores, keep) 96 | num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count) 97 | top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1] 98 | keep = tf.gather(keep, top_ids) 99 | 100 | return tf.gather(refined_rois, keep), tf.gather(class_ids, keep), tf.gather(class_scores, keep) 101 | 102 | 103 | def post_ops_prediction(roi_scores_softmax, roi_txtytwth, rois, image_shape, 104 | target_means, target_stds, 105 | max_num_per_class=50, 106 | max_num_per_image=150, 107 | nms_iou_threshold=0.3, 108 | score_threshold=0.05, 109 | extractor_stride=16, 110 | num_classes=21, 111 | ): 112 | """ 113 | 114 | :param roi_scores_softmax: [num_rois, num_classes] 115 | :param roi_txtytwth: [num_rois, num_classes, 4] 116 | :param rois: [num_rois, 4] 117 | :param image_shape: [2,] 118 | :param target_means: [4,] 119 | :param target_stds: [4,] 120 | :param max_num_per_class: 121 | :param max_num_per_image: 122 | :param nms_iou_threshold: 123 | :param score_threshold: 124 | :param extractor_stride: 125 | :param num_classes: 126 | :return: 127 | """ 128 | if target_stds is None: 129 | target_stds = [1, 1, 1, 1] 130 | if target_means is None: 131 | target_means = [0, 0, 0, 0] 132 | res_scores = [] 133 | res_bboxes = [] 134 | res_cls = [] 135 | for i in range(1, num_classes): 136 | inds = tf.where(roi_scores_softmax[:, i] > score_threshold)[:, 0] 137 | cls_score = tf.gather(roi_scores_softmax[:, i], inds) 138 | final_bboxes = decode_bbox_with_mean_and_std(tf.gather(rois, inds), 139 | tf.gather(roi_txtytwth[:, i, :], inds), 140 | target_means, target_stds) 141 | final_bboxes, clip_selected_idx = bboxes_clip_filter_tf(final_bboxes, 0, 142 | image_shape[0], image_shape[1], 143 | extractor_stride) 144 | cls_score = tf.gather(cls_score, clip_selected_idx) 145 | 146 | keep = tf.image.non_max_suppression(final_bboxes, cls_score, max_num_per_class, iou_threshold=nms_iou_threshold) 147 | if tf.size(keep).numpy() == 0: 148 | continue 149 | res_scores.append(tf.gather(cls_score, keep)) 150 | res_bboxes.append(tf.gather(final_bboxes, keep)) 151 | res_cls.append(tf.ones_like(keep, dtype=tf.int32) * i) 152 | 153 | if len(res_scores) == 0: 154 | return None, None, None 155 | 156 | scores_after_nms = tf.concat(res_scores, axis=0) 157 | bboxes_after_nms = tf.concat(res_bboxes, axis=0) 158 | cls_after_nms = tf.concat(res_cls, axis=0) 159 | 160 | _, final_idx = tf.nn.top_k(scores_after_nms, k=tf.minimum(max_num_per_image, tf.size(scores_after_nms)), 161 | sorted=False) 162 | return tf.gather(bboxes_after_nms, final_idx), tf.gather(cls_after_nms, final_idx), tf.gather(scores_after_nms, 163 | final_idx) 164 | -------------------------------------------------------------------------------- /object_detection/model/proposal_target.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | from object_detection.utils.bbox_tf import pairwise_iou 5 | from object_detection.utils.bbox_transform import encode_bbox_with_mean_and_std 6 | 7 | 8 | class ProposalTarget(tf.keras.Model): 9 | def __init__(self, 10 | num_classes=21, 11 | pos_iou_threshold=0.5, 12 | neg_iou_threshold=0.5, 13 | total_num_samples=128, 14 | max_pos_samples=32, 15 | 
target_means=None,
16 | target_stds=None):
17 | super().__init__()
18 |
19 | self._num_classes = num_classes
20 | self._pos_iou_threshold = pos_iou_threshold
21 | self._neg_iou_threshold = neg_iou_threshold
22 | self._total_num_samples = total_num_samples
23 | self._max_pos_samples = max_pos_samples
24 |
25 | if target_stds is None:
26 | target_stds = [1, 1, 1, 1]
27 | if target_means is None:
28 | target_means = [0, 0, 0, 0]
29 | self._target_means = target_means
30 | self._target_stds = target_stds
31 |
32 | def call(self, inputs, training=None, mask=None):
33 | """
34 | Not trainable.
35 | Generates the data used to train the roi head.
36 | Overall procedure:
37 | 1. Compute the iou between rois and gt_bboxes (the bboxes from the input data).
38 | 2. Mark a roi as positive when its max_iou with gt_bboxes > pos_iou_threshold, and as negative when max_iou < neg_iou_threshold.
39 | 3. The numbers of positives and negatives are capped:
40 | at most max_pos_samples positives;
41 | the total of positives and negatives does not exceed total_num_samples;
42 | if there are too few negatives, they are padded by sampling with numpy.random.choice (see the standalone sketch after this file).
43 | 4. Five results are returned:
44 | 1) rois [128, 4]
45 | 2) the label of each roi [128,]; 0 marks a negative, >0 a positive
46 | 3) the txtytwth targets of each roi [128, num_classes * 4]
47 | 4) bbox_inside_weights for the smooth l1 loss [128, num_classes * 4]
48 | 5) bbox_outside_weights for the smooth l1 loss [128, num_classes * 4]
49 | :param inputs:
50 | :param training:
51 | :param mask:
52 | :return:
53 | """
54 | rois, gt_bboxes, gt_labels = inputs
55 |
56 | iou = pairwise_iou(rois, gt_bboxes)  # [rois_size, gt_bboxes_size]
57 | max_overlaps = tf.reduce_max(iou, axis=1)  # [rois_size, ]
58 | gt_assignment = tf.argmax(iou, axis=1)  # [rois_size, ]
59 | labels = tf.gather(gt_labels, gt_assignment)  # [rois_size, ]
60 |
61 | # split the rois into foreground / background by the iou conditions
62 | fg_inds = tf.where(max_overlaps >= self._pos_iou_threshold)[:, 0]
63 | bg_inds = tf.where(tf.logical_and(max_overlaps < self._pos_iou_threshold,
64 | max_overlaps >= self._neg_iou_threshold))[:, 0]
65 |
66 | # subsample foreground / background
67 | if tf.size(fg_inds) > self._max_pos_samples:
68 | fg_inds = tf.random_shuffle(fg_inds)[:self._max_pos_samples]
69 | if tf.size(bg_inds) > self._total_num_samples - tf.size(fg_inds):
70 | # more bg samples than needed: pick a random subset
71 | bg_inds = tf.random_shuffle(bg_inds)[:(self._total_num_samples - tf.size(fg_inds))]
72 | elif tf.size(bg_inds).numpy() == (self._total_num_samples - tf.size(fg_inds)).numpy():
73 | pass
74 | else:
75 | # fewer bg samples than needed: pad by sampling with replacement
76 | target_size = (self._total_num_samples - tf.size(fg_inds)).numpy()
77 | bg_inds = np.random.choice(bg_inds.numpy(), size=int(target_size), replace=True)
78 |
79 | tf.logging.debug('proposal target generate %d fgs and %d bgs.' % (tf.size(fg_inds), tf.size(bg_inds)))
80 |
81 | keep_inds = tf.concat([fg_inds, bg_inds], axis=0)
82 | final_rois = tf.gather(rois, keep_inds)  # rois[keep_inds]
83 | final_labels = tf.gather(labels, keep_inds)  # labels[keep_inds]
84 | # labels[fg_inds_size:] = 0
85 | final_labels = tf.scatter_update(tf.Variable(final_labels),
86 | tf.range(tf.size(fg_inds), tf.size(keep_inds), dtype=tf.int32), 0)
87 |
88 | # inside weights are set only for positives; everything else stays 0
89 | bbox_inside_weights = tf.zeros((tf.size(keep_inds), self._num_classes, 4), dtype=tf.float32)
90 | if tf.size(fg_inds) > 0:
91 | # memory leak bug for tf.scatter_nd_update
92 | # https://github.com/tensorflow/tensorflow/issues/27288
93 | # cur_index = tf.stack([tf.range(tf.size(fg_inds)), tf.gather(labels, fg_inds)], axis=1)
94 | # bbox_inside_weights = tf.scatter_nd_update(tf.Variable(bbox_inside_weights),
95 | # cur_index,
96 | # tf.ones([tf.size(fg_inds), 4]))
97 | bbox_inside_weights = bbox_inside_weights.numpy()
98 | for idx, fg_ind in enumerate(fg_inds.numpy()):
99 | bbox_inside_weights[idx, labels[fg_ind]] = 1  # look up the label by roi index fg_ind (not idx), matching the commented scatter_nd version
100 | bbox_inside_weights = tf.reshape(bbox_inside_weights, [-1, self._num_classes * 4])
101 |
102 | # bbox targets are set only for positives; everything else stays 0
103 | final_bbox_targets = tf.zeros((tf.size(keep_inds), self._num_classes, 4), dtype=tf.float32)
104 | if tf.size(fg_inds) > 0:
105 | bbox_targets = encode_bbox_with_mean_and_std(tf.gather(final_rois, tf.range(tf.size(fg_inds))),
106 | tf.gather(gt_bboxes, tf.gather(gt_assignment, fg_inds)),
107 | target_stds=self._target_stds, target_means=self._target_means,
108 | )
109 | # memory leak bug for tf.scatter_nd_update
110 | # https://github.com/tensorflow/tensorflow/issues/27288
111 | # final_bbox_targets = tf.scatter_nd_update(tf.Variable(final_bbox_targets),
112 | # tf.stack([tf.range(tf.size(fg_inds)),
113 | # tf.gather(labels, fg_inds)], axis=1), bbox_targets)
114 | final_bbox_targets = final_bbox_targets.numpy()
115 | bbox_targets = bbox_targets.numpy()
116 | for idx, fg_ind in enumerate(fg_inds.numpy()):
117 | final_bbox_targets[idx, labels[fg_ind]] = bbox_targets[idx]  # same fix: index labels by fg_ind
118 |
119 | final_bbox_targets = tf.reshape(final_bbox_targets, [-1, self._num_classes * 4])
120 |
121 | # this one does not seem to actually be used
122 | bbox_outside_weights = tf.ones_like(bbox_inside_weights, dtype=tf.float32)
123 | return tf.stop_gradient(final_rois), tf.stop_gradient(final_labels), tf.stop_gradient(final_bbox_targets), \
124 | tf.stop_gradient(bbox_inside_weights), tf.stop_gradient(bbox_outside_weights)
125 |
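For reference, a minimal NumPy sketch of the fg/bg sampling quota implemented above (standalone and illustrative; the thresholds here are example values, not the class defaults):

import numpy as np

def sample_fg_bg(max_overlaps, pos_iou=0.5, neg_iou=0.1, total=128, max_pos=32):
    """Subsample fg/bg roi indices from per-roi max IoU, mirroring ProposalTarget."""
    fg_inds = np.where(max_overlaps >= pos_iou)[0]
    bg_inds = np.where((max_overlaps < pos_iou) & (max_overlaps >= neg_iou))[0]
    if fg_inds.size > max_pos:
        fg_inds = np.random.permutation(fg_inds)[:max_pos]
    num_bg = total - fg_inds.size
    if bg_inds.size > num_bg:
        bg_inds = np.random.permutation(bg_inds)[:num_bg]
    elif bg_inds.size < num_bg:
        # too few negatives: pad by sampling with replacement, as above
        bg_inds = np.random.choice(bg_inds, size=num_bg, replace=True)
    return fg_inds, bg_inds

fg, bg = sample_fg_bg(np.random.rand(300))
assert fg.size + bg.size == 128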
-------------------------------------------------------------------------------- /object_detection/model/region_proposal.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from object_detection.utils.bbox_transform import decode_bbox_with_mean_and_std
3 | from object_detection.utils.bbox_tf import bboxes_clip_filter
4 | from tensorflow.python.platform import tf_logging
5 |
6 | layers = tf.keras.layers
7 |
8 | __all__ = ['RegionProposal']
9 |
10 |
11 | class RegionProposal(tf.keras.Model):
12 | def __init__(self,
13 | num_anchors=9,
14 | num_pre_nms_train=12000,
15 | num_post_nms_train=2000,
16 | num_pre_nms_test=6000,
17 | num_post_nms_test=300,
18 | nms_iou_threshold=0.7,
19 | target_means=None,
20 | target_stds=None):
21 | super().__init__()
22 |
23 | self._num_anchors = num_anchors
24 | self._num_pre_nms_train = num_pre_nms_train
25 | self._num_post_nms_train = num_post_nms_train
26 | self._num_pre_nms_test = num_pre_nms_test
27 | self._num_post_nms_test = num_post_nms_test
28 | self._nms_iou_threshold = nms_iou_threshold
29 |
30 | if target_stds is None:
31 | target_stds = [1, 1, 1, 1]
32 | if target_means is None:
33 | target_means = [0, 0, 0, 0]
34 | self._target_means = target_means
35 | self._target_stds = target_stds
36 |
37 | def call(self, inputs, training=None, mask=None):
38 | """
39 | Generates the rpn output: a set of bboxes used by the subsequent roi pooling.
40 | Overall procedure:
41 | 1. Apply the rpn predictions to the anchors to obtain all decoded bboxes.
42 | 2. Post-process (clip) the decoded bboxes.
43 | 3. Keep the top num_pre_nms bboxes by rpn_score (currently disabled below).
44 | 4. Run nms.
45 | 5. Keep num_post_nms bboxes, ranked by rpn_score, as the proposals (a usage sketch with dummy inputs follows this file).
46 | :param inputs:
47 | :param training:
48 | :param mask:
49 | :return:
50 | """
51 | # bboxes_txtytwth shape: [num_anchors*feature_width*feature_height, 4]
52 | # anchors shape: [num_anchors*feature_width*feature_height, 4]
53 | # scores shape: [feature_width*feature_height*num_anchors,]
54 | # image_shape shape: [2, ]
55 | bboxes_txtytwth, anchors, scores, image_shape = inputs
56 |
57 | # 1. apply the rpn predictions to the anchors to obtain all decoded bboxes
58 | # [num_anchors*feature_width*feature_height, 4]
59 | decoded_bboxes = decode_bbox_with_mean_and_std(anchors, bboxes_txtytwth,
60 | self._target_means, self._target_stds)
61 |
62 | # 2. clip the decoded bboxes to the image
63 | decoded_bboxes, _ = bboxes_clip_filter(decoded_bboxes, 0, image_shape[0], image_shape[1])
64 |
65 | # # 3. keep the top num_pre_nms bboxes by rpn_score
66 | # num_pre_nms = self._num_pre_nms_train if training else self._num_pre_nms_test
67 | # cur_top_k = tf.minimum(num_pre_nms, tf.size(scores))
68 | # scores, selected_idx = tf.nn.top_k(scores, k=cur_top_k, sorted=False)
69 | # decoded_bboxes = tf.gather(decoded_bboxes, selected_idx)
70 |
71 | # 4. run nms
72 | # 5. nms keeps at most num_post_nms bboxes, selected in descending score order, as the proposals
73 | num_post_nms = self._num_post_nms_train if training else self._num_post_nms_test
74 | selected_idx = tf.image.non_max_suppression(tf.to_float(decoded_bboxes), scores,
75 | max_output_size=num_post_nms,
76 | iou_threshold=self._nms_iou_threshold)
77 |
78 | tf_logging.debug('rpn proposal net generate %d proposals' % tf.size(selected_idx))
79 |
80 | # the proposals do not take part in training, hence tf.stop_gradient
81 | return tf.stop_gradient(tf.gather(decoded_bboxes, selected_idx))
82 |
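A minimal usage sketch of RegionProposal with dummy eager-mode inputs (shapes follow the comments in call(); all values are illustrative):

import tensorflow as tf
from object_detection.model.region_proposal import RegionProposal

tf.enable_eager_execution()

num = 9 * 38 * 50  # num_anchors * feature_height * feature_width
bboxes_txtytwth = tf.zeros([num, 4])             # zero deltas: proposals equal the anchors
anchors = tf.random_uniform([num, 4], 0., 600.)  # dummy anchors
scores = tf.random_uniform([num])                # dummy rpn scores
rpn = RegionProposal(num_anchors=9)
proposals = rpn((bboxes_txtytwth, anchors, scores, [600, 800]), training=False)
# proposals: [<= num_post_nms_test, 4] clipped bboxes with gradients stopped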
-------------------------------------------------------------------------------- /object_detection/model/roi_pooling.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | layers = tf.keras.layers
4 |
5 | __all__ = ['RoiPoolingCropAndResize', 'RoiPoolingRoiAlign', 'RoiPoolingCropAndResize2']
6 |
7 |
8 | class RoiPoolingCropAndResize2(tf.keras.Model):
9 | def __init__(self, pool_size):
10 | super().__init__()
11 | self._pool_size = pool_size
12 | self._concat_layer = layers.Concatenate(axis=0)
13 | self._max_pool = layers.MaxPooling2D(padding='same')
14 |
15 | def call(self, inputs, training=None, mask=None):
16 | """
17 | Takes the backbone feature map and the rpn proposals (the output of RegionProposal),
18 | and returns the roi pooling result: a fixed-size feature map cropped for every rpn proposal.
19 | :param inputs:
20 | :param training:
21 | :param mask:
22 | :return:
23 | """
24 | # [1, height, width, channels] [num_rois, 4]
25 | shared_layers, rois, image_shape = inputs
26 | h, w = tf.to_float(image_shape[0]), tf.to_float(image_shape[1])
27 |
28 | batch_ids = tf.zeros([tf.shape(rois)[0]], dtype=tf.int32)
29 | roi_channels = tf.split(rois, 4, axis=1)
30 | bboxes = tf.concat([
31 | roi_channels[1] / tf.to_float(h),
32 | roi_channels[0] / tf.to_float(w),
33 | roi_channels[3] / tf.to_float(h),
34 | roi_channels[2] / tf.to_float(w),
35 | ], axis=1)
36 | pre_pool_size = self._pool_size * 2
37 | crops = tf.image.crop_and_resize(shared_layers,
38 | tf.stop_gradient(bboxes),
39 | box_ind=tf.to_int32(batch_ids),
40 | crop_size=[pre_pool_size, pre_pool_size],
41 | name="crops")
42 | return self._max_pool(crops)
43 |
44 |
45 | class RoiPoolingCropAndResize(tf.keras.Model):
46 | def __init__(self, pool_size, max_pooling_flag=True):
47 | super().__init__()
48 | self._pool_size = pool_size
49 | self._max_pooling_flag = max_pooling_flag
50 | self._concat_layer = layers.Concatenate(axis=0)
51 | self._max_pool = layers.MaxPooling2D(padding='same')
52 |
53 | def call(self, inputs, training=None, mask=None):
54 | """
55 | Takes the backbone feature map and the rpn proposals (the output of RegionProposal),
56 | and returns the roi pooling result: a fixed-size feature map cropped for every rpn proposal.
57 | :param inputs:
58 | :param training:
59 | :param mask:
60 | :return:
61 | """
62 | # [1, height, width, channels] [num_rois, 4]
63 | shared_layers, rois, extractor_stride = inputs
64 | rois = rois / extractor_stride
65 |
66 | batch_ids = tf.zeros([tf.shape(rois)[0]], dtype=tf.int32)
67 | h, w = shared_layers.get_shape().as_list()[1:3]
68 | roi_channels = tf.split(rois, 4, axis=1)
69 | bboxes = tf.concat([
70 | roi_channels[1] / tf.to_float(h - 1),
71 | roi_channels[0] / tf.to_float(w - 1),
72 | roi_channels[3] / tf.to_float(h - 1),
73 | roi_channels[2] / tf.to_float(w - 1),
74 | ], axis=1)
75 | if self._max_pooling_flag:
76 | pre_pool_size = self._pool_size * 2
77 |
78 | # major bug fix: shared_layers must still take part in back-propagation; only bboxes are excluded
79 | crops = tf.image.crop_and_resize(shared_layers,
80 | tf.stop_gradient(bboxes),
81 | box_ind=tf.to_int32(batch_ids),
82 | crop_size=[pre_pool_size, pre_pool_size],
83 | name="crops")
84 | return self._max_pool(crops)
85 | else:
86 | return tf.image.crop_and_resize(shared_layers,
87 | tf.stop_gradient(bboxes),
88 | box_ind=tf.to_int32(batch_ids),
89 | crop_size=[self._pool_size, self._pool_size],
90 | name="crops")
91 |
92 |
93 | def crop_and_resize(image, boxes, box_ind, crop_size, pad_border=True):
94 | assert isinstance(crop_size, int), crop_size
95 | boxes = tf.stop_gradient(boxes)
96 |
97 | # TF's crop_and_resize produces zeros on border
98 | if pad_border:
99 | # this can be quite slow
100 | image = tf.pad(image, [[0, 0], [1, 1], [1, 1], [0, 0]], mode='SYMMETRIC')
101 | boxes = boxes + 1
102 |
103 | def transform_fpcoor_for_tf(boxes, image_shape, crop_shape):
104 | """
105 | The way tf.image.crop_and_resize works (with normalized box):
106 | Initial point (the value of output[0]): x0_box * (W_img - 1)
107 | Spacing: w_box * (W_img - 1) / (W_crop - 1)
108 | Use the above grid to bilinear sample.
109 | However, what we want is (with fpcoor box):
110 | Spacing: w_box / W_crop
111 | Initial point: x0_box + spacing/2 - 0.5
112 | (-0.5 because bilinear sample (in my definition) assumes floating point coordinate
113 | (0.0, 0.0) is the same as pixel value (0, 0))
114 | This function transforms fpcoor boxes into a format usable by tf.image.crop_and_resize
115 | Returns:
116 | y1x1y2x2
117 | """
118 | x0, y0, x1, y1 = tf.split(boxes, 4, axis=1)
119 |
120 | spacing_w = (x1 - x0) / tf.cast(crop_shape[1], tf.float32)
121 | spacing_h = (y1 - y0) / tf.cast(crop_shape[0], tf.float32)
122 |
123 | imshape = [tf.cast(image_shape[0] - 1, tf.float32), tf.cast(image_shape[1] - 1, tf.float32)]
124 | nx0 = (x0 + spacing_w / 2 - 0.5) / imshape[1]
125 | ny0 = (y0 + spacing_h / 2 - 0.5) / imshape[0]
126 |
127 | nw = spacing_w * tf.cast(crop_shape[1] - 1, tf.float32) / imshape[1]
128 | nh = spacing_h * tf.cast(crop_shape[0] - 1, tf.float32) / imshape[0]
129 |
130 | return tf.concat([ny0, nx0, ny0 + nh, nx0 + nw], axis=1)
131 |
132 | image_shape = tf.shape(image)[1:3]
133 | boxes = transform_fpcoor_for_tf(boxes, image_shape, [crop_size, crop_size])
134 | ret = tf.image.crop_and_resize(
135 | image, boxes, tf.cast(box_ind, tf.int32),
136 | crop_size=[crop_size, crop_size])
137 | return ret
138 |
139 |
140 | def roi_align(featuremap, boxes, resolution):
141 | """
142 | Args:
143 | featuremap: 1xHxWxC
144 | boxes: [0, 1]
145 | resolution: output spatial resolution
146 | Returns:
147 | NxCx res x res
148 | """
149 | # sample 4 locations per roi bin
150 | ret = crop_and_resize(
151 | featuremap, boxes,
152 | tf.zeros([tf.shape(boxes)[0]], dtype=tf.int32),
153 | resolution * 2)
154 | ret = tf.nn.avg_pool(ret, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
155 | return ret
156 |
157 |
158 | class RoiPoolingRoiAlign(tf.keras.Model):
159 | def __init__(self, pool_size):
160 | super().__init__()
161 | self._pool_size = pool_size
162 | self._concat_layer = layers.Concatenate(axis=0)
163 |
164 | def call(self, inputs, training=None, mask=None):
165 | """
166 | Takes the backbone feature map and the rpn proposals (the output of RegionProposal),
167 | and returns the roi pooling result: a fixed-size feature map cropped for every rpn proposal.
168 | :param inputs:
169 | :param training:
170 | :param mask:
171 | :return:
172 | """
173 | # [1, height, width, channels] [num_rois, 4]
174 | shared_layers, rois, extractor_stride = inputs
175 | rois = rois / extractor_stride
176 | net = roi_align(shared_layers, tf.stop_gradient(rois), self._pool_size)
177 | return net
178 |
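A quick numeric check of the transform_fpcoor_for_tf math above (pure arithmetic): take an image of width 11 (so W_img - 1 = 10), a crop of width 2, and an fpcoor box spanning x in [2, 6]:

# desired grid:     spacing = (6 - 2) / 2 = 2.0, start = 2 + 2.0 / 2 - 0.5 = 2.5
#                   -> samples at x = 2.5 and 4.5
# transformed box:  nx0 = (2 + 2.0 / 2 - 0.5) / 10 = 0.25, nw = 2.0 * (2 - 1) / 10 = 0.2
# crop_and_resize:  start = 0.25 * 10 = 2.5, spacing = 0.2 * 10 / (2 - 1) = 2.0
#                   -> the same grid, as intended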
-------------------------------------------------------------------------------- /object_detection/protos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irvingzhang0512/tf_eager_object_detection/e977217cb9d5fc2975292d55b1e5cae484f9e0dd/object_detection/protos/__init__.py -------------------------------------------------------------------------------- /object_detection/protos/string_int_label_map.proto: --------------------------------------------------------------------------------
1 | syntax = "proto2";
2 |
3 | package object_detection.protos;
4 |
5 | message StringIntLabelMapItem {
6 | // String name. The most common practice is to set this to a MID or synsets
7 | // id.
8 | optional string name = 1;
9 |
10 | // Integer id that maps to the string name above. Label ids should start from
11 | // 1.
12 | optional int32 id = 2;
13 |
14 | // Human readable string label.
15 | optional string display_name = 3;
16 | };
17 |
18 | message StringIntLabelMap {
19 | repeated StringIntLabelMapItem item = 1;
20 | }; -------------------------------------------------------------------------------- /object_detection/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irvingzhang0512/tf_eager_object_detection/e977217cb9d5fc2975292d55b1e5cae484f9e0dd/object_detection/utils/__init__.py -------------------------------------------------------------------------------- /object_detection/utils/anchor_generator.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from six.moves import range
4 |
5 | __all__ = ['generate_anchor_base', 'generate_by_anchor_base_np', 'generate_by_anchor_base_tf', 'make_anchors']
6 |
7 | """
8 | References several implementations, including:
9 | The numpy version mainly follows:
10 | https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py
11 |
12 | The TF version mainly follows (really the numpy code above, rewritten in TF):
13 | https://github.com/Viredery/tf-eager-fasterrcnn/blob/master/detection/core/anchor/anchor_generator.py
14 |
15 |
16 | Usage:
17 | either use `generate_anchor_base` together with `generate_by_anchor_base_np`/`generate_by_anchor_base_tf`,
18 | or use `make_anchors` directly.
19 | For a concrete example, see the usage sketch after this file.
20 | """
21 |
22 |
23 | def generate_by_anchor_base_np(anchor_base, feat_stride, height, width):
24 | # Enumerate all shifted anchors:
25 | #
26 | # add A anchors (1, A, 4) to
27 | # cell K shifts (K, 1, 4) to get
28 | # shift anchors (K, A, 4)
29 | # reshape to (K*A, 4) shifted anchors
30 | # return (K*A, 4)
31 |
32 | import numpy as xp
33 | shift_y = xp.arange(0, height, feat_stride)
34 | shift_x = xp.arange(0, width, feat_stride)
35 | shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
36 | shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
37 | shift_y.ravel(), shift_x.ravel()), axis=1)
38 |
39 | A = anchor_base.shape[0]
40 | K = shift.shape[0]
41 | anchor = anchor_base.reshape((1, A, 4)) + shift.reshape((1, K, 4)).transpose((1, 0, 2))
42 | anchor = anchor.reshape((K * A, 4)).astype(np.float32)
43 | return anchor
44 |
45 |
46 | def generate_by_anchor_base_tf(anchor_base, feat_stride, height, width):
47 | shift_x = tf.range(width) * feat_stride  # width
48 | shift_y = tf.range(height) * feat_stride  # height
49 | shift_x, shift_y = tf.meshgrid(shift_x, shift_y)
50 | sx = tf.reshape(shift_x, shape=(-1,))
51 | sy = tf.reshape(shift_y, shape=(-1,))
52 | shifts = tf.transpose(tf.stack([sx, sy, sx, sy]))
53 |
54 | K = tf.multiply(width, height)
55 | A = anchor_base.shape[0]
56 | shifts = tf.transpose(tf.reshape(shifts, shape=[1, K, 4]), perm=(1, 0, 2))
57 | anchor_constant = tf.to_float(tf.reshape(anchor_base, (1, A, 4)))
58 | anchors_tf = tf.reshape(tf.add(anchor_constant, tf.to_float(shifts)), shape=(-1, 4))
59 |
60 | return tf.cast(anchors_tf, dtype=tf.float32)
61 |
62 |
63 | def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
64 | scales=2 ** np.arange(3, 6)):
65 | """
66 | There are two styles of anchor-base generation; this appears to be the one used in the original paper.
67 | The anchor base fixes the final anchors' widths and heights; the generate_by_anchor_base functions then place the anchor centers.
68 | All three inputs affect the final widths/heights:
69 | ratios fixes the aspect ratios;
70 | base_size and scales jointly set the absolute size, i.e. base_size * scales gives the final anchor sizes.
71 | Generate anchor (reference) windows by enumerating aspect ratios X
72 | scales wrt a reference (0, 0, 15, 15) window.
73 | """ 74 | 75 | ratios = np.array(ratios) 76 | scales = np.array(scales) 77 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 78 | ratio_anchors = _ratio_enum(base_anchor, ratios) 79 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 80 | for i in range(ratio_anchors.shape[0])]) 81 | return anchors 82 | 83 | 84 | def _whctrs(anchor): 85 | """ 86 | Return width, height, x center, and y center for an anchor (window). 87 | """ 88 | 89 | w = anchor[2] - anchor[0] + 1 90 | h = anchor[3] - anchor[1] + 1 91 | x_ctr = anchor[0] + 0.5 * (w - 1) 92 | y_ctr = anchor[1] + 0.5 * (h - 1) 93 | return w, h, x_ctr, y_ctr 94 | 95 | 96 | def _mkanchors(ws, hs, x_ctr, y_ctr): 97 | """ 98 | Given a vector of widths (ws) and heights (hs) around a center 99 | (x_ctr, y_ctr), output a set of anchors (windows). 100 | """ 101 | 102 | ws = ws[:, np.newaxis] 103 | hs = hs[:, np.newaxis] 104 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 105 | y_ctr - 0.5 * (hs - 1), 106 | x_ctr + 0.5 * (ws - 1), 107 | y_ctr + 0.5 * (hs - 1))) 108 | return anchors 109 | 110 | 111 | def _ratio_enum(anchor, ratios): 112 | """ 113 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 114 | """ 115 | 116 | w, h, x_ctr, y_ctr = _whctrs(anchor) 117 | size = w * h 118 | size_ratios = size / ratios 119 | ws = np.round(np.sqrt(size_ratios)) 120 | hs = np.round(ws * ratios) 121 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 122 | return anchors 123 | 124 | 125 | def _scale_enum(anchor, scales): 126 | """ 127 | Enumerate a set of anchors for each scale wrt an anchor. 128 | """ 129 | 130 | w, h, x_ctr, y_ctr = _whctrs(anchor) 131 | ws = w * scales 132 | hs = h * scales 133 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 134 | return anchors 135 | 136 | 137 | def make_anchors(base_anchor_size, anchor_scales, anchor_ratios, 138 | featuremap_height, featuremap_width, 139 | stride, name='make_anchors'): 140 | with tf.variable_scope(name): 141 | base_anchor = tf.constant([0, 0, base_anchor_size, base_anchor_size], tf.float32) # [x_center, y_center, w, h] 142 | 143 | ws, hs = enum_ratios(enum_scales(base_anchor, anchor_scales), 144 | anchor_ratios) # per locations ws and hs 145 | 146 | x_centers = tf.range(featuremap_width, dtype=tf.float32) * stride 147 | y_centers = tf.range(featuremap_height, dtype=tf.float32) * stride 148 | 149 | x_centers, y_centers = tf.meshgrid(x_centers, y_centers) 150 | 151 | ws, x_centers = tf.meshgrid(ws, x_centers) 152 | hs, y_centers = tf.meshgrid(hs, y_centers) 153 | 154 | anchor_centers = tf.stack([x_centers, y_centers], 2) 155 | anchor_centers = tf.reshape(anchor_centers, [-1, 2]) 156 | 157 | box_sizes = tf.stack([ws, hs], axis=2) 158 | box_sizes = tf.reshape(box_sizes, [-1, 2]) 159 | # anchors = tf.concat([anchor_centers, box_sizes], axis=1) 160 | anchors = tf.concat([anchor_centers - 0.5 * box_sizes, 161 | anchor_centers + 0.5 * box_sizes], axis=1) 162 | return anchors 163 | 164 | 165 | def enum_scales(base_anchor, anchor_scales): 166 | anchor_scales = base_anchor * tf.constant(anchor_scales, dtype=tf.float32, shape=(len(anchor_scales), 1)) 167 | return anchor_scales 168 | 169 | 170 | def enum_ratios(anchors, anchor_ratios): 171 | ws = anchors[:, 2] # for base anchor: w == h 172 | hs = anchors[:, 3] 173 | sqrt_ratios = tf.sqrt(tf.constant(anchor_ratios)) 174 | 175 | ws = tf.reshape(ws / sqrt_ratios[:, tf.newaxis], [-1, 1]) 176 | hs = tf.reshape(hs * sqrt_ratios[:, tf.newaxis], [-1, 1]) 177 | 178 | return hs, ws 179 | 
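As the header comment says, the anchor-base helpers are meant to be used together; a minimal usage sketch (feature-map sizes are illustrative, for a 600x800 input with stride 16):

import numpy as np
from object_detection.utils.anchor_generator import generate_anchor_base, generate_by_anchor_base_np

anchor_base = generate_anchor_base(base_size=16, ratios=[0.5, 1, 2], scales=2 ** np.arange(3, 6))
# anchor_base: [9, 4], anchor shapes around the (0, 0, 15, 15) reference window
anchors = generate_by_anchor_base_np(anchor_base, feat_stride=16, height=600, width=800)
# anchors: [9 * 38 * 50, 4] = [17100, 4], one set of 9 anchors per feature-map cell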
-------------------------------------------------------------------------------- /object_detection/utils/bbox_np.py: --------------------------------------------------------------------------------
1 | # copy from https://github.com/tensorpack/tensorpack/blob/master/examples/FasterRCNN/utils/np_box_ops.py
2 | import numpy as np
3 |
4 |
5 | __all__ = ['pairwise_iou', 'ioa', 'bboxes_clip_filter', 'bboxes_range_filter']
6 |
7 |
8 | def area(boxes):
9 | """Computes area of boxes.
10 | Args:
11 | boxes: Numpy array with shape [N, 4] holding N boxes
12 | Returns:
13 | a numpy array with shape [N] representing box areas
14 | """
15 | return (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
16 |
17 |
18 | def intersection(boxes1, boxes2):
19 | """Compute pairwise intersection areas between boxes.
20 | Args:
21 | boxes1: a numpy array with shape [N, 4] holding N boxes
22 | boxes2: a numpy array with shape [M, 4] holding M boxes
23 | Returns:
24 | a numpy array with shape [N, M] representing pairwise intersection areas
25 | """
26 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
27 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
28 |
29 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
30 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
31 | intersect_heights = np.maximum(
32 | np.zeros(all_pairs_max_ymin.shape, dtype='f4'),
33 | all_pairs_min_ymax - all_pairs_max_ymin + 1)
34 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
35 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
36 | intersect_widths = np.maximum(
37 | np.zeros(all_pairs_max_xmin.shape, dtype='f4'),
38 | all_pairs_min_xmax - all_pairs_max_xmin + 1)
39 | return intersect_heights * intersect_widths
40 |
41 |
42 | def pairwise_iou(boxes1, boxes2):
43 | """Computes pairwise intersection-over-union between box collections.
44 | Args:
45 | boxes1: a numpy array with shape [N, 4] holding N boxes.
46 | boxes2: a numpy array with shape [M, 4] holding M boxes.
47 | Returns:
48 | a numpy array with shape [N, M] representing pairwise iou scores. (A worked example follows this file.)
49 | """
50 | intersect = intersection(boxes1, boxes2)
51 | area1 = area(boxes1)
52 | area2 = area(boxes2)
53 | union = np.expand_dims(area1, axis=1) + np.expand_dims(
54 | area2, axis=0) - intersect
55 | return intersect / union
56 |
57 |
58 | def ioa(boxes1, boxes2):
59 | """Computes pairwise intersection-over-area between box collections.
60 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as
61 | their intersection area over box2's area. Note that ioa is not symmetric,
62 | that is, IOA(box1, box2) != IOA(box2, box1).
63 | Args:
64 | boxes1: a numpy array with shape [N, 4] holding N boxes.
65 | boxes2: a numpy array with shape [M, 4] holding M boxes.
66 | Returns:
67 | a numpy array with shape [N, M] representing pairwise ioa scores.
68 | """
69 | intersect = intersection(boxes1, boxes2)
70 | inv_areas = np.expand_dims(1.0 / area(boxes2), axis=0)
71 | return intersect * inv_areas
72 |
73 |
74 | def bboxes_clip_filter(rpn_proposals, min_value, max_height, max_width, min_edge=None):
75 | """
76 | numpy implementation
77 | Clip proposals to the image bounds and filter them by minimum edge length.
78 | :param rpn_proposals: bboxes, ordered (ymin, xmin, ymax, xmax)
79 | :param min_value: lower bound for all coordinates
80 | :param max_height: image height
81 | :param max_width: image width
82 | :param min_edge: minimum bbox edge length; None disables the filter
83 | :return:
84 | """
85 | rpn_proposals[rpn_proposals < min_value] = min_value
86 | rpn_proposals[:, ::2][rpn_proposals[:, ::2] > max_height - 1.0] = max_height - 1.0
87 | rpn_proposals[:, 1::2][rpn_proposals[:, 1::2] > max_width - 1.0] = max_width - 1.0
88 |
89 | if min_edge is None:
90 | return rpn_proposals, np.arange(len(rpn_proposals))
91 |
92 | new_rpn_proposals = []
93 | rpn_proposals_idx = []
94 | for idx, (ymin, xmin, ymax, xmax) in enumerate(rpn_proposals):
95 | if (ymax - ymin + 1.0) >= min_edge and (xmax - xmin + 1.0) >= min_edge:
96 | new_rpn_proposals.append([ymin, xmin, ymax, xmax])
97 | rpn_proposals_idx.append(idx)
98 | return np.array(new_rpn_proposals), np.array(rpn_proposals_idx)
99 |
100 |
101 | def bboxes_range_filter(anchors, max_height, max_width):
102 | """
103 | Filter anchors: keep only those fully inside the image.
104 | :param anchors:
105 | :param max_height: image height
106 | :param max_width: image width
107 | :return:
108 | """
109 | index_inside = np.where(
110 | (anchors[:, 0] >= 0) &
111 | (anchors[:, 1] >= 0) &
112 | (anchors[:, 2] <= max_height - 1) &
113 | (anchors[:, 3] <= max_width - 1)
114 | )[0]
115 | return index_inside
116 |
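A worked example of the +1 (inclusive pixel coordinate) convention used throughout this file:

import numpy as np
from object_detection.utils.bbox_np import pairwise_iou

boxes1 = np.array([[0., 0., 9., 9.]])  # area (9 - 0 + 1) * (9 - 0 + 1) = 100
boxes2 = np.array([[0., 0., 4., 9.]])  # area 5 * 10 = 50
print(pairwise_iou(boxes1, boxes2))    # intersection 50, union 100 + 50 - 50 -> [[0.5]]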
-------------------------------------------------------------------------------- /object_detection/utils/bbox_tf.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | __all__ = ['pairwise_iou', 'bboxes_clip_filter', 'bboxes_range_filter']
5 |
6 |
7 | def area(boxes):
8 | """
9 | Args:
10 | boxes: nx4 floatbox
11 | Returns:
12 | n
13 | """
14 | x_min, y_min, x_max, y_max = tf.split(boxes, 4, axis=1)
15 | return tf.squeeze((y_max - y_min + 1.0) * (x_max - x_min + 1.0), [1])
16 |
17 |
18 | def pairwise_intersection(boxlist1, boxlist2):
19 | """Compute pairwise intersection areas between boxes.
20 | Args:
21 | boxlist1: Nx4 floatbox
22 | boxlist2: Mx4
23 | Returns:
24 | a tensor with shape [N, M] representing pairwise intersections
25 | """
26 | x_min1, y_min1, x_max1, y_max1 = tf.split(boxlist1, 4, axis=1)
27 | x_min2, y_min2, x_max2, y_max2 = tf.split(boxlist2, 4, axis=1)
28 | all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2))
29 | all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2))
30 | intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin + 1.0)
31 | all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2))
32 | all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2))
33 | intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin + 1.0)
34 | return intersect_heights * intersect_widths
35 |
36 |
37 | def pairwise_iou(boxlist1, boxlist2):
38 | """Computes pairwise intersection-over-union between box collections.
39 | copy from https://github.com/tensorpack/tensorpack/blob/master/examples/FasterRCNN/utils/box_ops.py
40 | Args:
41 | boxlist1: Nx4 floatbox
42 | boxlist2: Mx4
43 | Returns:
44 | a tensor with shape [N, M] representing pairwise iou scores.
45 | """
46 | boxlist1 = tf.to_float(boxlist1)
47 | boxlist2 = tf.to_float(boxlist2)
48 |
49 | intersections = pairwise_intersection(boxlist1, boxlist2)
50 | areas1 = area(boxlist1)
51 | areas2 = area(boxlist2)
52 | unions = (
53 | tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
54 | return tf.where(
55 | tf.equal(intersections, 0.0),
56 | tf.zeros_like(intersections), tf.truediv(intersections, unions))
57 |
58 |
59 | def bboxes_clip_filter(rpn_proposals, min_value, max_height, max_width, min_edge=None):
60 | """
61 | TensorFlow implementation
62 | Clip proposals to the image bounds and filter them by minimum edge length.
63 | :param rpn_proposals: bboxes, ordered (xmin, ymin, xmax, ymax)
64 | :param min_value: lower bound for all coordinates
65 | :param max_height: image height
66 | :param max_width: image width
67 | :param min_edge: minimum bbox edge length; None disables the filter
68 | :return:
69 | """
70 | channels = tf.split(rpn_proposals, 4, axis=1)
71 | channels[0] = tf.maximum(tf.minimum(channels[0], max_width - 1), min_value)
72 | channels[1] = tf.maximum(tf.minimum(channels[1], max_height - 1), min_value)
73 | channels[2] = tf.maximum(tf.minimum(channels[2], max_width - 1), min_value)
74 | channels[3] = tf.maximum(tf.minimum(channels[3], max_height - 1), min_value)
75 | rpn_proposals = tf.concat(channels, axis=1)
76 |
77 | if min_edge is None:
78 | return rpn_proposals, tf.range(rpn_proposals.shape[0])
79 |
80 | min_edge = tf.to_float(min_edge)
81 | y_len = tf.to_float(channels[2] - channels[0] + 1.0)
82 | x_len = tf.to_float(channels[3] - channels[1] + 1.0)
83 | rpn_proposals_idx = tf.where(tf.logical_and(x_len >= min_edge, y_len >= min_edge))[:, 0]
84 | return tf.gather(rpn_proposals, rpn_proposals_idx), rpn_proposals_idx
85 |
86 |
87 | def bboxes_range_filter(anchors, max_height, max_width):
88 | """
89 | Filter anchors: keep only those fully inside the image.
90 | :param anchors:
91 | :param max_height: image height
92 | :param max_width: image width
93 | :return:
94 | """
95 | index_inside = tf.where(
96 | tf.logical_and(
97 | tf.logical_and((anchors[:, 0] >= 0), (anchors[:, 1] >= 0)),
98 | tf.logical_and((anchors[:, 2] <= max_width - 1), (anchors[:, 3] <= max_height - 1)),
99 | )
100 | )[:, 0]
101 | return index_inside
102 | -------------------------------------------------------------------------------- /object_detection/utils/bbox_transform.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def encode_bbox_with_mean_and_std(src_bbox, dst_bbox, target_means, target_stds):
5 | target_means = tf.constant(target_means, dtype=tf.float32)
6 | target_stds = tf.constant(target_stds, dtype=tf.float32)
7 |
8 | box = tf.cast(src_bbox, tf.float32)
9 | gt_box = tf.cast(dst_bbox, tf.float32)
10 |
11 | width = box[..., 2] - box[..., 0] + 1.0
12 | height = box[..., 3] - box[..., 1] + 1.0
13 | center_x = box[..., 0] + 0.5 * width
14 | center_y = box[..., 1] + 0.5 * height
15 |
16 | gt_width = gt_box[..., 2] - gt_box[..., 0] + 1.0
17 | gt_height = gt_box[..., 3] - gt_box[..., 1] + 1.0
18 | gt_center_x = gt_box[..., 0] + 0.5 * gt_width
19 | gt_center_y = gt_box[..., 1] + 0.5 * gt_height
20 |
21 | dx = (gt_center_x - center_x) / width
22 | dy = (gt_center_y - center_y) / height
23 | dw = tf.log(gt_width / width)
24 | dh = tf.log(gt_height / height)
25 |
26 | delta = tf.stack([dx, dy, dw, dh], axis=-1)
27 | delta = (delta - target_means) / target_stds
28 |
29 | return delta
30 |
31 |
32 | def decode_bbox_with_mean_and_std(anchors, bboxes_txtytwth, target_means, target_stds):
33 | target_means = tf.constant(
34 | target_means, dtype=tf.float32)
35 | target_stds = tf.constant(
36 | target_stds, dtype=tf.float32)
37 | delta = bboxes_txtytwth * target_stds + target_means
38 |
39 | # TODO fix whether to use +1 in the following two lines (see the round-trip example after this file).
40 | width = anchors[:, 2] - anchors[:, 0] + 1
41 | height = anchors[:, 3] - anchors[:, 1] + 1
42 | center_x = anchors[:, 0] + 0.5 * width
43 | center_y = anchors[:, 1] + 0.5 * height
44 |
45 | center_x += delta[:, 0] * width
46 | center_y += delta[:, 1] * height
47 | width *= tf.exp(delta[:, 2])
48 | height *= tf.exp(delta[:, 3])
49 |
50 | x1 = center_x - 0.5 * width
51 | y1 = center_y - 0.5 * height
52 | x2 = x1 + width
53 | y2 = y1 + height
54 | result = tf.stack([x1, y1, x2, y2], axis=1)
55 | return result
56 |
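The two functions above are (almost) inverses; a quick eager-mode round trip makes the "+1" TODO in decode_bbox_with_mean_and_std concrete (standalone sketch, illustrative values):

import tensorflow as tf
from object_detection.utils.bbox_transform import encode_bbox_with_mean_and_std, decode_bbox_with_mean_and_std

tf.enable_eager_execution()

anchors = tf.constant([[10., 10., 50., 60.]])
gt = tf.constant([[12., 8., 48., 70.]])
means, stds = [0., 0., 0., 0.], [0.1, 0.1, 0.2, 0.2]
deltas = encode_bbox_with_mean_and_std(anchors, gt, means, stds)
recovered = decode_bbox_with_mean_and_std(anchors, deltas, means, stds)
# recovered == [[12., 8., 49., 71.]]: xmin/ymin match the ground truth exactly,
# while xmax/ymax come back larger by 1, because decode adds the full (+1) width
# to x1 -- exactly the inconsistency flagged by the TODO above.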
-------------------------------------------------------------------------------- /object_detection/utils/pytorch_to_tf.py: --------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 |
4 |
5 | def pytorch_to_tf_np(v):
6 | if v.ndim == 4:
7 | # OUT, IN, H, W --> H, W, IN, OUT
8 | return np.ascontiguousarray(v.transpose(2, 3, 1, 0))
9 | if v.ndim == 2:
10 | return np.ascontiguousarray(v.transpose())
11 | return v
12 |
13 |
14 | def convert_pth_to_dict(pth_dir, dict_path):
15 | import torch
16 | torch_file = torch.load(pth_dir)
17 |
18 | tf_dict = {}
19 | for key in torch_file['model'].keys():
20 | tf_dict[key] = pytorch_to_tf_np(torch_file['model'][key].cpu().numpy())
21 |
22 | with open(dict_path, 'wb') as f:
23 | pickle.dump(tf_dict, f)
24 | -------------------------------------------------------------------------------- /object_detection/utils/visual_utils.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import cv2
5 |
6 |
7 | def draw_bboxes_with_labels(image, bboxes, label_texts):
8 | """
9 | Draw bboxes and their labels on an ndarray or tf.Tensor image.
10 | :param image: an image, shape [height, width, channels]
11 | :param bboxes: bounding boxes, shape [bbox_number, 4], ordered ymin, xmin, ymax, xmax,
12 | as floats in [0, height/width]
13 | :param label_texts: the labels to display, shape (bbox_number, )
14 | :return: the image with bboxes drawn, an ndarray with the same shape as the input
15 | """
16 | if isinstance(image, tf.Tensor):
17 | image = image.numpy()
18 | if isinstance(bboxes, tf.Tensor):
19 | bboxes = bboxes.numpy()
20 | if isinstance(label_texts, tf.Tensor):
21 | label_texts = label_texts.numpy()
22 | idx = 0
23 | for bbox in bboxes:
24 | ymin, xmin, ymax, xmax = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
25 | cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
26 | if label_texts is not None:
27 | cv2.putText(img=image,
28 | text=str(label_texts[idx]),
29 | org=(xmin, ymin + 20),
30 | fontFace=cv2.FONT_HERSHEY_COMPLEX,
31 | fontScale=1e-3 * image.shape[0],
32 | color=(0, 0, 255),
33 | thickness=2,
34 | )
35 | idx += 1
36 | return image
37 |
38 |
39 | def show_one_image(preprocessed_image, bboxes, labels_text=None, preprocessing_type='caffe', caffe_pixel_means=None,
40 | figsize=(15, 10), enable_matplotlib=True):
41 | """
42 | Show one image with its bboxes.
43 | :param preprocessed_image: preprocessed image by `preprocessing_type`, if caffe then bgr, if tf then rgb
44 | :param bboxes:
45 | :param labels_text:
46 | :param preprocessing_type:
47 | :param caffe_pixel_means:
48 | :param figsize:
49 | :param enable_matplotlib:
50 | :return:
51 | """
52 | if isinstance(preprocessed_image, tf.Tensor):
53 | preprocessed_image = tf.squeeze(preprocessed_image, axis=0).numpy()
54 | if isinstance(bboxes, tf.Tensor):
55 | bboxes = bboxes.numpy()
56 | if isinstance(labels_text, tf.Tensor): 57
| labels_text = labels_text.numpy() 58 | if preprocessing_type == 'caffe': 59 | cur_means = caffe_pixel_means 60 | preprocessed_image[..., 0] += cur_means[0] 61 | preprocessed_image[..., 1] += cur_means[1] 62 | preprocessed_image[..., 2] += cur_means[2] 63 | preprocessed_image = preprocessed_image[..., ::-1] 64 | preprocessed_image = preprocessed_image.astype(np.uint8) 65 | elif preprocessing_type == 'tf': 66 | preprocessed_image = ((preprocessed_image + 1.0) / 2.0) * 255.0 67 | preprocessed_image = preprocessed_image.astype(np.uint8) 68 | elif preprocessing_type is None: 69 | pass 70 | else: 71 | raise ValueError('unknown preprocess_type {}'.format(preprocessing_type)) 72 | image_with_bboxes = draw_bboxes_with_labels(preprocessed_image, bboxes, labels_text) 73 | if enable_matplotlib: 74 | plt.figure(figsize=figsize) 75 | plt.imshow(image_with_bboxes) 76 | plt.show() 77 | 78 | return image_with_bboxes 79 | -------------------------------------------------------------------------------- /scripts/eval_coco.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import sys 5 | import argparse 6 | import json 7 | 8 | from object_detection.dataset.dataset_factory import dataset_factory 9 | from object_detection.model.model_factory import model_factory 10 | from object_detection.config.config_factory import config_factory 11 | from tensorflow.contrib.eager.python import saver as eager_saver 12 | from object_detection.utils.bbox_transform import decode_bbox_with_mean_and_std 13 | from object_detection.utils.bbox_tf import bboxes_clip_filter 14 | 15 | from pycocotools.coco import COCO 16 | from pycocotools.cocoeval import COCOeval 17 | 18 | 19 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152 20 | num_classes = 81 21 | 22 | coco_id_to_name_list = [ 23 | 'back_ground', 'person', 'bicycle', 'car', 'motorcycle', 24 | 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 25 | 'fire hydrant', 'stop sign', 'parking meter', 'bench', 26 | 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 27 | 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 28 | 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 29 | 'sports ball', 'kite', 'baseball bat', 'baseball glove', 30 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', 31 | 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 32 | 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 33 | 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 34 | 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 35 | 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 36 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 37 | 'book', 'clock', 'vase', 'scissors', 'teddy bear', 38 | 'hair drier', 'toothbrush'] 39 | 40 | coco_name_to_cat_id_dict = { 41 | 'back_ground': 0, 42 | 'person': 1, 'bicycle': 2, 'car': 3, 'motorcycle': 4, 43 | 'airplane': 5, 'bus': 6, 'train': 7, 'truck': 8, 'boat': 9, 44 | 'traffic light': 10, 'fire hydrant': 11, 'stop sign': 13, 45 | 'parking meter': 14, 'bench': 15, 'bird': 16, 'cat': 17, 46 | 'dog': 18, 'horse': 19, 'sheep': 20, 'cow': 21, 'elephant': 22, 47 | 'bear': 23, 'zebra': 24, 'giraffe': 25, 'backpack': 27, 48 | 'umbrella': 28, 'handbag': 31, 'tie': 32, 'suitcase': 33, 49 | 'frisbee': 34, 'skis': 35, 'snowboard': 36, 'sports ball': 37, 50 | 'kite': 38, 'baseball bat': 39, 'baseball glove': 40, 51 | 'skateboard': 41, 'surfboard': 42, 'tennis racket': 43, 52 | 'bottle': 44, 
'wine glass': 46, 'cup': 47, 'fork': 48,
53 | 'knife': 49, 'spoon': 50, 'bowl': 51, 'banana': 52, 'apple': 53,
54 | 'sandwich': 54, 'orange': 55, 'broccoli': 56, 'carrot': 57,
55 | 'hot dog': 58, 'pizza': 59, 'donut': 60, 'cake': 61,
56 | 'chair': 62, 'couch': 63, 'potted plant': 64, 'bed': 65,
57 | 'dining table': 67, 'toilet': 70, 'tv': 72, 'laptop': 73,
58 | 'mouse': 74, 'remote': 75, 'keyboard': 76, 'cell phone': 77,
59 | 'microwave': 78, 'oven': 79, 'toaster': 80, 'sink': 81,
60 | 'refrigerator': 82, 'book': 84, 'clock': 85, 'vase': 86,
61 | 'scissors': 87, 'teddy bear': 88, 'hair drier': 89,
62 | 'toothbrush': 90}
63 |
64 |
65 | def eval_by_cocotools(res_file_path, mode, root_path):
66 | coco_gt = COCO(os.path.join(root_path, 'annotations', 'instances_{}2017.json'.format(mode)))
67 | coco_dt = coco_gt.loadRes(res_file_path)
68 | coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
69 |
70 | coco_eval.params.imgIds = coco_dt.getImgIds()
71 | coco_eval.evaluate()
72 | coco_eval.accumulate()
73 | coco_eval.summarize()
74 |
75 |
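# The result file loaded by eval_by_cocotools above is a flat JSON list with one
# entry per detection; a synthetic example of the expected format (all ids and
# values made up for illustration, bbox is [x, y, width, height]):
#
#     [{"image_id": 42, "category_id": 18, "bbox": [258.2, 41.3, 91.1, 202.2], "score": 0.94},
#      {"image_id": 73, "category_id": 1, "bbox": [61.0, 22.8, 44.3, 109.6], "score": 0.87}]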
76 | def eval_coco(model,
77 | result_file_path,
78 | dataset_mode,
79 | dataset_year,
80 | image_format,
81 | preprocessing_type,
82 | root_path,
83 | config,
84 | min_size=10,
85 | ):
86 | """
87 | Overall flow of COCO eval:
88 | 1. Build a tf.data.Dataset that yields preprocessed_image, raw_image_height, raw_image_width and image_id for the COCO images to evaluate.
89 | 2. Run the trained model on preprocessed_image to get image_id, bboxes, classes and scores per image.
90 | 2.1. Store the predictions as a list whose elements are dicts holding image_id, category_id, bbox and score (see the synthetic example above).
91 | 2.2. image_id and category_id are int32, bbox is a float32 array of length 4, score is float32.
92 | 2.3. For details see the official example:
93 | https://github.com/cocodataset/cocoapi/blob/master/results/instances_val2014_fakebbox100_results.json
94 | 3. Evaluate with the COCOeval tools.
95 | 3.1. For details see the official example:
96 | https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
97 | 3.2. Roughly: build a pycocotools.coco.COCO object, load the result list via COCO.loadRes, then compute the metrics with COCOeval.
98 | :param model: pre-trained model
99 | :param result_file_path: path to save result json file
100 | :param dataset_mode: train or val
101 | :param dataset_year:
102 | :param image_format:
103 | :param preprocessing_type:
104 | :param root_path: COCO root directory (the one containing the annotations folder)
105 | :param config:
106 | :param min_size:
107 | :return:
108 | """
109 | dataset_configs = {'root_dir': root_path,
110 | 'mode': dataset_mode, 'year': dataset_year,
111 | 'min_size': config['image_min_size'], 'max_size': config['image_max_size'],
112 | 'preprocessing_type': preprocessing_type,
113 | 'caffe_pixel_means': config['bgr_pixel_means']}
114 | dataset = dataset_factory('coco', mode=dataset_mode, **dataset_configs)
115 |
116 | res_list = []
117 | for img, img_scale, raw_h, raw_w, img_id in dataset:
118 | # final_bboxes, final_labels, final_scores = model(img, False)
119 | # final_bboxes = final_bboxes / tf.to_float(img_scale)
120 |
121 | scores, roi_txtytwth, rois = model.im_detect(img, img_scale)
122 | roi_txtytwth = tf.reshape(roi_txtytwth, [-1, num_classes, 4])
123 |
124 | res_score = []
125 | res_bbox = []
126 | res_category = []
127 | for j in range(1, num_classes):
128 | inds = tf.where(scores[:, j] > config['prediction_score_threshold'])[:, 0]
129 | cls_scores = tf.gather(scores[:, j], inds)
130 | cls_boxes = decode_bbox_with_mean_and_std(tf.gather(rois, inds),
131 | tf.gather(roi_txtytwth[:, j, :], inds),
132 | target_means=config['roi_proposal_means'],
133 | target_stds=config['roi_proposal_stds'])
134 |
135 | cls_boxes, inds = bboxes_clip_filter(cls_boxes, 0, raw_h, raw_w, min_size)
136 | cls_scores = tf.gather(cls_scores, inds)
137 | keep = tf.image.non_max_suppression(cls_boxes, cls_scores, config['max_objects_per_class_per_image'],
138 | iou_threshold=config['prediction_nms_iou_threshold'])
139 | if tf.size(keep).numpy() == 0:
140 | continue
141 |
142 | res_score.append(tf.gather(cls_scores, keep))
143 | res_bbox.append(tf.gather(cls_boxes, keep))
144 | res_category.append(tf.ones_like(keep, dtype=tf.int32) * j)
145 |
146 | scores_after_nms = tf.concat(res_score, axis=0)
147 | bboxes_after_nms = tf.concat(res_bbox, axis=0)
148 | category_after_nms = tf.concat(res_category, axis=0)
149 |
150 | final_scores, final_idx = tf.nn.top_k(scores_after_nms, k=tf.minimum(config['max_objects_per_image'],
151 | tf.size(scores_after_nms)),
152 | sorted=False)
153 | final_bboxes = tf.gather(bboxes_after_nms, final_idx).numpy()
154 | final_labels = tf.gather(category_after_nms, final_idx).numpy()
155 | final_scores = final_scores.numpy()
156 |
157 | for cur_bbox, cur_label, cur_score in zip(final_bboxes, final_labels, final_scores):
158 | res_list.append({
159 | 'image_id': int(img_id),
160 | 'category_id': int(coco_name_to_cat_id_dict[coco_id_to_name_list[cur_label]]),
161 | 'bbox': [float(cur_bbox[0]), float(cur_bbox[1]),
162 | float(cur_bbox[2] - cur_bbox[0] + 1), float(cur_bbox[3] - cur_bbox[1] + 1)],
163 | 'score': float(cur_score)
164 | })
165 |
166 | with open(result_file_path, 'w') as f:
167 | json.dump(res_list, f)
168 | eval_by_cocotools(result_file_path, dataset_mode, root_path)
169 |
170 |
171 | def _load_from_ckpt_file(model, ckpt_file_path):
172 | saver = eager_saver.Saver(model.variables)
173 | for var in model.variables:
174 | tf.logging.info('restore var {}'.format(var.name))
175 | if tf.train.latest_checkpoint(ckpt_file_path) is not None:
176 | saver.restore(tf.train.latest_checkpoint(ckpt_file_path))
177 | else:
178 | raise ValueError('unknown ckpt file {}'.format(ckpt_file_path))
179 |
180 |
181 | def parse_args():
182 | parser = argparse.ArgumentParser(description='Evaluate a Fast R-CNN model')
183 | parser.add_argument('ckpt_file_path', type=str, help='target ckpt file path', )
184 | parser.add_argument('--year', type=str, default='2017', help='one of [2014, 2017]', )
185 |
186 | parser.add_argument('--gpu_id', type=str, default='0')
187 |
188 | parser.add_argument('--dataset_mode', type=str, default='val', help='one of [train or val]')
189 |
190 | parser.add_argument('--model_type', type=str, default='faster_rcnn', help='one of [faster_rcnn, fpn]')
191 | parser.add_argument('--backbone', type=str, default='vgg16', help='one of [vgg16, resnet50, resnet101, resnet152]')
192 |
193 | parser.add_argument('--use_fpn_tensorflow_model', default=False, type=bool,
194 | help='load fpn tensorflow model, only support resnet50 backbone')
195 |
196 | parser.add_argument('--root_path', help='path to COCO root directory',
197 | default='/ssd/zhangyiyang/COCO2017', type=str)
198 | parser.add_argument('--result_file_dir', help='path to save detection result json file',
199 | default='/ssd/zhangyiyang/results/', type=str)
200 | parser.add_argument('--logs_name', default=None, type=str)
201 |
202 | if len(sys.argv) == 1:
203 | parser.print_help()
204 | sys.exit(1)
205 |
206 | args = parser.parse_args()
207 | return args
208 |
209 |
210 | def main(args):
211 | # settings required for eager mode
212 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
213 | config = tf.ConfigProto(allow_soft_placement=True)
214 | config.gpu_options.allow_growth = True
215 | tf.enable_eager_execution(config=config)
216 | tf.logging.set_verbosity(tf.logging.INFO)
217 |
218 | # build the model and initialize its variables
219 | model_config = config_factory('coco', args.model_type)
220 | cur_model = model_factory(args.model_type, args.backbone, model_config)
221 | preprocessing_type = 'caffe'
222 | cur_model(tf.to_float(np.random.rand(1, 800, 600, 3)), False)
223 |
224 | # result file path
225 | # {result_file_dir}/{model_type}/{backbone}/{logs_name}/coco_res.json
226 | logs_name = args.logs_name if args.logs_name is not None else 'default'
227 | final_result_file_dir = os.path.join(args.result_file_dir, args.model_type, args.backbone, logs_name)
228 | if not os.path.exists(final_result_file_dir):
229 | os.makedirs(final_result_file_dir)
230 | final_result_file_path = os.path.join(final_result_file_dir, 'coco_res.json')
231 |
232 | # load the pre-trained model
233 | image_format = 'bgr'
234 | if args.use_fpn_tensorflow_model:
235 | image_format = 'rgb'
236 | cur_model.load_fpn_tensorflow_weights(args.ckpt_file_path)
237 | else:
238 | _load_from_ckpt_file(cur_model, args.ckpt_file_path)
239 |
240 | # write the predictions to file and evaluate them
241 | eval_coco(cur_model,
242 | result_file_path=final_result_file_path,
243 | dataset_mode=args.dataset_mode,
244 | dataset_year=args.year,
245 | image_format=image_format,
246 | preprocessing_type=preprocessing_type,
247 | root_path=os.path.join(args.root_path),
248 | config=model_config,)
249 |
250 |
251 | if __name__ == '__main__':
252 | main(parse_args())
253 |
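Typical invocation of this script (paths are placeholders):

python scripts/eval_coco.py /path/to/logs_dir/ckpt \
    --dataset_mode val --model_type faster_rcnn --backbone vgg16 \
    --root_path /path/to/COCO2017 --result_file_dir /path/to/results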
-------------------------------------------------------------------------------- /scripts/eval_pascal.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import os
4 | import sys
5 | import argparse
6 | from object_detection.model.model_factory import model_factory
7 | from object_detection.config.config_factory import config_factory
8 | from object_detection.evaluation.pascal_eval_files_utils import get_prediction_files
9 | from object_detection.evaluation.detectron_pascal_evaluation_utils import voc_eval
10 | from tensorflow.contrib.eager.python import saver as eager_saver
11 |
12 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
13 | num_classes = 21
14 | class_list = ('__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair',
15 | 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
16 | 'tvmonitor')
17 |
18 |
19 | def eval_from_scratch(model,
20 | dataset_type,
21 | dataset_mode,
22 | image_format,
23 | preprocessing_type,
24 | root_path,
25 | result_file_format,
26 | cache_dir,
27 | use_07_metric,
28 | config,
29 | ):
30 | """
31 |
32 | :param model: model with weights already loaded
33 | :param dataset_type: whether the raw training data was produced with cv2 or tf
34 | :param dataset_mode: train, val, test or trainval
35 | :param image_format:
36 | :param preprocessing_type:
37 | :param root_path: VOC directory, down to a specific year (e.g. .../VOCdevkit/VOC2007)
38 | :param result_file_format: detection results are written to files at result_file_format.format(class_name)
39 | :param cache_dir: ground-truth info is pickled during evaluation to cache_dir + 'test_annots.pkl'
40 | :param use_07_metric:
41 | :param config:
42 | :return:
43 | """
44 |
45 | # write the detection result files locally
46 | get_prediction_files(model,
47 | dataset_type=dataset_type,
48 | image_format=image_format,
49 | preprocessing_type=preprocessing_type,
50 | caffe_pixel_means=config['bgr_pixel_means'],
51 | min_edge=config['image_min_size'],
52 | max_edge=config['image_max_size'],
53 | data_root_path=root_path,
54 | mode=dataset_mode,
55 | result_file_format=result_file_format,
56 | score_threshold=config['prediction_score_threshold'],
57 | iou_threshold=config['prediction_nms_iou_threshold'],
58 | max_objects_per_class=config['max_objects_per_class_per_image'],
59 | max_objects_per_image=config['max_objects_per_image'],
60 | target_means=config['roi_proposal_means'],
61 | target_stds=config['roi_proposal_stds'],
62 | min_size=10
63 | )
64 |
65 | # compute map from the local files (detections plus ground-truth xmls)
66 | eval_by_local_files_and_gt_xmls(root_path,
67 | result_file_format,
68 | cache_dir,
69 | dataset_mode,
70 | config['evaluate_iou_threshold'],
71 | use_07_metric=use_07_metric, )
72 |
73 |
74 | def eval_by_local_files_and_gt_xmls(root_path,
75 | result_file_format,
76 | cache_dir,
77 | mode,
78 | prediction_iou_threshold,
79 | use_07_metric=True):
80 | annotation_file_format = os.path.join(root_path, 'Annotations', "{}.xml")
81 | imagesetfile = os.path.join(root_path, 'ImageSets', 'Main', '{}.txt'.format(mode))
82 | all_ap = .0
83 | for cls_name in class_list:
84 | if cls_name == '__background__':
85 | continue
86 | cur_res = voc_eval(result_file_format,
87 | annotation_file_format,
88 | imagesetfile,
89 | cls_name,
90 | cache_dir,
91 | ovthresh=prediction_iou_threshold,
92 | use_07_metric=use_07_metric,
93 | )
94 | tf.logging.info('class {} get ap {}'.format(cls_name, cur_res[2]))
95 | all_ap += cur_res[2]
96 | tf.logging.info('map {}'.format(all_ap / (len(class_list) - 1)))
97 |
98 |
99 | def _load_from_ckpt_file(model, ckpt_file_path):
100 | saver = eager_saver.Saver(model.variables)
101 | for var in model.variables:
102 | tf.logging.info('restore var {}'.format(var.name))
103 | if tf.train.latest_checkpoint(ckpt_file_path) is not None:
104 | saver.restore(tf.train.latest_checkpoint(ckpt_file_path))
105 | else:
106 | raise ValueError('unknown ckpt file {}'.format(ckpt_file_path))
107 |
108 |
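# Typical invocation of this script (paths and flag values are placeholders):
#   python scripts/eval_pascal.py /path/to/logs_dir/ckpt \
#       --dataset_mode test --year 2007 --model_type faster_rcnn --backbone vgg16 \
#       --root_path /path/to/VOCdevkit --result_file_dir /path/to/results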
109 | def parse_args():
110 | parser = argparse.ArgumentParser(description='Evaluate a Fast R-CNN model')
111 | parser.add_argument('ckpt_file_path', type=str, help='target ckpt file path', )
112 |
113 | parser.add_argument('--gpu_id', type=str, default='0')
114 |
115 | parser.add_argument('--dataset_type', help='type of dataset, cv2 or tf', default='cv2', type=str)
116 | parser.add_argument('--dataset_mode', type=str, default='test', help='one of [test, train, trainval, val]')
117 | parser.add_argument('--year', type=str, default='2007', help='one of [2007, 2012]')
118 |
119 | parser.add_argument('--model_type', type=str, default='faster_rcnn', help='one of [faster_rcnn, fpn]')
120 | parser.add_argument('--backbone', type=str, default='vgg16', help='one of [vgg16, resnet50, resnet101, resnet152]')
121 |
122 | parser.add_argument('--use_tf_faster_rcnn_model', type=bool, default=False,
123 | help='load tf-faster-rcnn model, only support resnet101 backbone')
124 | parser.add_argument('--use_fpn_tensorflow_model', default=False, type=bool,
125 | help='load fpn tensorflow model, only support resnet50 backbone')
126 | parser.add_argument('--use_local_result_files', default=False, type=bool)
127 |
128 | parser.add_argument('--use_07_metric', default=True, type=bool)
129 |
130 | # parser.add_argument('--root_path', help='path to pascal VOCdevkit',
131 | # default='D:\\data\\VOCdevkit', type=str)
132 | # parser.add_argument('--result_file_format', help='local detection result file pattern',
133 | # default='D:\\data\\VOCdevkit\\VOC2007\\results\\{:s}.txt', type=str)
134 | # parser.add_argument('--annotation_cache_dir', help='path to save annotation cache pickle file',
135 | # default='D:\\data\\VOCdevkit\\VOC2007\\results', type=str)
136 |
137 | parser.add_argument('--root_path', help='path to pascal VOCdevkit',
138 | default='/ssd/zhangyiyang/tf_eager_object_detection/VOCdevkit', type=str)
139 | parser.add_argument('--annotation_cache_dir', help='path to save annotation cache pickle file',
140 | default='/ssd/zhangyiyang/tf_eager_object_detection/results', type=str)
141 |
142 | # path to save detection result files
143 | parser.add_argument('--result_file_dir', help='local detection result file pattern',
144 | default='/ssd/zhangyiyang/tf_eager_object_detection/results', type=str)
145 | parser.add_argument('--logs_name', default=None, type=str)
146 |
147 | if len(sys.argv) == 1:
148 | parser.print_help()
149 | sys.exit(1)
150 |
151 | args = parser.parse_args()
152 | return args
153 |
154 |
155 | def main(args):
156 | model_config = config_factory('pascal', args.model_type)
157 |
158 | # get result file format
159 | # {args.result_file_dir}/{args.model_type}/{args.backbone}/{logs_name}/{}.txt
160 | logs_name = args.logs_name if args.logs_name is not None else 'default'
161 | result_file_dir = os.path.join(args.result_file_dir, args.model_type, args.backbone, logs_name)
162 | if not os.path.exists(result_file_dir):
163 | os.makedirs(result_file_dir)
164 | result_file_path = os.path.join(result_file_dir, '{}.txt')
165 |
166 | if args.use_local_result_files:
167 | # local result files already exist; evaluate from them directly
168 | eval_by_local_files_and_gt_xmls(root_path=args.root_path,
169 | result_file_format=result_file_path,
170 | cache_dir=args.annotation_cache_dir,
171 | mode=args.dataset_mode,
172 | prediction_iou_threshold=model_config['evaluate_iou_threshold']
173 | )
174 | return
175 |
176 | # validate arguments
177 | if args.year not in ['2007', '2012']:
178 | raise ValueError('unknown pascal year {}'.format(args.year))
179 |
180 | # settings required for eager mode
181 |
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
182 | config = tf.ConfigProto(allow_soft_placement=True)
183 | config.gpu_options.allow_growth = True
184 | tf.enable_eager_execution(config=config)
185 | tf.logging.set_verbosity(tf.logging.INFO)
186 |
187 | # build the model and initialize its variables
188 | cur_model = model_factory(args.model_type, args.backbone, model_config)
189 | preprocessing_type = 'caffe'
190 | cur_model(tf.to_float(np.random.rand(1, 800, 600, 3)), False)
191 |
192 | # load the pre-trained model
193 | image_format = 'bgr'
194 | if args.use_tf_faster_rcnn_model:
195 | cur_model.load_tf_faster_rcnn_tf_weights(args.ckpt_file_path)
196 | elif args.use_fpn_tensorflow_model:
197 | image_format = 'rgb'
198 | cur_model.load_fpn_tensorflow_weights(args.ckpt_file_path)
199 | else:
200 | _load_from_ckpt_file(cur_model, args.ckpt_file_path)
201 |
202 | # write the predictions to file and evaluate them
203 | eval_from_scratch(cur_model,
204 | dataset_type=args.dataset_type,
205 | dataset_mode=args.dataset_mode,
206 | image_format=image_format,
207 | preprocessing_type=preprocessing_type,
208 | root_path=os.path.join(args.root_path, 'VOC' + str(args.year)),
209 | result_file_format=result_file_path,
210 | cache_dir=args.annotation_cache_dir,
211 | use_07_metric=args.use_07_metric,
212 | config=model_config)
213 |
214 |
215 | if __name__ == '__main__':
216 | main(parse_args())
217 | -------------------------------------------------------------------------------- /scripts/generate_pascal_tf_records.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import tensorflow as tf
4 | import argparse
5 | import object_detection.dataset.utils.tf_record_utils as dataset_utils
6 | import object_detection.dataset.utils.label_map_utils as label_map_utils
7 | from tqdm import tqdm
8 | from lxml import etree
9 |
10 |
11 | def _get_tf_example(xml_dict, label_map_dict, image_path):
12 | with open(image_path, 'rb') as image:
13 | encoded_jpg = image.read()
14 | width = int(xml_dict['size']['width'])
15 | height = int(xml_dict['size']['height'])
16 |
17 | xmin = []
18 | ymin = []
19 | xmax = []
20 | ymax = []
21 | classes = []
22 | classes_text = []
23 | if 'object' in xml_dict:
24 | for obj in xml_dict['object']:
25 | xmin.append(float(int(obj['bndbox']['xmin']) - 1) / (width - 1))
26 | ymin.append(float(int(obj['bndbox']['ymin']) - 1) / (height - 1))
27 | xmax.append(float(int(obj['bndbox']['xmax']) - 1) / (width - 1))
28 | ymax.append(float(int(obj['bndbox']['ymax']) - 1) / (height - 1))
29 | classes_text.append(obj['name'].encode('utf8'))
30 | classes.append(label_map_dict[obj['name']])
31 |
32 | example = tf.train.Example(features=tf.train.Features(feature={
33 | 'image/height': dataset_utils.int64_feature(height),
34 | 'image/width': dataset_utils.int64_feature(width),
35 | 'image/filename': dataset_utils.bytes_feature(xml_dict['filename'].encode('utf8')),
36 | 'image/encoded': dataset_utils.bytes_feature(encoded_jpg),
37 | 'image/object/bbox/xmin': dataset_utils.float_list_feature(xmin),
38 | 'image/object/bbox/xmax': dataset_utils.float_list_feature(xmax),
39 | 'image/object/bbox/ymin': dataset_utils.float_list_feature(ymin),
40 | 'image/object/bbox/ymax': dataset_utils.float_list_feature(ymax),
41 | 'image/object/class/label': dataset_utils.int64_list_feature(classes),
42 | 'image/object/class/text': dataset_utils.bytes_list_feature(classes_text),
43 | }))
44 | return example
45 |
46 |
47 | def main(args):
48 | writers = dataset_utils.get_multi_tf_record_writers(base_path=args.writer_base_path,
49 |
47 | def main(args):
48 |     writers = dataset_utils.get_multi_tf_record_writers(base_path=args.writer_base_path,
49 |                                                          file_pattern=args.writer_file_pattern,
50 |                                                          year=args.year,
51 |                                                          number=args.writers_number,
52 |                                                          mode=args.mode)
53 |     label_map_dict = label_map_utils.get_label_map_dict(args.label_map_path)
54 |     if args.year == "2007":
55 |         years = ["VOC2007"]
56 |     elif args.year == "2012":
57 |         years = ["VOC2012"]
58 |     elif args.year == "0712":
59 |         years = ["VOC2007", "VOC2012"]
60 |     else:
61 |         raise ValueError('unknown year {}'.format(args.year))
62 | 
63 |     annotation_file_paths_list = []
64 |     root_paths = []
65 |     for year in years:
66 |         with open(os.path.join(args.data_root_path, year, 'ImageSets', 'Main', 'aeroplane_%s.txt' % args.mode),
67 |                   'r') as f:
68 |             lines = f.readlines()
69 |         cur_annotation_list = [
70 |             os.path.join(args.data_root_path, year, 'Annotations', line.strip().split(' ')[0] + '.xml')
71 |             for line in lines
72 |         ]
73 |         cur_root_paths = [os.path.join(args.data_root_path, year)] * len(lines)
74 | 
75 |         annotation_file_paths_list += cur_annotation_list
76 |         root_paths += cur_root_paths
77 | 
78 |     for idx, (annotation_file_path, root_path) in enumerate(tqdm(zip(annotation_file_paths_list, root_paths), total=len(root_paths))):  # zip() hides the length from tqdm
79 |         with open(annotation_file_path, 'r') as f:
80 |             xml_str = f.read()
81 |         xml_dict = dataset_utils.recursive_parse_xml_to_dict(etree.fromstring(xml_str))['annotation']
82 |         tf_example = _get_tf_example(xml_dict, label_map_dict,
83 |                                      os.path.join(root_path, 'JPEGImages', xml_dict['filename']))
84 |         writers[idx % args.writers_number].write(tf_example.SerializeToString())
85 |     for writer in writers:
86 |         writer.close()
87 | 
88 | 
89 | def _parse_arguments(argv):
90 |     parser = argparse.ArgumentParser()
91 |     parser.add_argument('--mode', type=str, default="trainval")
92 |     parser.add_argument('--year', type=str, default="2007", help="one of [2007, 2012, 0712]")
93 |     parser.add_argument('--writer_file_pattern', type=str, default='pascal_%s_%s_%02d.tfrecords',
94 |                         help='tf records output file name pattern')
95 |     parser.add_argument('--writers_number', type=int, default=5, help='split tf records into several files.')
96 | 
97 |     parser.add_argument('--writer_base_path', type=str, default="/path/to/tf_eager_records",
98 |                         help='path to save generated tf record files.')
99 |     parser.add_argument('--label_map_path', type=str,
100 |                         help='path to pascal_label_map.pbtxt, already exists in ./scripts/label_map_src/',
101 |                         default='./scripts/label_map_src/pascal_label_map.pbtxt')
102 |     parser.add_argument('--data_root_path', type=str, default='/path/to/VOCdevkit')
103 | 
104 |     # parser.add_argument('--writer_base_path', type=str, default="D:\\data\\VOCdevkit\\tf_eager_records")
105 |     # parser.add_argument('--label_map_path', type=str,
106 |     #                     help='path to pascal_label_map.pbtxt, already exists in ./scripts/label_map_src/',
107 |     #                     default='./scripts/label_map_src/pascal_label_map.pbtxt')
108 |     # parser.add_argument('--data_root_path', type=str, default='D:\\data\\VOCdevkit')
109 | 
110 |     return parser.parse_args(argv)
111 | 
112 | 
113 | if __name__ == '__main__':
114 |     main(_parse_arguments(sys.argv[1:]))
115 | 
--------------------------------------------------------------------------------
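The records written above can be read back using the feature keys from `_get_tf_example`. A minimal sketch, not part of the repo (the file name follows the default `--writer_file_pattern`; only a subset of features is parsed, and the dtypes are assumptions based on the `float_list_feature`/`int64_list_feature` helpers):

```python
import tensorflow as tf

def _parse(serialized):
    # parse a single serialized tf.train.Example written by _get_tf_example
    features = tf.parse_single_example(serialized, {
        'image/encoded': tf.FixedLenFeature([], tf.string),
        'image/object/bbox/xmin': tf.VarLenFeature(tf.float32),
        'image/object/class/label': tf.VarLenFeature(tf.int64),
    })
    image = tf.image.decode_jpeg(features['image/encoded'], channels=3)
    return (image,
            features['image/object/bbox/xmin'].values,
            features['image/object/class/label'].values)

dataset = tf.data.TFRecordDataset(['pascal_2007_trainval_00.tfrecords']).map(_parse)
```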
/scripts/label_map_src/pascal_label_map.pbtxt:
--------------------------------------------------------------------------------
1 | item {
2 |   id: 1
3 |   name: 'aeroplane'
4 | }
5 | 
6 | item {
7 |   id: 2
8 |   name: 'bicycle'
9 | }
10 | 
11 | item {
12 |   id: 3
13 |   name: 'bird'
14 | }
15 | 
16 | item {
17 |   id: 4
18 |   name: 'boat'
19 | }
20 | 
21 | item {
22 |   id: 5
23 |   name: 'bottle'
24 | }
25 | 
26 | item {
27 |   id: 6
28 |   name: 'bus'
29 | }
30 | 
31 | item {
32 |   id: 7
33 |   name: 'car'
34 | }
35 | 
36 | item {
37 |   id: 8
38 |   name: 'cat'
39 | }
40 | 
41 | item {
42 |   id: 9
43 |   name: 'chair'
44 | }
45 | 
46 | item {
47 |   id: 10
48 |   name: 'cow'
49 | }
50 | 
51 | item {
52 |   id: 11
53 |   name: 'diningtable'
54 | }
55 | 
56 | item {
57 |   id: 12
58 |   name: 'dog'
59 | }
60 | 
61 | item {
62 |   id: 13
63 |   name: 'horse'
64 | }
65 | 
66 | item {
67 |   id: 14
68 |   name: 'motorbike'
69 | }
70 | 
71 | item {
72 |   id: 15
73 |   name: 'person'
74 | }
75 | 
76 | item {
77 |   id: 16
78 |   name: 'pottedplant'
79 | }
80 | 
81 | item {
82 |   id: 17
83 |   name: 'sheep'
84 | }
85 | 
86 | item {
87 |   id: 18
88 |   name: 'sofa'
89 | }
90 | 
91 | item {
92 |   id: 19
93 |   name: 'train'
94 | }
95 | 
96 | item {
97 |   id: 20
98 |   name: 'tvmonitor'
99 | }
100 | 
--------------------------------------------------------------------------------
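`generate_pascal_tf_records.py` above looks classes up via `label_map_utils.get_label_map_dict`, indexing the result by class name, so for this pbtxt the mapping is name to integer id. A short sketch (the dict-of-ints return format is inferred from that usage, not checked against the utility's source):

```python
from object_detection.dataset.utils import label_map_utils

label_map_dict = label_map_utils.get_label_map_dict('./scripts/label_map_src/pascal_label_map.pbtxt')
assert label_map_dict['aeroplane'] == 1   # item id 1 above
assert label_map_dict['tvmonitor'] == 20  # item id 20 above
```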
/scripts/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import argparse
4 | import numpy as np
5 | import tensorflow as tf
6 | 
7 | from object_detection.model.model_factory import model_factory
8 | from object_detection.config.config_factory import config_factory
9 | from object_detection.utils.visual_utils import show_one_image
10 | from object_detection.dataset.dataset_factory import dataset_factory
11 | from tensorflow.contrib.summary import summary
12 | from tensorflow.contrib.eager.python import saver as eager_saver
13 | from tensorflow.python.platform import tf_logging
14 | from tqdm import tqdm
15 | 
16 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
17 | tf_logging.set_verbosity(tf_logging.INFO)
18 | 
19 | CONFIG = None
20 | 
21 | 
22 | def train_step(model, loss, tape, optimizer):
23 |     all_vars = model.variables
24 |     gradients = tape.gradient(loss, all_vars)
25 | 
26 |     if CONFIG['learning_rate_bias_double']:
27 |         all_grads = []
28 |         all_vars = []
29 |         for grad, var in zip(gradients, model.variables):
30 |             if grad is None:
31 |                 continue
32 |             scale = 1.0
33 |             if 'bias' in var.name:
34 |                 scale = 2.0
35 |             all_grads.append(grad * scale)
36 |             all_vars.append(var)
37 |         gradients = all_grads
38 | 
39 |     optimizer.apply_gradients(zip(gradients, all_vars),
40 |                               global_step=tf.train.get_or_create_global_step())
41 | 
42 | 
43 | def _get_default_optimizer(use_adam):
44 |     lr = tf.train.piecewise_constant(tf.train.get_or_create_global_step(),
45 |                                      boundaries=CONFIG['learning_rate_multi_decay_steps'],
46 |                                      values=CONFIG['learning_rate_multi_lrs'])
47 |     if use_adam:
48 |         return tf.train.AdamOptimizer(lr)
49 |     else:
50 |         return tf.train.MomentumOptimizer(lr, momentum=CONFIG['optimizer_momentum'])
51 | 
52 | 
53 | def _get_training_dataset(preprocessing_type='caffe', dataset_type='pascal',
54 |                           coco_year="2017",
55 |                           pascal_year="2007", pascal_mode='trainval', pascal_tf_records_num=5,
56 |                           data_root_path=None):
57 |     if dataset_type == 'pascal':
58 |         base_pattern = 'pascal_{}_{}_%02d.tfrecords'.format(pascal_year, pascal_mode)
59 |         file_names = [os.path.join(data_root_path, base_pattern % i) for i in range(pascal_tf_records_num)]
60 |         dataset_configs = {'tf_records_list': file_names,
61 |                            'min_size': CONFIG['image_min_size'], 'max_size': CONFIG['image_max_size'],
62 |                            'preprocessing_type': preprocessing_type, 'caffe_pixel_means': CONFIG['bgr_pixel_means'],
63 |                            'argument': True, }
64 |         dataset = dataset_factory('pascal', 'train', dataset_configs)
65 |     elif dataset_type == 'coco':
66 |         dataset_configs = {'root_dir': data_root_path,
67 |                            'mode': 'train', 'year': coco_year,
68 |                            'min_size': CONFIG['image_min_size'], 'max_size': CONFIG['image_max_size'],
69 |                            'preprocessing_type': preprocessing_type, 'caffe_pixel_means': CONFIG['bgr_pixel_means'],
70 |                            'argument': True, }
71 |         dataset = dataset_factory('coco', 'train', dataset_configs)
72 |     else:
73 |         raise ValueError('unknown dataset type {}'.format(dataset_type))
74 |     return dataset
75 | 
76 | 
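# NOTE: a minimal sketch of the schedule built in _get_default_optimizer above,
# with made-up numbers (the real boundaries/values come from the config):
#     lr = tf.train.piecewise_constant(tf.train.get_or_create_global_step(),
#                                      boundaries=[80000], values=[1e-3, 1e-4])
# tf.train.piecewise_constant expects len(values) == len(boundaries) + 1; here
# lr is 1e-3 while global_step <= 80000 and 1e-4 afterwards.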
77 | def train_one_epoch(dataset, base_model, optimizer,
78 |                     preprocessing_type,
79 |                     logging_every_n_steps,
80 |                     summary_every_n_steps,
81 |                     saver, save_every_n_steps, save_path):
82 |     idx = 0
83 | 
84 |     for image, gt_bboxes, gt_labels in tqdm(dataset):
85 |         # bgr input
86 |         # for keras application pre-trained models, use bgr
87 | 
88 |         # convert ymin xmin ymax xmax -> xmin ymin xmax ymax (see the note after this function)
89 |         gt_bboxes = tf.squeeze(gt_bboxes, axis=0)
90 |         channels = tf.split(gt_bboxes, 4, axis=1)
91 |         gt_bboxes = tf.concat([
92 |             channels[1], channels[0], channels[3], channels[2]
93 |         ], axis=1)
94 | 
95 |         # set labels to int32
96 |         gt_labels = tf.to_int32(tf.squeeze(gt_labels, axis=0))
97 | 
98 |         # train one step
99 |         with tf.GradientTape() as tape:
100 |             rpn_cls_loss, rpn_reg_loss, roi_cls_loss, roi_reg_loss = base_model((image, gt_bboxes, gt_labels), True)
101 |             l2_loss = tf.add_n(base_model.losses)
102 |             total_loss = rpn_cls_loss + rpn_reg_loss + roi_cls_loss + roi_reg_loss + l2_loss
103 |         train_step(base_model, total_loss, tape, optimizer)  # gradients are taken outside the tape context
104 | 
105 |         # summary
106 |         if idx % summary_every_n_steps == 0:
107 |             summary.scalar("l2_loss", l2_loss)
108 |             summary.scalar("rpn_cls_loss", rpn_cls_loss)
109 |             summary.scalar("rpn_reg_loss", rpn_reg_loss)
110 |             summary.scalar("roi_cls_loss", roi_cls_loss)
111 |             summary.scalar("roi_reg_loss", roi_reg_loss)
112 |             summary.scalar("total_loss", total_loss)
113 | 
114 |             pred_bboxes, pred_labels, pred_scores = base_model(image, False)
115 | 
116 |             if pred_bboxes is not None:
117 |                 selected_idx = tf.where(pred_scores >= CONFIG['show_image_score_threshold'])[:, 0]
118 |                 if tf.size(selected_idx) != 0:
119 |                     # show gt
120 |                     gt_channels = tf.split(gt_bboxes, 4, axis=1)
121 |                     show_gt_bboxes = tf.concat([gt_channels[1], gt_channels[0], gt_channels[3], gt_channels[2]], axis=1)
122 |                     gt_image = show_one_image(tf.squeeze(image, axis=0).numpy(), show_gt_bboxes.numpy(),
123 |                                               gt_labels.numpy(),
124 |                                               preprocessing_type=preprocessing_type,
125 |                                               caffe_pixel_means=CONFIG['bgr_pixel_means'],
126 |                                               enable_matplotlib=False)
127 |                     tf.contrib.summary.image("gt_image", tf.expand_dims(gt_image, axis=0))
128 | 
129 |                     # show pred
130 |                     pred_bboxes = tf.gather(pred_bboxes, selected_idx)
131 |                     pred_labels = tf.gather(pred_labels, selected_idx)
132 |                     channels = tf.split(pred_bboxes, num_or_size_splits=4, axis=1)
133 |                     show_pred_bboxes = tf.concat([
134 |                         channels[1], channels[0], channels[3], channels[2]
135 |                     ], axis=1)
136 |                     pred_image = show_one_image(tf.squeeze(image, axis=0).numpy(),
137 |                                                 show_pred_bboxes.numpy(),
138 |                                                 pred_labels.numpy(),
139 |                                                 preprocessing_type=preprocessing_type,
140 |                                                 caffe_pixel_means=CONFIG['bgr_pixel_means'],
141 |                                                 enable_matplotlib=False)
142 |                     tf.contrib.summary.image("pred_image", tf.expand_dims(pred_image, axis=0))
143 | 
144 |         # logging
145 |         if idx % logging_every_n_steps == 0:
146 |             if isinstance(optimizer, tf.train.AdamOptimizer):
147 |                 show_lr = optimizer._lr()
148 |             else:
149 |                 show_lr = optimizer._learning_rate()
150 |             logging_format = 'steps %d, lr is %.5f, loss: %.4f, %.4f, %.4f, %.4f, %.4f, %.4f'
151 |             tf_logging.info(logging_format % (idx + 1, show_lr,
152 |                                               rpn_cls_loss, rpn_reg_loss, roi_cls_loss, roi_reg_loss,
153 |                                               l2_loss, total_loss))
154 | 
155 |         # saving
156 |         if saver is not None and save_path is not None and idx % save_every_n_steps == 0 and idx != 0:
157 |             saver.save(os.path.join(save_path, 'model.ckpt'), global_step=tf.train.get_or_create_global_step())
158 | 
159 |         idx += 1
160 | 
161 | 
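# NOTE: a minimal sketch of the column swap used in train_one_epoch above,
# for a single (ymin, xmin, ymax, xmax) box:
#     boxes = tf.constant([[10., 20., 30., 40.]])
#     c = tf.split(boxes, 4, axis=1)
#     tf.concat([c[1], c[0], c[3], c[2]], axis=1)  # -> [[20., 10., 40., 30.]]
# Swapping the pairs is its own inverse, so the same trick converts boxes back
# before drawing the summary images.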
162 | def train(training_dataset,
163 |           preprocessing_type,
164 | 
165 |           base_model,
166 | 
167 |           optimizer,
168 | 
169 |           logging_every_n_steps,
170 |           save_every_n_steps,
171 |           summary_every_n_steps,
172 | 
173 |           train_dir,
174 |           ckpt_dir,
175 |           restore_ckpt_file_path,
176 |           ):
177 |     # saver over the model variables and the global step, used to restore pre-trained weights
178 |     variables = base_model.variables + [tf.train.get_or_create_global_step()]
179 |     saver = eager_saver.Saver(variables)
180 | 
181 |     # ckpt file specified on the command line
182 |     if restore_ckpt_file_path is not None:
183 |         saver.restore(restore_ckpt_file_path)
184 | 
185 |     # latest checkpoint in the current logs_dir, used to resume training (overrides the one above if present)
186 |     if tf.train.latest_checkpoint(ckpt_dir) is not None:
187 |         saver.restore(tf.train.latest_checkpoint(ckpt_dir))
188 | 
189 |     train_writer = tf.contrib.summary.create_file_writer(train_dir, flush_millis=100000)
190 |     for i in range(CONFIG['epochs']):
191 |         tf_logging.info('epoch %d starting...' % (i + 1))
192 |         start = time.time()
193 |         with train_writer.as_default(), summary.always_record_summaries():
194 |             train_one_epoch(dataset=training_dataset, base_model=base_model,
195 |                             optimizer=optimizer, preprocessing_type=preprocessing_type,
196 |                             logging_every_n_steps=logging_every_n_steps,
197 |                             summary_every_n_steps=summary_every_n_steps,
198 |                             saver=saver, save_every_n_steps=save_every_n_steps, save_path=ckpt_dir,
199 |                             )
200 |         tf.set_random_seed(1)
201 |         train_end = time.time()
202 |         tf_logging.info('epoch %d training finished, costing %d seconds...' % (i + 1, train_end - start))
203 | 
204 | 
205 | def parse_args():
206 |     """
207 |     Parse input arguments
208 |     """
209 |     parser = argparse.ArgumentParser(description='Train a model')
210 |     parser.add_argument('--gpu_id', default="0", type=str, help='used in sys variable CUDA_VISIBLE_DEVICES')
211 | 
212 |     parser.add_argument('--model_type', type=str, default='faster_rcnn',
213 |                         help='one of [faster_rcnn, fpn]')
214 |     parser.add_argument('--backbone', type=str, default='resnet50',
215 |                         help='one of [vgg16, resnet50, resnet101, resnet152]')
216 | 
217 |     parser.add_argument('--data_type', default="pascal", type=str, help='pascal or coco')
218 | 
219 |     # coco
220 |     parser.add_argument('--coco_year', default="2017", type=str, help='one of [2014, 2017]')
221 | 
222 |     # pascal
223 |     parser.add_argument('--pascal_year', default="2007", type=str, help='one of [2007, 2012, 0712]')
224 |     parser.add_argument('--pascal_mode', default="trainval", type=str, help='one of [trainval, train, val]')
225 |     parser.add_argument('--pascal_tf_records_num', default=5, type=int, help='number of pascal tf records')
226 | 
227 |     parser.add_argument('--logging_every_n_steps', default=100, type=int)
228 |     parser.add_argument('--saving_every_n_steps', default=5000, type=int)
229 |     parser.add_argument('--summary_every_n_steps', default=100, type=int)
230 |     parser.add_argument('--restore_ckpt_path', type=str, default=None)
231 | 
232 |     parser.add_argument('--use_adam', type=lambda s: s.lower() in ('true', '1'), default=False)  # type=bool would parse 'False' as True
233 | 
234 |     parser.add_argument('--logs_name', type=str, default='default',
235 |                         help='logs dir name pattern is `logs-{data_type}-{model_type}-{backbone}-{logs_name}`', )
236 | 
237 |     # parser.add_argument('--data_root_path', default="/ssd/zhangyiyang/COCO2017", type=str)
238 |     parser.add_argument('--data_root_path', type=str,
239 |                         help='path to tfrecord files if pascal, path to coco root if coco',
240 |                         default="/ssd/zhangyiyang/tf_eager_object_detection/VOCdevkit/tf_eager_records")
241 |     parser.add_argument('--logs_dir', type=str, help='path to save ckpt files and tensorboard summaries.',
242 |                         default="/ssd/zhangyiyang/tf_eager_object_detection/logs")
243 | 
244 |     # # parser.add_argument('--data_root_path', default="D:\\data\\COCO2017", type=str)
245 |     # parser.add_argument('--data_root_path', default="D:\\data\\VOCdevkit\\tf_eager_records\\", type=str)
246 |     # parser.add_argument('--logs_dir', default="D:\\data\\logs\\logs-pascal", type=str)
247 | 
248 |     args = parser.parse_args()
249 |     return args
250 | 
251 | 
252 | def main(args):
253 |     global CONFIG
254 |     CONFIG = config_factory(args.data_type, args.model_type)
255 | 
256 |     # basic settings required by tensorflow eager mode
257 |     os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
258 |     config = tf.ConfigProto(allow_soft_placement=True)
259 |     config.gpu_options.allow_growth = True
260 |     # config.log_device_placement = True
261 |     tf.enable_eager_execution(config=config)
262 | 
263 |     # build the model and initialize its variables
264 |     cur_model = model_factory(args.model_type, args.backbone, CONFIG)
265 |     preprocessing_type = 'caffe'
266 |     cur_model(tf.to_float(np.random.rand(1, 800, 600, 3)), False)
267 | 
268 |     # basic logs info
269 |     # logs-{data_type}-{model_type}-{backbone}-{logs_name}
270 |     logs_name_pattern = 'logs-{}-{}-{}-{}'
271 |     logs_path_name = logs_name_pattern.format(args.data_type, args.model_type, args.backbone, args.logs_name)
272 | 
273 |     # start training
274 |     train(training_dataset=_get_training_dataset(preprocessing_type=preprocessing_type,
275 |                                                  dataset_type=args.data_type,
276 |                                                  coco_year=args.coco_year,
277 |                                                  pascal_year=args.pascal_year,
278 |                                                  pascal_mode=args.pascal_mode,
279 |                                                  pascal_tf_records_num=args.pascal_tf_records_num,
280 |                                                  data_root_path=args.data_root_path),
281 |           preprocessing_type=preprocessing_type,
282 | 
283 |           base_model=cur_model,
284 | 
285 |           optimizer=_get_default_optimizer(args.use_adam),
286 | 
287 |           logging_every_n_steps=args.logging_every_n_steps,
288 |           save_every_n_steps=args.saving_every_n_steps,
289 |           summary_every_n_steps=args.summary_every_n_steps,
290 | 
291 |           train_dir=os.path.join(args.logs_dir, logs_path_name, 'train'),
292 |           ckpt_dir=os.path.join(args.logs_dir, logs_path_name, 'ckpt'),
293 |           restore_ckpt_file_path=args.restore_ckpt_path,
294 |           )
295 | 
296 | 
297 | if __name__ == '__main__':
298 |     main(parse_args())
299 | 
--------------------------------------------------------------------------------
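With the default flags above, `train.py` keeps each run under a single directory named by `logs_name_pattern`. A sketch of the resulting layout (the checkpoint file names follow from the `saver.save(..., 'model.ckpt', global_step=...)` call in `train_one_epoch`):

```
{logs_dir}/logs-pascal-faster_rcnn-resnet50-default/
├── train/   # tensorboard event files written via create_file_writer
└── ckpt/    # model.ckpt-{global_step} files; also restored from here to resume training
```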