├── Fashion_Test.py
├── Fashion_Train.py
├── KerasRFCN
    ├── Config.py
    ├── Data_generator.py
    ├── Losses.py
    ├── Model
    │   ├── BaseModel.py
    │   ├── Model.py
    │   ├── ResNet.py
    │   └── ResNet_dilated.py
    ├── Utils.py
    └── __init__.py
├── LICENSE
├── README.md
├── ReadmeImages
    ├── 1.png
    ├── result_1.jpg
    └── result_2.jpg
└── data.pk


/Fashion_Test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | '''
  9 | This is a demo to Eval a RFCN model with DeepFashion Dataset
 10 | http://mmlab.ie.cuhk.edu.hk/projects/DeepFashion.html
 11 | '''
 12 | 
 13 | from KerasRFCN.Model.Model import RFCN_Model
 14 | from KerasRFCN.Config import Config
 15 | import KerasRFCN.Utils 
 16 | import os
 17 | from keras.preprocessing import image
 18 | import pickle
 19 | import numpy as np
 20 | import argparse
 21 | import matplotlib.pyplot as plt
 22 | import matplotlib.patches as patches
 23 | 
 24 | class RFCNNConfig(Config):
 25 |     """Configuration for training on the toy shapes dataset.
 26 |     Derives from the base Config class and overrides values specific
 27 |     to the toy shapes dataset.
 28 |     """
 29 |     # Give the configuration a recognizable name
 30 |     NAME = "Fashion"
 31 | 
 32 |     # Backbone model
 33 |     # choose one from ['resnet50', 'resnet101', 'resnet50_dilated', 'resnet101_dilated']
 34 |     BACKBONE = "resnet101"
 35 |     
 36 |     # Train on 1 GPU and 8 images per GPU. We can put multiple images on each
 37 |     # GPU because the images are small. Batch size is 8 (GPUs * images/GPU).
 38 |     GPU_COUNT = 1
 39 |     IMAGES_PER_GPU = 1
 40 | 
 41 |     # Number of classes (including background)
 42 |     C = 1 + 46  # background + 2 tags
 43 |     NUM_CLASSES = C
 44 |     # Use small images for faster training. Set the limits of the small side
 45 |     # the large side, and that determines the image shape.
 46 |     IMAGE_MIN_DIM = 640
 47 |     IMAGE_MAX_DIM = 768
 48 | 
 49 |     # Use smaller anchors because our image and objects are small
 50 |     RPN_ANCHOR_SCALES = (32, 64, 128, 256, 512)  # anchor side in pixels
 51 |     # Use same strides on stage 4-6 if use dilated resnet of DetNet
 52 |     # Like BACKBONE_STRIDES = [4, 8, 16, 16, 16]
 53 |     BACKBONE_STRIDES = [4, 8, 16, 32, 64]
 54 |     # Reduce training ROIs per image because the images are small and have
 55 |     # few objects. Aim to allow ROI sampling to pick 33% positive ROIs.
 56 |     TRAIN_ROIS_PER_IMAGE = 200
 57 | 
 58 |     # Use a small epoch since the data is simple
 59 |     STEPS_PER_EPOCH = 100
 60 | 
 61 |     # use small validation steps since the epoch is small
 62 |     VALIDATION_STEPS = 5
 63 | 
 64 |     RPN_NMS_THRESHOLD = 0.7
 65 | 
 66 |     DETECTION_MIN_CONFIDENCE = 0.4
 67 |     POOL_SIZE = 7
 68 | 
 69 | 
 70 | def Test(model, loadpath, savepath):
 71 |     assert not loadpath == savepath, "loadpath should'n same with savepath"
 72 | 
 73 |     model_path = model.find_last()[1]
 74 |     # Load trained weights (fill in path to trained weights here)
 75 |     
 76 |     model.load_weights(model_path, by_name=True)
 77 |     print("Loading weights from ", model_path)
 78 | 
 79 |     if os.path.isdir(loadpath):
 80 |         for idx, imgname in enumerate(os.listdir(loadpath)):
 81 |             if not imgname.lower().endswith(('.bmp', '.jpeg', '.jpg', '.png', '.tif', '.tiff')):
 82 |                 continue
 83 |             print(imgname)
 84 |             imageoriChannel = np.array(plt.imread( os.path.join(loadpath, imgname) )) / 255.0
 85 |             img = image.img_to_array( image.load_img(os.path.join(loadpath, imgname)) )
 86 |             TestSinglePic(img, imageoriChannel, model, savepath=savepath, imgname=imgname)
 87 |             
 88 |     elif os.path.isfile(loadpath):
 89 |         if not loadpath.lower().endswith(('.bmp', '.jpeg', '.jpg', '.png', '.tif', '.tiff')):
 90 |             print("not image file!")
 91 |             return
 92 |         print(loadpath)
 93 |         imageoriChannel = np.array(plt.imread( loadpath )) / 255.0
 94 |         img = image.img_to_array( image.load_img(loadpath) )
 95 |         (filename,extension) = os.path.splitext(loadpath)
 96 |         TestSinglePic(img, imageoriChannel, model, savepath=savepath, imgname=filename)
 97 |     
 98 | def TestSinglePic(image, image_ori, model, savepath, imgname):
 99 |     r = model.detect([image], verbose=1)[0]
100 |     print(r)
101 |     def get_ax(rows=1, cols=1, size=8):
102 |         _, ax = plt.subplots(rows, cols, figsize=(size*cols, size*rows))
103 |         return ax
104 | 
105 |     ax = get_ax(1)
106 | 
107 |     assert not savepath == "", "empty save path"
108 |     assert not imgname == "", "empty image file name"
109 | 
110 |     for box in r['rois']:
111 |         y1, x1, y2, x2 = box
112 |         p = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
113 |                               alpha=0.7, linestyle="dashed",
114 |                               edgecolor="red", facecolor='none')
115 |         ax.add_patch(p)
116 |     ax.imshow(image_ori)
117 | 
118 |     plt.savefig(os.path.join(savepath, imgname),bbox_inches='tight')
119 |     plt.clf()
120 | 
121 | if __name__ == '__main__':
122 |     ROOT_DIR = os.getcwd()
123 |     parser = argparse.ArgumentParser()
124 | 
125 |     parser.add_argument('--loadpath', required=False,
126 |                 default="images/",
127 |                 metavar="evaluate images loadpath",
128 |                 help="evaluate images loadpath")
129 |     parser.add_argument('--savepath', required=False,
130 |             default="result/",
131 |             metavar="evaluate images savepath",
132 |             help="evaluate images savepath")
133 | 
134 |     config = RFCNNConfig()
135 |     args = parser.parse_args()
136 | 
137 |     model = RFCN_Model(mode="inference", config=config,
138 |                       model_dir=os.path.join(ROOT_DIR, "logs") )
139 | 
140 |     Test(model, args.loadpath, args.savepath)


--------------------------------------------------------------------------------
/Fashion_Train.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | '''
  9 | This is a demo to TRAIN a RFCN model with DeepFashion Dataset
 10 | http://mmlab.ie.cuhk.edu.hk/projects/DeepFashion.html
 11 | '''
 12 | 
 13 | from KerasRFCN.Model.Model import RFCN_Model
 14 | from KerasRFCN.Config import Config
 15 | from KerasRFCN.Utils import Dataset
 16 | import os
 17 | import pickle
 18 | import numpy as np
 19 | from PIL import Image
 20 | 
 21 | ############################################################
 22 | #  Config
 23 | ############################################################
 24 | 
 25 | class RFCNNConfig(Config):
 26 |     """Configuration for training on the toy shapes dataset.
 27 |     Derives from the base Config class and overrides values specific
 28 |     to the toy shapes dataset.
 29 |     """
 30 |     # Give the configuration a recognizable name
 31 |     NAME = "Fashion"
 32 | 
 33 |     # Backbone model
 34 |     # choose one from ['resnet50', 'resnet101', 'resnet50_dilated', 'resnet101_dilated']
 35 |     BACKBONE = "resnet101"
 36 | 
 37 |     # Train on 1 GPU and 8 images per GPU. We can put multiple images on each
 38 |     # GPU because the images are small. Batch size is 8 (GPUs * images/GPU).
 39 |     GPU_COUNT = 1
 40 |     IMAGES_PER_GPU = 1
 41 | 
 42 |     # Number of classes (including background)
 43 |     C = 1 + 46  # background + 2 tags
 44 |     NUM_CLASSES = C
 45 |     # Use small images for faster training. Set the limits of the small side
 46 |     # the large side, and that determines the image shape.
 47 |     IMAGE_MIN_DIM = 640
 48 |     IMAGE_MAX_DIM = 768
 49 | 
 50 |     # Use smaller anchors because our image and objects are small
 51 |     RPN_ANCHOR_SCALES = (32, 64, 128, 256, 512)  # anchor side in pixels
 52 |     # Use same strides on stage 4-6 if use dilated resnet of DetNet
 53 |     # Like BACKBONE_STRIDES = [4, 8, 16, 16, 16]
 54 |     BACKBONE_STRIDES = [4, 8, 16, 32, 64]
 55 |     # Reduce training ROIs per image because the images are small and have
 56 |     # few objects. Aim to allow ROI sampling to pick 33% positive ROIs.
 57 |     TRAIN_ROIS_PER_IMAGE = 200
 58 | 
 59 |     # Use a small epoch since the data is simple
 60 |     STEPS_PER_EPOCH = 1000
 61 | 
 62 |     # use small validation steps since the epoch is small
 63 |     VALIDATION_STEPS = 200
 64 | 
 65 |     RPN_NMS_THRESHOLD = 0.6
 66 |     POOL_SIZE = 7
 67 | 
 68 | ############################################################
 69 | #  Dataset
 70 | ############################################################
 71 | 
 72 | class FashionDataset(Dataset):
 73 |     # count - int, images in the dataset
 74 |     def initDB(self, count, start = 0):
 75 |         self.start = start
 76 | 
 77 |         all_images, classes_count, class_mapping = pickle.load(open("data.pk", "rb"))
 78 |         self.classes = {}
 79 |         # Add classes
 80 |         for k,c in class_mapping.items():
 81 |             self.add_class("Fashion",c,k)
 82 |             self.classes[c] = k
 83 | 
 84 |         for k, item in enumerate(all_images[start:count+start]):
 85 |             self.add_image(source="Fashion",image_id=k, path=item['filepath'], width=item['width'], height=item['height'], bboxes=item['bboxes'])
 86 | 
 87 |         self.rootpath = '/content/'
 88 | 
 89 |     # read image from file and get the 
 90 |     def load_image(self, image_id):
 91 |         info = self.image_info[image_id]
 92 |         # tempImg = image.img_to_array( image.load_img(info['path']) )
 93 |         tempImg = np.array(Image.open( os.path.join(self.rootpath, info['path']) ))
 94 |         return tempImg
 95 | 
 96 |     def get_keys(self, d, value):
 97 |         return [k for k,v in d.items() if v == value]
 98 | 
 99 |     def load_bbox(self, image_id):
100 |         info = self.image_info[image_id]
101 |         bboxes = []
102 |         labels = []
103 |         for item in info['bboxes']:
104 |             bboxes.append((item['y1'], item['x1'], item['y2'], item['x2']))
105 |             label_key = self.get_keys(self.classes, item['class'])
106 |             if len(label_key) == 0:
107 |                 continue
108 |             labels.extend( label_key )
109 |         return np.array(bboxes), np.array(labels)
110 | 
111 | if __name__ == '__main__':
112 |     ROOT_DIR = os.getcwd()
113 | 
114 |     config = RFCNNConfig()
115 |     dataset_train = FashionDataset()
116 |     dataset_train.initDB(100000)
117 |     dataset_train.prepare()
118 | 
119 |     # Validation dataset
120 |     dataset_val = FashionDataset()
121 |     dataset_val.initDB(5000, start=100000)
122 |     dataset_val.prepare()
123 | 
124 |     model = RFCN_Model(mode="training", config=config, model_dir=os.path.join(ROOT_DIR, "logs") )
125 | 
126 |     # This is a hack, bacause the pre-train weights are not fit with dilated ResNet
127 |     model.keras_model.load_weights("resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5", by_name=True, skip_mismatch=True)
128 | 
129 |     try:
130 |         model_path = model.find_last()[1]
131 |         if model_path is not None:
132 |             model.load_weights(model_path, by_name=True)
133 |     except Exception as e:
134 |         print(e)
135 |         print("No checkpoint founded")
136 |         
137 |     # *** This training schedule is an example. Update to your needs ***
138 | 
139 |     # Training - Stage 1
140 |     model.train(dataset_train, dataset_val,
141 |                 learning_rate=config.LEARNING_RATE,
142 |                 epochs=20,
143 |                 layers='heads')
144 | 
145 |     # Training - Stage 2
146 |     # Finetune layers from ResNet stage 4 and up
147 |     print("Fine tune Resnet stage 4 and up")
148 |     model.train(dataset_train, dataset_val,
149 |                 learning_rate=config.LEARNING_RATE,
150 |                 epochs=40,
151 |                 layers='4+')
152 | 
153 |     # Training - Stage 3
154 |     # Fine tune all layers
155 |     print("Fine tune all layers")
156 |     model.train(dataset_train, dataset_val,
157 |                 learning_rate=config.LEARNING_RATE,
158 |                 epochs=80,
159 |                 layers='all')
160 | 
161 |     # Training - Stage 3
162 |     # Fine tune all layers
163 |     print("Fine tune all layers")
164 |     model.train(dataset_train, dataset_val,
165 |                 learning_rate=config.LEARNING_RATE,
166 |                 epochs=240,
167 |                 layers='all')


--------------------------------------------------------------------------------
/KerasRFCN/Config.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | import math
  9 | import numpy as np
 10 | 
 11 | 
 12 | # Base Configuration Class
 13 | # Don't use this class directly. Instead, sub-class it and override
 14 | # the configurations you need to change.
 15 | 
 16 | class Config(object):
 17 |     """Base configuration class. For custom configurations, create a
 18 |     sub-class that inherits from this one and override properties
 19 |     that need to be changed.
 20 |     """
 21 |     # Name the configurations. For example, 'COCO', 'Experiment 3', ...etc.
 22 |     # Useful if your code needs to do things differently depending on which
 23 |     # experiment is running.
 24 |     NAME = None  # Override in sub-classes
 25 | 
 26 |     # Backbone model
 27 |     # choose one from ['resnet50', 'resnet101', 'resnet50_dilated', 'resnet101_dilated']
 28 |     BACKBONE = "resnet101"
 29 | 
 30 |     # NUMBER OF GPUs to use. For CPU training, use 1
 31 |     GPU_COUNT = 1
 32 | 
 33 |     # Number of images to train with on each GPU. A 12GB GPU can typically
 34 |     # handle 2 images of 1024x1024px.
 35 |     # Adjust based on your GPU memory and image sizes. Use the highest
 36 |     # number that your GPU can handle for best performance.
 37 |     IMAGES_PER_GPU = 2
 38 | 
 39 |     # Number of training steps per epoch
 40 |     # This doesn't need to match the size of the training set. Tensorboard
 41 |     # updates are saved at the end of each epoch, so setting this to a
 42 |     # smaller number means getting more frequent TensorBoard updates.
 43 |     # Validation stats are also calculated at each epoch end and they
 44 |     # might take a while, so don't set this too small to avoid spending
 45 |     # a lot of time on validation stats.
 46 |     STEPS_PER_EPOCH = 1000
 47 | 
 48 |     # Number of validation steps to run at the end of every training epoch.
 49 |     # A bigger number improves accuracy of validation stats, but slows
 50 |     # down the training.
 51 |     VALIDATION_STEPS = 50
 52 | 
 53 |     # The strides of each layer of the FPN Pyramid. These values
 54 |     # are based on a Resnet101 backbone.
 55 |     # Use same strides on stage 4-6 if use dilated resnet of DetNet
 56 |     # Like BACKBONE_STRIDES = [4, 8, 16, 16, 16]
 57 |     BACKBONE_STRIDES = [4, 8, 16, 32, 64]
 58 | 
 59 |     # Number of classification classes (including background)
 60 |     NUM_CLASSES = 1  # Override in sub-classes
 61 | 
 62 |     # Length of square anchor side in pixels
 63 |     RPN_ANCHOR_SCALES = (32, 64, 128, 256, 512)
 64 | 
 65 |     # Ratios of anchors at each cell (width/height)
 66 |     # A value of 1 represents a square anchor, and 0.5 is a wide anchor
 67 |     RPN_ANCHOR_RATIOS = [0.5, 1, 2]
 68 | 
 69 |     # Anchor stride
 70 |     # If 1 then anchors are created for each cell in the backbone feature map.
 71 |     # If 2, then anchors are created for every other cell, and so on.
 72 |     RPN_ANCHOR_STRIDE = 1
 73 | 
 74 |     # Non-max suppression threshold to filter RPN proposals.
 75 |     # You can reduce this during training to generate more propsals.
 76 |     RPN_NMS_THRESHOLD = 0.7
 77 | 
 78 |     # How many anchors per image to use for RPN training
 79 |     RPN_TRAIN_ANCHORS_PER_IMAGE = 256
 80 | 
 81 |     # ROIs kept after non-maximum supression (training and inference)
 82 |     POST_NMS_ROIS_TRAINING = 2000
 83 |     POST_NMS_ROIS_INFERENCE = 1000
 84 | 
 85 |     # Input image resing
 86 |     # Images are resized such that the smallest side is >= IMAGE_MIN_DIM and
 87 |     # the longest side is <= IMAGE_MAX_DIM. In case both conditions can't
 88 |     # be satisfied together the IMAGE_MAX_DIM is enforced.
 89 |     IMAGE_MIN_DIM = 800
 90 |     IMAGE_MAX_DIM = 1024
 91 |     # If True, pad images with zeros such that they're (max_dim by max_dim)
 92 |     IMAGE_PADDING = True  # currently, the False option is not supported
 93 | 
 94 |     # Image mean (RGB)
 95 |     MEAN_PIXEL = np.array([123.7, 116.8, 103.9])
 96 | 
 97 |     # Number of ROIs per image to feed to classifier/mask heads
 98 |     # The Mask RCNN paper uses 512 but often the RPN doesn't generate
 99 |     # enough positive proposals to fill this and keep a positive:negative
100 |     # ratio of 1:3. You can increase the number of proposals by adjusting
101 |     # the RPN NMS threshold.
102 |     TRAIN_ROIS_PER_IMAGE = 200
103 | 
104 |     # Percent of positive ROIs used to train classifier/mask heads
105 |     ROI_POSITIVE_RATIO = 0.33
106 | 
107 |     # Pooled ROIs
108 |     POOL_SIZE = 3
109 | 
110 |     # Maximum number of ground truth instances to use in one image
111 |     MAX_GT_INSTANCES = 100
112 | 
113 |     # Bounding box refinement standard deviation for RPN and final detections.
114 |     RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
115 |     BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
116 | 
117 |     # Max number of final detections
118 |     DETECTION_MAX_INSTANCES = 100
119 | 
120 |     # Minimum probability value to accept a detected instance
121 |     # ROIs below this threshold are skipped
122 |     DETECTION_MIN_CONFIDENCE = 0.8
123 | 
124 |     # Non-maximum suppression threshold for detection
125 |     DETECTION_NMS_THRESHOLD = 0.3
126 | 
127 |     # Learning rate and momentum
128 |     # The Mask RCNN paper uses lr=0.02, but on TensorFlow it causes
129 |     # weights to explode. Likely due to differences in optimzer
130 |     # implementation.
131 |     LEARNING_RATE = 0.001
132 |     LEARNING_MOMENTUM = 0.9
133 | 
134 |     # Weight decay regularization
135 |     WEIGHT_DECAY = 0.0005
136 | 
137 |     # Use RPN ROIs or externally generated ROIs for training
138 |     # Keep this True for most situations. Set to False if you want to train
139 |     # the head branches on ROI generated by code rather than the ROIs from
140 |     # the RPN. For example, to debug the classifier head without having to
141 |     # train the RPN.
142 |     USE_RPN_ROIS = True
143 | 
144 |     K = 3
145 | 
146 |     def __init__(self):
147 |         """Set values of computed attributes."""
148 |         # Effective batch size
149 |         self.BATCH_SIZE = self.IMAGES_PER_GPU * self.GPU_COUNT
150 | 
151 |         # Input image size
152 |         self.IMAGE_SHAPE = np.array(
153 |             [self.IMAGE_MAX_DIM, self.IMAGE_MAX_DIM, 3])
154 | 
155 |         # Compute backbone size from input image size
156 |         self.BACKBONE_SHAPES = np.array(
157 |             [[int(math.ceil(self.IMAGE_SHAPE[0] / stride)),
158 |               int(math.ceil(self.IMAGE_SHAPE[1] / stride))]
159 |              for stride in self.BACKBONE_STRIDES])
160 | 
161 |     def display(self):
162 |         """Display Configuration values."""
163 |         print("\nConfigurations:")
164 |         for a in dir(self):
165 |             if not a.startswith("__") and not callable(getattr(self, a)):
166 |                 print("{:30} {}".format(a, getattr(self, a)))
167 |         print("\n")
168 | 


--------------------------------------------------------------------------------
/KerasRFCN/Data_generator.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | import numpy as np
  9 | import KerasRFCN.Utils
 10 | import logging
 11 | ############################################################
 12 | #  Data Generator
 13 | ############################################################
 14 | 
 15 | def load_image_gt(dataset, config, image_id, augment=False):
 16 |     """Load and return ground truth data for an image (image, mask, bounding boxes).
 17 | 
 18 |     augment: If true, apply random image augmentation. Currently, only
 19 |         horizontal flipping is offered.
 20 | 
 21 |     Returns:
 22 |     image: [height, width, 3]
 23 |     shape: the original shape of the image before resizing and cropping.
 24 |     class_ids: [instance_count] Integer class IDs
 25 |     bbox: [instance_count, (y1, x1, y2, x2)]
 26 |     """
 27 |     # Load image and mask
 28 |     image = dataset.load_image(image_id)
 29 |     # bbox: [num_instances, (y1, x1, y2, x2)]
 30 |     bboxes, class_ids = dataset.load_bbox(image_id)
 31 |     shape = image.shape
 32 |     image, window, scale, padding = KerasRFCN.Utils.resize_image(
 33 |         image,
 34 |         min_dim=config.IMAGE_MIN_DIM,
 35 |         max_dim=config.IMAGE_MAX_DIM,
 36 |         padding=config.IMAGE_PADDING)
 37 |     bboxes = KerasRFCN.Utils.resize_bbox(bboxes, scale, padding)
 38 |     # img_h, img_w, img_c = image.shape
 39 | 
 40 |     # Random horizontal flips.
 41 |     # TODO: data-augment:fliplr the bbox coordinate
 42 |     # if augment:
 43 |     #     if random.randint(0, 1):
 44 |     #         image = np.fliplr(image)
 45 |     #         bboxes = bbox_fliplr(bboxes, height, width)
 46 | 
 47 |     # Bounding boxes. Note that some boxes might be all zeros
 48 |     
 49 |     # Active classes
 50 |     # Different datasets have different classes, so track the
 51 |     # classes supported in the dataset of this image.
 52 |     active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32)
 53 |     source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]]
 54 |     active_class_ids[source_class_ids] = 1
 55 | 
 56 |     # Image meta data
 57 |     image_meta = compose_image_meta(image_id, shape, window, active_class_ids)
 58 | 
 59 |     return image, image_meta, class_ids, bboxes
 60 | 
 61 | 
 62 | def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, config):
 63 |     """Generate targets for training Stage 2 classifier.
 64 |     This is not used in normal training. It's useful for debugging or to train
 65 |     the Mask RCNN heads without using the RPN head.
 66 | 
 67 |     Inputs:
 68 |     rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
 69 |     gt_class_ids: [instance count] Integer class IDs
 70 |     gt_boxes: [instance count, (y1, x1, y2, x2)]
 71 | 
 72 |     Returns:
 73 |     rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
 74 |     class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
 75 |     bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific
 76 |             bbox refinments.
 77 |     """
 78 |     assert rpn_rois.shape[0] > 0
 79 |     assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
 80 |         gt_class_ids.dtype)
 81 |     assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
 82 |         gt_boxes.dtype)
 83 | 
 84 |     # It's common to add GT Boxes to ROIs but we don't do that here because
 85 |     # according to XinLei Chen's paper, it doesn't help.
 86 | 
 87 |     # Trim empty padding in gt_boxes and gt_masks parts
 88 |     instance_ids = np.where(gt_class_ids > 0)[0]
 89 |     assert instance_ids.shape[0] > 0, "Image must contain instances."
 90 |     gt_class_ids = gt_class_ids[instance_ids]
 91 |     gt_boxes = gt_boxes[instance_ids]
 92 | 
 93 |     # Compute areas of ROIs and ground truth boxes.
 94 |     rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * \
 95 |         (rpn_rois[:, 3] - rpn_rois[:, 1])
 96 |     gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * \
 97 |         (gt_boxes[:, 3] - gt_boxes[:, 1])
 98 | 
 99 |     # Compute overlaps [rpn_rois, gt_boxes]
100 |     overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
101 |     for i in range(overlaps.shape[1]):
102 |         gt = gt_boxes[i]
103 |         overlaps[:, i] = KerasRFCN.Utils.compute_iou(
104 |             gt, rpn_rois, gt_box_area[i], rpn_roi_area)
105 | 
106 |     # Assign ROIs to GT boxes
107 |     rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
108 |     rpn_roi_iou_max = overlaps[np.arange(
109 |         overlaps.shape[0]), rpn_roi_iou_argmax]
110 |     # GT box assigned to each ROI
111 |     rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
112 |     rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]
113 | 
114 |     # Positive ROIs are those with >= 0.5 IoU with a GT box.
115 |     fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]
116 | 
117 |     # Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
118 |     # TODO: To hard example mine or not to hard example mine, that's the question
119 | #     bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
120 |     bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
121 | 
122 |     # Subsample ROIs. Aim for 33% foreground.
123 |     # FG
124 |     fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)
125 |     if fg_ids.shape[0] > fg_roi_count:
126 |         keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
127 |     else:
128 |         keep_fg_ids = fg_ids
129 |     # BG
130 |     remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0]
131 |     if bg_ids.shape[0] > remaining:
132 |         keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
133 |     else:
134 |         keep_bg_ids = bg_ids
135 |     # Combine indicies of ROIs to keep
136 |     keep = np.concatenate([keep_fg_ids, keep_bg_ids])
137 |     # Need more?
138 |     remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0]
139 |     if remaining > 0:
140 |         # Looks like we don't have enough samples to maintain the desired
141 |         # balance. Reduce requirements and fill in the rest. This is
142 |         # likely different from the Mask RCNN paper.
143 | 
144 |         # There is a small chance we have neither fg nor bg samples.
145 |         if keep.shape[0] == 0:
146 |             # Pick bg regions with easier IoU threshold
147 |             bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
148 |             assert bg_ids.shape[0] >= remaining
149 |             keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
150 |             assert keep_bg_ids.shape[0] == remaining
151 |             keep = np.concatenate([keep, keep_bg_ids])
152 |         else:
153 |             # Fill the rest with repeated bg rois.
154 |             keep_extra_ids = np.random.choice(
155 |                 keep_bg_ids, remaining, replace=True)
156 |             keep = np.concatenate([keep, keep_extra_ids])
157 |     assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE, \
158 |         "keep doesn't match ROI batch size {}, {}".format(
159 |             keep.shape[0], config.TRAIN_ROIS_PER_IMAGE)
160 | 
161 |     # Reset the gt boxes assigned to BG ROIs.
162 |     rpn_roi_gt_boxes[keep_bg_ids, :] = 0
163 |     rpn_roi_gt_class_ids[keep_bg_ids] = 0
164 | 
165 |     # For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
166 |     rois = rpn_rois[keep]
167 |     roi_gt_boxes = rpn_roi_gt_boxes[keep]
168 |     roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
169 |     roi_gt_assignment = rpn_roi_iou_argmax[keep]
170 | 
171 |     # Class-aware bbox deltas. [y, x, log(h), log(w)]
172 |     bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE,
173 |                        config.NUM_CLASSES, 4), dtype=np.float32)
174 |     pos_ids = np.where(roi_gt_class_ids > 0)[0]
175 |     bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = KerasRFCN.Utils.box_refinement(
176 |         rois[pos_ids], roi_gt_boxes[pos_ids, :4])
177 |     # Normalize bbox refinments
178 |     bboxes /= config.BBOX_STD_DEV
179 | 
180 |     return rois, roi_gt_class_ids, bboxes
181 | 
182 | 
183 | def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config):
184 |     """Given the anchors and GT boxes, compute overlaps and identify positive
185 |     anchors and deltas to refine them to match their corresponding GT boxes.
186 | 
187 |     anchors: [num_anchors, (y1, x1, y2, x2)]
188 |     gt_class_ids: [num_gt_boxes] Integer class IDs.
189 |     gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
190 | 
191 |     Returns:
192 |     rpn_match: [N] (int32) matches between anchors and GT boxes.
193 |                1 = positive anchor, -1 = negative anchor, 0 = neutral
194 |     rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
195 |     """
196 |     # RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
197 |     rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
198 |     # RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
199 |     rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))
200 | 
201 |     # Handle COCO crowds
202 |     # A crowd box in COCO is a bounding box around several instances. Exclude
203 |     # them from training. A crowd box is given a negative class ID.
204 |     crowd_ix = np.where(gt_class_ids < 0)[0]
205 |     if crowd_ix.shape[0] > 0:
206 |         # Filter out crowds from ground truth class IDs and boxes
207 |         non_crowd_ix = np.where(gt_class_ids > 0)[0]
208 |         crowd_boxes = gt_boxes[crowd_ix]
209 |         gt_class_ids = gt_class_ids[non_crowd_ix]
210 |         gt_boxes = gt_boxes[non_crowd_ix]
211 |         # Compute overlaps with crowd boxes [anchors, crowds]
212 |         crowd_overlaps = KerasRFCN.Utils.compute_overlaps(anchors, crowd_boxes)
213 |         crowd_iou_max = np.amax(crowd_overlaps, axis=1)
214 |         no_crowd_bool = (crowd_iou_max < 0.001)
215 |     else:
216 |         # All anchors don't intersect a crowd
217 |         no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)
218 | 
219 |     # Compute overlaps [num_anchors, num_gt_boxes]
220 |     overlaps = KerasRFCN.Utils.compute_overlaps(anchors, gt_boxes)
221 | 
222 |     # Match anchors to GT Boxes
223 |     # If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
224 |     # If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
225 |     # Neutral anchors are those that don't match the conditions above,
226 |     # and they don't influence the loss function.
227 |     # However, don't keep any GT box unmatched (rare, but happens). Instead,
228 |     # match it to the closest anchor (even if its max IoU is < 0.3).
229 |     #
230 |     # 1. Set negative anchors first. They get overwritten below if a GT box is
231 |     # matched to them. Skip boxes in crowd areas.
232 |     anchor_iou_argmax = np.argmax(overlaps, axis=1)
233 |     anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
234 |     rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
235 |     # 2. Set an anchor for each GT box (regardless of IoU value).
236 |     # TODO: If multiple anchors have the same IoU match all of them
237 |     gt_iou_argmax = np.argmax(overlaps, axis=0)
238 |     rpn_match[gt_iou_argmax] = 1
239 |     # 3. Set anchors with high overlap as positive.
240 |     rpn_match[anchor_iou_max >= 0.7] = 1
241 | 
242 |     # Subsample to balance positive and negative anchors
243 |     # Don't let positives be more than half the anchors
244 |     ids = np.where(rpn_match == 1)[0]
245 |     extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
246 |     if extra > 0:
247 |         # Reset the extra ones to neutral
248 |         ids = np.random.choice(ids, extra, replace=False)
249 |         rpn_match[ids] = 0
250 |     # Same for negative proposals
251 |     ids = np.where(rpn_match == -1)[0]
252 |     extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
253 |                         np.sum(rpn_match == 1))
254 |     if extra > 0:
255 |         # Rest the extra ones to neutral
256 |         ids = np.random.choice(ids, extra, replace=False)
257 |         rpn_match[ids] = 0
258 | 
259 |     # For positive anchors, compute shift and scale needed to transform them
260 |     # to match the corresponding GT boxes.
261 |     ids = np.where(rpn_match == 1)[0]
262 |     ix = 0  # index into rpn_bbox
263 |     # TODO: use box_refinment() rather than duplicating the code here
264 |     for i, a in zip(ids, anchors[ids]):
265 |         # Closest gt box (it might have IoU < 0.7)
266 |         gt = gt_boxes[anchor_iou_argmax[i]]
267 | 
268 |         # Convert coordinates to center plus width/height.
269 |         # GT Box
270 |         gt_h = gt[2] - gt[0]
271 |         gt_w = gt[3] - gt[1]
272 |         gt_center_y = gt[0] + 0.5 * gt_h
273 |         gt_center_x = gt[1] + 0.5 * gt_w
274 |         # Anchor
275 |         a_h = a[2] - a[0]
276 |         a_w = a[3] - a[1]
277 |         a_center_y = a[0] + 0.5 * a_h
278 |         a_center_x = a[1] + 0.5 * a_w
279 | 
280 |         # Compute the bbox refinement that the RPN should predict.
281 |         rpn_bbox[ix] = [
282 |             (gt_center_y - a_center_y) / a_h,
283 |             (gt_center_x - a_center_x) / a_w,
284 |             np.log(gt_h / a_h),
285 |             np.log(gt_w / a_w),
286 |         ]
287 |         # Normalize
288 |         rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
289 |         ix += 1
290 | 
291 |     return rpn_match, rpn_bbox
292 | 
293 | 
294 | def generate_random_rois(image_shape, count, gt_class_ids, gt_boxes):
295 |     """Generates ROI proposals similar to what a region proposal network
296 |     would generate.
297 | 
298 |     image_shape: [Height, Width, Depth]
299 |     count: Number of ROIs to generate
300 |     gt_class_ids: [N] Integer ground truth class IDs
301 |     gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
302 | 
303 |     Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels.
304 |     """
305 |     # placeholder
306 |     rois = np.zeros((count, 4), dtype=np.int32)
307 | 
308 |     # Generate random ROIs around GT boxes (90% of count)
309 |     rois_per_box = int(0.9 * count / gt_boxes.shape[0])
310 |     for i in range(gt_boxes.shape[0]):
311 |         gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
312 |         h = gt_y2 - gt_y1
313 |         w = gt_x2 - gt_x1
314 |         # random boundaries
315 |         r_y1 = max(gt_y1 - h, 0)
316 |         r_y2 = min(gt_y2 + h, image_shape[0])
317 |         r_x1 = max(gt_x1 - w, 0)
318 |         r_x2 = min(gt_x2 + w, image_shape[1])
319 | 
320 |         # To avoid generating boxes with zero area, we generate double what
321 |         # we need and filter out the extra. If we get fewer valid boxes
322 |         # than we need, we loop and try again.
323 |         while True:
324 |             y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
325 |             x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
326 |             # Filter out zero area boxes
327 |             threshold = 1
328 |             y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
329 |                         threshold][:rois_per_box]
330 |             x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
331 |                         threshold][:rois_per_box]
332 |             if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
333 |                 break
334 | 
335 |         # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
336 |         # into x1, y1, x2, y2 order
337 |         x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
338 |         y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
339 |         box_rois = np.hstack([y1, x1, y2, x2])
340 |         rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois
341 | 
342 |     # Generate random ROIs anywhere in the image (10% of count)
343 |     remaining_count = count - (rois_per_box * gt_boxes.shape[0])
344 |     # To avoid generating boxes with zero area, we generate double what
345 |     # we need and filter out the extra. If we get fewer valid boxes
346 |     # than we need, we loop and try again.
347 |     while True:
348 |         y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
349 |         x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))
350 |         # Filter out zero area boxes
351 |         threshold = 1
352 |         y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
353 |                     threshold][:remaining_count]
354 |         x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
355 |                     threshold][:remaining_count]
356 |         if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
357 |             break
358 | 
359 |     # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
360 |     # into x1, y1, x2, y2 order
361 |     x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
362 |     y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
363 |     global_rois = np.hstack([y1, x1, y2, x2])
364 |     rois[-remaining_count:] = global_rois
365 |     return rois
366 | 
367 | 
368 | def data_generator(dataset, config, shuffle=True, augment=True, random_rois=0,
369 |                    batch_size=1, detection_targets=False):
370 |     """A generator that returns images and corresponding target class ids,
371 |     bounding box deltas.
372 | 
373 |     dataset: The Dataset object to pick data from
374 |     config: The model config object
375 |     shuffle: If True, shuffles the samples before every epoch
376 |     augment: If True, applies image augmentation to images (currently only
377 |              horizontal flips are supported)
378 |     random_rois: If > 0 then generate proposals to be used to train the
379 |                  network classifier. Useful if training
380 |                  the Mask RCNN part without the RPN.
381 |     batch_size: How many images to return in each call
382 |     detection_targets: If True, generate detection targets (class IDs, bbox
383 |         deltas). Typically for debugging or visualizations because
384 |         in trainig detection targets are generated by DetectionTargetLayer.
385 | 
386 |     Returns a Python generator. Upon calling next() on it, the
387 |     generator returns two lists, inputs and outputs. The containtes
388 |     of the lists differs depending on the received arguments:
389 |     inputs list:
390 |     - images: [batch, H, W, C]
391 |     - image_meta: [batch, size of image meta]
392 |     - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
393 |     - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
394 |     - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
395 |     - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
396 | 
397 |     outputs list: Usually empty in regular training. But if detection_targets
398 |         is True then the outputs list contains target class_ids, bbox deltas.
399 |     """
400 |     b = 0  # batch item index
401 |     image_index = -1
402 |     image_ids = np.copy(dataset.image_ids)
403 |     error_count = 0
404 | 
405 |     # Anchors
406 |     # [anchor_count, (y1, x1, y2, x2)]
407 |     anchors = KerasRFCN.Utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
408 |                                              config.RPN_ANCHOR_RATIOS,
409 |                                              config.BACKBONE_SHAPES,
410 |                                              config.BACKBONE_STRIDES,
411 |                                              config.RPN_ANCHOR_STRIDE)
412 | 
413 |     # Keras requires a generator to run indefinately.
414 |     while True:
415 |         try:
416 |             # Increment index to pick next image. Shuffle if at the start of an epoch.
417 |             image_index = (image_index + 1) % len(image_ids)
418 |             if shuffle and image_index == 0:
419 |                 np.random.shuffle(image_ids)
420 | 
421 |             # Get GT bounding boxes for image.
422 |             image_id = image_ids[image_index]
423 |             image, image_meta, gt_class_ids, gt_boxes = \
424 |                 load_image_gt(dataset, config, image_id, augment=augment)
425 | 
426 |             # Skip images that have no instances. This can happen in cases
427 |             # where we train on a subset of classes and the image doesn't
428 |             # have any of the classes we care about.
429 |             if not np.any(gt_class_ids > 0):
430 |                 continue
431 | 
432 |             # RPN Targets
433 |             rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors,
434 |                                                     gt_class_ids, gt_boxes, config)
435 | 
436 |             # Mask R-CNN Targets
437 |             if random_rois:
438 |                 rpn_rois = generate_random_rois(
439 |                     image.shape, random_rois, gt_class_ids, gt_boxes)
440 |                 if detection_targets:
441 |                     rois, mrcnn_class_ids, mrcnn_bbox =\
442 |                         build_detection_targets(
443 |                             rpn_rois, gt_class_ids, gt_boxes, config)
444 | 
445 |             # Init batch arrays
446 |             if b == 0:
447 |                 batch_image_meta = np.zeros(
448 |                     (batch_size,) + image_meta.shape, dtype=image_meta.dtype)
449 |                 batch_rpn_match = np.zeros(
450 |                     [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype)
451 |                 batch_rpn_bbox = np.zeros(
452 |                     [batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype)
453 |                 batch_images = np.zeros(
454 |                     (batch_size,) + image.shape, dtype=np.float32)
455 |                 batch_gt_class_ids = np.zeros(
456 |                     (batch_size, config.MAX_GT_INSTANCES), dtype=np.int32)
457 |                 batch_gt_boxes = np.zeros(
458 |                     (batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32)
459 | 
460 |                 if random_rois:
461 |                     batch_rpn_rois = np.zeros(
462 |                         (batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype)
463 |                     if detection_targets:
464 |                         batch_rois = np.zeros(
465 |                             (batch_size,) + rois.shape, dtype=rois.dtype)
466 |                         batch_mrcnn_class_ids = np.zeros(
467 |                             (batch_size,) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype)
468 |                         batch_mrcnn_bbox = np.zeros(
469 |                             (batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype)
470 | 
471 |             # If more instances than fits in the array, sub-sample from them.
472 |             if gt_boxes.shape[0] > config.MAX_GT_INSTANCES:
473 |                 ids = np.random.choice(
474 |                     np.arange(gt_boxes.shape[0]), config.MAX_GT_INSTANCES, replace=False)
475 |                 gt_class_ids = gt_class_ids[ids]
476 |                 gt_boxes = gt_boxes[ids]
477 | 
478 |             # Add to batch
479 |             batch_image_meta[b] = image_meta
480 |             batch_rpn_match[b] = rpn_match[:, np.newaxis]
481 |             batch_rpn_bbox[b] = rpn_bbox
482 |             batch_images[b] = mold_image(image.astype(np.float32), config)
483 |             batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids
484 |             batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes
485 |             if random_rois:
486 |                 batch_rpn_rois[b] = rpn_rois
487 |                 if detection_targets:
488 |                     batch_rois[b] = rois
489 |                     batch_mrcnn_class_ids[b] = mrcnn_class_ids
490 |                     batch_mrcnn_bbox[b] = mrcnn_bbox
491 |             b += 1
492 | 
493 |             # Batch full?
494 |             if b >= batch_size:
495 |                 inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox,
496 |                           batch_gt_class_ids, batch_gt_boxes]
497 |                 outputs = []
498 | 
499 |                 if random_rois:
500 |                     inputs.extend([batch_rpn_rois])
501 |                     if detection_targets:
502 |                         inputs.extend([batch_rois])
503 |                         # Keras requires that output and targets have the same number of dimensions
504 |                         batch_mrcnn_class_ids = np.expand_dims(
505 |                             batch_mrcnn_class_ids, -1)
506 |                         outputs.extend(
507 |                             [batch_mrcnn_class_ids, batch_mrcnn_bbox])
508 | 
509 |                 yield inputs, outputs
510 | 
511 |                 # start a new batch
512 |                 b = 0
513 |         except (GeneratorExit, KeyboardInterrupt):
514 |             raise
515 |         except:
516 |             # Log it and skip the image
517 |             logging.exception("Error processing image {}".format(
518 |                 dataset.image_info[image_id]))
519 |             error_count += 1
520 |             if error_count > 5:
521 |                 raise
522 | 
523 | def compose_image_meta(image_id, image_shape, window, active_class_ids):
524 |     """Takes attributes of an image and puts them in one 1D array. Use
525 |     parse_image_meta() to parse the values back.
526 | 
527 |     image_id: An int ID of the image. Useful for debugging.
528 |     image_shape: [height, width, channels]
529 |     window: (y1, x1, y2, x2) in pixels. The area of the image where the real
530 |             image is (excluding the padding)
531 |     active_class_ids: List of class_ids available in the dataset from which
532 |         the image came. Useful if training on images from multiple datasets
533 |         where not all classes are present in all datasets.
534 |     """
535 |     meta = np.array(
536 |         [image_id] +            # size=1
537 |         list(image_shape) +     # size=3
538 |         list(window) +          # size=4 (y1, x1, y2, x2) in image cooredinates
539 |         list(active_class_ids)  # size=num_classes
540 |     )
541 |     return meta
542 | 
543 | def mold_image(images, config):
544 |     """Takes RGB images with 0-255 values and subtraces
545 |     the mean pixel and converts it to float. Expects image
546 |     colors in RGB order.
547 |     """
548 |     return images.astype(np.float32) - config.MEAN_PIXEL


--------------------------------------------------------------------------------
/KerasRFCN/Losses.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | import tensorflow as tf
  9 | import keras.backend as K
 10 | ############################################################
 11 | #  Loss Functions
 12 | ############################################################
 13 | 
 14 | def smooth_l1_loss(y_true, y_pred):
 15 |     """Implements Smooth-L1 loss.
 16 |     y_true and y_pred are typicallly: [N, 4], but could be any shape.
 17 |     """
 18 |     diff = K.abs(y_true - y_pred)
 19 |     less_than_one = K.cast(K.less(diff, 1.0), "float32")
 20 |     loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
 21 |     return loss
 22 | 
 23 | 
 24 | def rpn_class_loss_graph(rpn_match, rpn_class_logits):
 25 |     """RPN anchor classifier loss.
 26 | 
 27 |     rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
 28 |                -1=negative, 0=neutral anchor.
 29 |     rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for FG/BG.
 30 |     """
 31 |     # Squeeze last dim to simplify
 32 |     rpn_match = tf.squeeze(rpn_match, -1)
 33 |     # Get anchor classes. Convert the -1/+1 match to 0/1 values.
 34 |     anchor_class = K.cast(K.equal(rpn_match, 1), tf.int32)
 35 |     # Positive and Negative anchors contribute to the loss,
 36 |     # but neutral anchors (match value = 0) don't.
 37 |     indices = tf.where(K.not_equal(rpn_match, 0))
 38 |     # Pick rows that contribute to the loss and filter out the rest.
 39 |     rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)
 40 |     anchor_class = tf.gather_nd(anchor_class, indices)
 41 |     # Crossentropy loss
 42 |     loss = K.sparse_categorical_crossentropy(target=anchor_class,
 43 |                                              output=rpn_class_logits,
 44 |                                              from_logits=True)
 45 |     loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
 46 |     return loss
 47 | 
 48 | 
 49 | def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox):
 50 |     """Return the RPN bounding box loss graph.
 51 | 
 52 |     config: the model config object.
 53 |     target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
 54 |         Uses 0 padding to fill in unsed bbox deltas.
 55 |     rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
 56 |                -1=negative, 0=neutral anchor.
 57 |     rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
 58 |     """
 59 |     # Positive anchors contribute to the loss, but negative and
 60 |     # neutral anchors (match value of 0 or -1) don't.
 61 |     rpn_match = K.squeeze(rpn_match, -1)
 62 |     indices = tf.where(K.equal(rpn_match, 1))
 63 | 
 64 |     # Pick bbox deltas that contribute to the loss
 65 |     rpn_bbox = tf.gather_nd(rpn_bbox, indices)
 66 | 
 67 |     # Trim target bounding box deltas to the same length as rpn_bbox.
 68 |     batch_counts = K.sum(K.cast(K.equal(rpn_match, 1), tf.int32), axis=1)
 69 |     target_bbox = batch_pack_graph(target_bbox, batch_counts,
 70 |                                    config.IMAGES_PER_GPU)
 71 | 
 72 |     # TODO: use smooth_l1_loss() rather than reimplementing here
 73 |     #       to reduce code duplication
 74 |     diff = K.abs(target_bbox - rpn_bbox)
 75 |     less_than_one = K.cast(K.less(diff, 1.0), "float32")
 76 |     loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
 77 | 
 78 |     loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
 79 |     return loss
 80 | 
 81 | 
 82 | def mrcnn_class_loss_graph(target_class_ids, pred_class_logits,
 83 |                            active_class_ids):
 84 |     """Loss for the classifier head of Mask RCNN.
 85 | 
 86 |     target_class_ids: [batch, num_rois]. Integer class IDs. Uses zero
 87 |         padding to fill in the array.
 88 |     pred_class_logits: [batch, num_rois, num_classes]
 89 |     active_class_ids: [batch, num_classes]. Has a value of 1 for
 90 |         classes that are in the dataset of the image, and 0
 91 |         for classes that are not in the dataset.
 92 |     """
 93 |     target_class_ids = tf.cast(target_class_ids, 'int64')
 94 | 
 95 |     # Find predictions of classes that are not in the dataset.
 96 |     pred_class_ids = tf.argmax(pred_class_logits, axis=2)
 97 |     # TODO: Update this line to work with batch > 1. Right now it assumes all
 98 |     #       images in a batch have the same active_class_ids
 99 |     pred_active = tf.gather(active_class_ids[0], pred_class_ids)
100 | 
101 |     # Loss
102 |     loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
103 |         labels=target_class_ids, logits=pred_class_logits)
104 | 
105 |     # Erase losses of predictions of classes that are not in the active
106 |     # classes of the image.
107 |     loss = loss * pred_active
108 | 
109 |     # Computer loss mean. Use only predictions that contribute
110 |     # to the loss to get a correct mean.
111 |     loss = tf.reduce_sum(loss) / tf.reduce_sum(pred_active)
112 |     return loss
113 | 
114 | 
115 | def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox):
116 |     """Loss for Mask R-CNN bounding box refinement.
117 | 
118 |     target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
119 |     target_class_ids: [batch, num_rois]. Integer class IDs.
120 |     pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]
121 |     """
122 |     # Reshape to merge batch and roi dimensions for simplicity.
123 |     target_class_ids = K.reshape(target_class_ids, (-1,))
124 |     target_bbox = K.reshape(target_bbox, (-1, 4))
125 |     pred_bbox = K.reshape(pred_bbox, (-1, K.int_shape(pred_bbox)[2], 4))
126 | 
127 |     # Only positive ROIs contribute to the loss. And only
128 |     # the right class_id of each ROI. Get their indicies.
129 |     positive_roi_ix = tf.where(target_class_ids > 0)[:, 0]
130 |     positive_roi_class_ids = tf.cast(
131 |         tf.gather(target_class_ids, positive_roi_ix), tf.int64)
132 |     indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1)
133 | 
134 |     # Gather the deltas (predicted and true) that contribute to loss
135 |     target_bbox = tf.gather(target_bbox, positive_roi_ix)
136 |     pred_bbox = tf.gather_nd(pred_bbox, indices)
137 | 
138 |     # Smooth-L1 Loss
139 |     loss = K.switch(tf.size(target_bbox) > 0,
140 |                     smooth_l1_loss(y_true=target_bbox, y_pred=pred_bbox),
141 |                     tf.constant(0.0))
142 |     loss = K.mean(loss)
143 |     loss = K.reshape(loss, [1, 1])
144 |     return loss
145 | 
146 | def batch_pack_graph(x, counts, num_rows):
147 |     """Picks different number of values from each row
148 |     in x depending on the values in counts.
149 |     """
150 |     outputs = []
151 |     for i in range(num_rows):
152 |         outputs.append(x[i, :counts[i]])
153 |     return tf.concat(outputs, axis=0)


--------------------------------------------------------------------------------
/KerasRFCN/Model/BaseModel.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | '''
  9 | This is base class of RFCN Model
 10 | Contain some functions like load_weights、find_last...etc
 11 | '''
 12 | 
 13 | import re
 14 | import keras
 15 | import tensorflow as tf
 16 | import datetime
 17 | from KerasRFCN.Data_generator import data_generator
 18 | import os
 19 | import KerasRFCN.Utils
 20 | import numpy as np
 21 | 
 22 | class BaseModel(object):
 23 |     """docstring for BaseModel"""
 24 |     def __init__(self, arg):
 25 |         super(BaseModel, self).__init__()
 26 |         self.arg = arg    
 27 | 
 28 |     def find_last(self):
 29 |         """Finds the last checkpoint file of the last trained model in the
 30 |         model directory.
 31 |         Returns:
 32 |             log_dir: The directory where events and weights are saved
 33 |             checkpoint_path: the path to the last checkpoint file
 34 |         """
 35 |         # Get directory names. Each directory corresponds to a model
 36 |         dir_names = next(os.walk(self.model_dir))[1]
 37 |         key = self.config.NAME.lower()
 38 |         dir_names = filter(lambda f: f.startswith(key), dir_names)
 39 |         dir_names = sorted(dir_names)
 40 |         if not dir_names:
 41 |             return None, None
 42 |         # Pick last directory
 43 |         dir_name = os.path.join(self.model_dir, dir_names[-1])
 44 |         # Find the last checkpoint
 45 |         checkpoints = next(os.walk(dir_name))[2]
 46 |         checkpoints = filter(lambda f: f.startswith("Keras-RFCN"), checkpoints)
 47 |         checkpoints = sorted(checkpoints)
 48 |         if not checkpoints:
 49 |             return dir_name, None
 50 |         checkpoint = os.path.join(dir_name, checkpoints[-1])
 51 |         return dir_name, checkpoint
 52 | 
 53 |     def load_weights(self, filepath, by_name=False, exclude=None):
 54 |         """Modified version of the correspoding Keras function with
 55 |         the addition of multi-GPU support and the ability to exclude
 56 |         some layers from loading.
 57 |         exlude: list of layer names to excluce
 58 |         """
 59 |         import h5py
 60 |         # Keras 2.2 use saving
 61 |         try:
 62 |             from keras.engine import saving
 63 |         except ImportError:
 64 |             # Keras before 2.2 used the 'topology' namespace.
 65 |             from keras.engine import topology as saving
 66 | 
 67 |         if exclude:
 68 |             by_name = True
 69 | 
 70 |         if h5py is None:
 71 |             raise ImportError('`load_weights` requires h5py.')
 72 |         f = h5py.File(filepath, mode='r')
 73 |         if 'layer_names' not in f.attrs and 'model_weights' in f:
 74 |             f = f['model_weights']
 75 | 
 76 |         # In multi-GPU training, we wrap the model. Get layers
 77 |         # of the inner model because they have the weights.
 78 |         keras_model = self.keras_model
 79 |         layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\
 80 |             else keras_model.layers
 81 | 
 82 |         # Exclude some layers
 83 |         if exclude:
 84 |             layers = filter(lambda l: l.name not in exclude, layers)
 85 | 
 86 |         if by_name:
 87 |             saving.load_weights_from_hdf5_group_by_name(f, layers)
 88 |         else:
 89 |             saving.load_weights_from_hdf5_group(f, layers)
 90 |         if hasattr(f, 'close'):
 91 |             f.close()
 92 | 
 93 |         # Update the log directory
 94 |         self.set_log_dir(filepath)
 95 | 
 96 |     def get_imagenet_weights(self):
 97 |         """Downloads ImageNet trained weights from Keras.
 98 |         Returns path to weights file.
 99 |         """
100 |         from keras.utils.data_utils import get_file
101 |         TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/'\
102 |                                  'releases/download/v0.2/'\
103 |                                  'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
104 |         weights_path = get_file('resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
105 |                                 TF_WEIGHTS_PATH_NO_TOP,
106 |                                 cache_subdir='models',
107 |                                 md5_hash='a268eb855778b3df3c7506639542a6af')
108 |         return weights_path
109 | 
110 |     def compile(self, learning_rate, momentum):
111 |         """Gets the model ready for training. Adds losses, regularization, and
112 |         metrics. Then calls the Keras compile() function.
113 |         """
114 |         # Optimizer object
115 |         optimizer = keras.optimizers.SGD(lr=learning_rate, momentum=momentum, clipnorm=5.0)
116 |         # Add Losses
117 |         # First, clear previously set losses to avoid duplication
118 |         self.keras_model._losses = []
119 |         self.keras_model._per_input_losses = {}
120 |         loss_names = ["rpn_class_loss", "rpn_bbox_loss",
121 |                       "mrcnn_class_loss", "mrcnn_bbox_loss"]
122 |         for name in loss_names:
123 |             layer = self.keras_model.get_layer(name)
124 |             if layer.output in self.keras_model.losses:
125 |                 continue
126 |             self.keras_model.add_loss(
127 |                 tf.reduce_mean(layer.output, keepdims=True))
128 | 
129 |         # Add L2 Regularization
130 |         # Skip gamma and beta weights of batch normalization layers.
131 |         reg_losses = [keras.regularizers.l2(self.config.WEIGHT_DECAY)(w) / tf.cast(tf.size(w), tf.float32)
132 |                       for w in self.keras_model.trainable_weights
133 |                       if 'gamma' not in w.name and 'beta' not in w.name]
134 |         self.keras_model.add_loss(tf.add_n(reg_losses))
135 | 
136 |         # Compile
137 |         self.keras_model.compile(optimizer=optimizer, loss=[
138 |                                  None] * len(self.keras_model.outputs))
139 | 
140 |         # Add metrics for losses
141 |         for name in loss_names:
142 |             if name in self.keras_model.metrics_names:
143 |                 continue
144 |             layer = self.keras_model.get_layer(name)
145 |             self.keras_model.metrics_names.append(name)
146 |             self.keras_model.metrics_tensors.append(tf.reduce_mean(
147 |                 layer.output, keepdims=True))
148 | 
149 |     def set_trainable(self, layer_regex, keras_model=None, indent=0, verbose=1):
150 |         """Sets model layers as trainable if their names match
151 |         the given regular expression.
152 |         """
153 |         # Print message on the first call (but not on recursive calls)
154 |         if verbose > 0 and keras_model is None:
155 |             print("Selecting layers to train")
156 | 
157 |         keras_model = keras_model or self.keras_model
158 | 
159 |         # In multi-GPU training, we wrap the model. Get layers
160 |         # of the inner model because they have the weights.
161 |         layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\
162 |             else keras_model.layers
163 | 
164 |         for layer in layers:
165 |             # Is the layer a model?
166 |             if layer.__class__.__name__ == 'Model':
167 |                 print("In model: ", layer.name)
168 |                 self.set_trainable(
169 |                     layer_regex, keras_model=layer, indent=indent + 4)
170 |                 continue
171 | 
172 |             if not layer.weights:
173 |                 continue
174 |             # Is it trainable?
175 |             trainable = bool(re.fullmatch(layer_regex, layer.name))
176 |             # Update layer. If layer is a container, update inner layer.
177 |             if layer.__class__.__name__ == 'TimeDistributed':
178 |                 layer.layer.trainable = trainable
179 |             else:
180 |                 layer.trainable = trainable
181 |             # Print trainble layer names
182 |             if trainable and verbose > 0:
183 |                 print("{}{:20}   ({})".format(" " * indent, layer.name,
184 |                                             layer.__class__.__name__))
185 | 
186 |     def set_log_dir(self, model_path=None):
187 |         """Sets the model log directory and epoch counter.
188 | 
189 |         model_path: If None, or a format different from what this code uses
190 |             then set a new log directory and start epochs from 0. Otherwise,
191 |             extract the log directory and the epoch counter from the file
192 |             name.
193 |         """
194 |         # Set date and epoch counter as if starting a new model
195 |         self.epoch = 0
196 |         now = datetime.datetime.now()
197 | 
198 |         # If we have a model path with date and epochs use them
199 |         if model_path:
200 |             # Continue from we left of. Get epoch and date from the file name
201 |             # A sample model path might look like:
202 |             regex = r".*/\w+(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})/Keras-RFCN\_\w+(\d{4})\.h5"
203 |             m = re.match(regex, model_path)
204 |             if m:
205 |                 now = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
206 |                                         int(m.group(4)), int(m.group(5)))
207 |                 self.epoch = int(m.group(6)) + 1
208 | 
209 |         # Directory for training logs
210 |         self.log_dir = os.path.join(self.model_dir, "{}{:%Y%m%dT%H%M}".format(
211 |             self.config.NAME.lower(), now))
212 | 
213 |         # Path to save after each epoch. Include placeholders that get filled by Keras.
214 |         self.checkpoint_path = os.path.join(self.log_dir, "Keras-RFCN_{}_*epoch*.h5".format(
215 |             self.config.NAME.lower()))
216 |         self.checkpoint_path = self.checkpoint_path.replace(
217 |             "*epoch*", "{epoch:04d}")
218 | 
219 |     def train(self, train_dataset, val_dataset, learning_rate, epochs, layers):
220 |         """Train the model.
221 |         train_dataset, val_dataset: Training and validation Dataset objects.
222 |         learning_rate: The learning rate to train with
223 |         epochs: Number of training epochs. Note that previous training epochs
224 |                 are considered to be done alreay, so this actually determines
225 |                 the epochs to train in total rather than in this particaular
226 |                 call.
227 |         layers: Allows selecting wich layers to train. It can be:
228 |             - A regular expression to match layer names to train
229 |             - One of these predefined values:
230 |               heaads: The RPN, classifier and mask heads of the network
231 |               all: All the layers
232 |               3+: Train Resnet stage 3 and up
233 |               4+: Train Resnet stage 4 and up
234 |               5+: Train Resnet stage 5 and up
235 |         """
236 |         assert self.mode == "training", "Create model in training mode."
237 | 
238 |         # Pre-defined layer regular expressions
239 |         layer_regex = {
240 |             # all layers but the backbone
241 |             "heads": r"(mrcnn\_.*)|(rpn\_.*)|(score_map\_.*)|(regr\_.*)|(classify\_.*)",
242 |             # From a specific Resnet stage and up
243 |             "3+": r"(res3.*)|(bn3.*)|(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(res6.*)|(bn6.*)|(mrcnn\_.*)|(rpn\_.*)|(score_map\_.*)|(regr\_.*)|(classify\_.*)",
244 |             "4+": r"(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(res6.*)|(bn6.*)|(mrcnn\_.*)|(rpn\_.*)|(score_map\_.*)|(regr\_.*)|(classify\_.*)",
245 |             "5+": r"(res5.*)|(bn5.*)|(res6.*)|(bn6.*)|(mrcnn\_.*)|(rpn\_.*)|(score_map\_.*)|(regr\_.*)|(classify\_.*)",
246 |             # All layers
247 |             "all": ".*",
248 |         }
249 |         if layers in layer_regex.keys():
250 |             layers = layer_regex[layers]
251 | 
252 |         # Data generators
253 |         train_generator = data_generator(train_dataset, self.config, shuffle=True,
254 |                                          batch_size=self.config.BATCH_SIZE)
255 |         val_generator = data_generator(val_dataset, self.config, shuffle=True,
256 |                                        batch_size=self.config.BATCH_SIZE,
257 |                                        augment=False)
258 | 
259 |         # Callbacks
260 |         callbacks = [
261 |             keras.callbacks.TensorBoard(log_dir=self.log_dir,
262 |                                         histogram_freq=0, write_graph=True, write_images=False),
263 |             keras.callbacks.ModelCheckpoint(self.checkpoint_path,
264 |                                             verbose=0, save_weights_only=True, save_best_only=True),
265 |             keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.01, patience=10, verbose=1, mode='auto', min_delta=0.001, min_lr=0)
266 |         ]
267 | 
268 |         # Train
269 |         print("\nStarting at epoch {}. LR={}\n".format(self.epoch, learning_rate))
270 |         print("Checkpoint Path: {}".format(self.checkpoint_path))
271 |         self.set_trainable(layers, verbose=0)
272 |         self.compile(learning_rate, self.config.LEARNING_MOMENTUM)
273 | 
274 |         # Work-around for Windows: Keras fails on Windows when using
275 |         # multiprocessing workers. See discussion here:
276 |         # https://github.com/matterport/Mask_RCNN/issues/13#issuecomment-353124009
277 |         if os.name is 'nt':
278 |             workers = 0
279 |         else:
280 |             workers = max(self.config.BATCH_SIZE // 2, 2)
281 | 
282 |         self.keras_model.fit_generator(
283 |             train_generator,
284 |             initial_epoch=self.epoch,
285 |             epochs=epochs,
286 |             steps_per_epoch=self.config.STEPS_PER_EPOCH,
287 |             callbacks=callbacks,
288 |             validation_data=next(val_generator),
289 |             validation_steps=self.config.VALIDATION_STEPS,
290 |             max_queue_size=100,
291 |             workers=4,
292 |             use_multiprocessing=True,
293 |         )
294 |         self.epoch = max(self.epoch, epochs)
295 | 
296 |     def detect(self, images, verbose=0):
297 |         """Runs the detection pipeline.
298 | 
299 |         images: List of images, potentially of different sizes.
300 | 
301 |         Returns a list of dicts, one dict per image. The dict contains:
302 |         rois: [N, (y1, x1, y2, x2)] detection bounding boxes
303 |         class_ids: [N] int class IDs
304 |         scores: [N] float probability scores for the class IDs
305 |         """
306 |         assert self.mode == "inference", "Create model in inference mode."
307 |         assert len(
308 |             images) == self.config.BATCH_SIZE, "len(images) must be equal to BATCH_SIZE"
309 | 
310 |         if verbose:
311 |             print("Processing {} images".format(len(images)))
312 | 
313 |         # Mold inputs to format expected by the neural network
314 |         molded_images, image_metas, windows = self.mold_inputs(images)
315 | 
316 |         # Run object detection
317 |         detections, mrcnn_class, mrcnn_bbox, \
318 |             rois, rpn_class, rpn_bbox =\
319 |             self.keras_model.predict([molded_images, image_metas], verbose=0)
320 | 
321 |         # Process detections
322 |         results = []
323 |         for i, image in enumerate(images):
324 |             final_rois, final_class_ids, final_scores =\
325 |                 self.unmold_detections(detections[i], image.shape, windows[i])
326 |             results.append({
327 |                 "rois": final_rois,
328 |                 "class_ids": final_class_ids,
329 |                 "scores": final_scores
330 |             })
331 |         return results
332 | 
333 |     def mold_inputs(self, images):
334 |         """Takes a list of images and modifies them to the format expected
335 |         as an input to the neural network.
336 |         images: List of image matricies [height,width,depth]. Images can have
337 |             different sizes.
338 | 
339 |         Returns 3 Numpy matricies:
340 |         molded_images: [N, h, w, 3]. Images resized and normalized.
341 |         image_metas: [N, length of meta data]. Details about each image.
342 |         windows: [N, (y1, x1, y2, x2)]. The portion of the image that has the
343 |             original image (padding excluded).
344 |         """
345 |         molded_images = []
346 |         image_metas = []
347 |         windows = []
348 |         for image in images:
349 |             # Resize image to fit the model expected size
350 |             # TODO: move resizing to mold_image()
351 |             molded_image, window, scale, padding = KerasRFCN.Utils.resize_image(
352 |                 image,
353 |                 min_dim=self.config.IMAGE_MIN_DIM,
354 |                 max_dim=self.config.IMAGE_MAX_DIM,
355 |                 padding=self.config.IMAGE_PADDING)
356 |             molded_image = KerasRFCN.Utils.mold_image(molded_image, self.config)
357 |             # Build image_meta
358 |             image_meta = KerasRFCN.Utils.compose_image_meta(
359 |                 0, image.shape, window,
360 |                 np.zeros([self.config.NUM_CLASSES], dtype=np.int32))
361 |             # Append
362 |             molded_images.append(molded_image)
363 |             windows.append(window)
364 |             image_metas.append(image_meta)
365 |         # Pack into arrays
366 |         molded_images = np.stack(molded_images)
367 |         image_metas = np.stack(image_metas)
368 |         windows = np.stack(windows)
369 |         return molded_images, image_metas, windows
370 | 
371 |     def unmold_detections(self, detections, image_shape, window):
372 |         """Reformats the detections of one image from the format of the neural
373 |         network output to a format suitable for use in the rest of the
374 |         application.
375 | 
376 |         detections: [N, (y1, x1, y2, x2, class_id, score)]
377 |         image_shape: [height, width, depth] Original size of the image before resizing
378 |         window: [y1, x1, y2, x2] Box in the image where the real image is
379 |                 excluding the padding.
380 | 
381 |         Returns:
382 |         boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
383 |         class_ids: [N] Integer class IDs for each bounding box
384 |         scores: [N] Float probability scores of the class_id
385 |         """
386 |         # How many detections do we have?
387 |         # Detections array is padded with zeros. Find the first class_id == 0.
388 |         zero_ix = np.where(detections[:, 4] == 0)[0]
389 |         N = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]
390 | 
391 |         # Extract boxes, class_ids, scores
392 |         boxes = detections[:N, :4]
393 |         class_ids = detections[:N, 4].astype(np.int32)
394 |         scores = detections[:N, 5]
395 | 
396 |         # Compute scale and shift to translate coordinates to image domain.
397 |         h_scale = image_shape[0] / (window[2] - window[0])
398 |         w_scale = image_shape[1] / (window[3] - window[1])
399 |         scale = min(h_scale, w_scale)
400 |         shift = window[:2]  # y, x
401 |         scales = np.array([scale, scale, scale, scale])
402 |         shifts = np.array([shift[0], shift[1], shift[0], shift[1]])
403 | 
404 |         # Translate bounding boxes to image domain
405 |         boxes = np.multiply(boxes - shifts, scales).astype(np.int32)
406 | 
407 |         # Filter out detections with zero area. Often only happens in early
408 |         # stages of training when the network weights are still a bit random.
409 |         exclude_ix = np.where(
410 |             (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
411 |         if exclude_ix.shape[0] > 0:
412 |             boxes = np.delete(boxes, exclude_ix, axis=0)
413 |             class_ids = np.delete(class_ids, exclude_ix, axis=0)
414 |             scores = np.delete(scores, exclude_ix, axis=0)
415 |             N = class_ids.shape[0]
416 | 
417 |         return boxes, class_ids, scores
418 | 


--------------------------------------------------------------------------------
/KerasRFCN/Model/Model.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | '''
  9 | This is Main class of RFCN Model
 10 | Contain the model's framework and call the backbone
 11 | '''
 12 | 
 13 | from KerasRFCN.Model.ResNet import ResNet
 14 | from KerasRFCN.Model.ResNet_dilated import ResNet_dilated
 15 | from KerasRFCN.Model.BaseModel import BaseModel
 16 | import KerasRFCN.Utils
 17 | import KerasRFCN.Losses
 18 | 
 19 | import keras.layers as KL
 20 | import keras.engine as KE
 21 | import tensorflow as tf
 22 | import numpy as np
 23 | import keras
 24 | import keras.backend as K
 25 | import keras.models as KM
 26 | 
 27 | class RFCN_Model(BaseModel):
 28 |     """docstring for RFCN_Model"""
 29 |     def __init__(self, mode, config, model_dir):
 30 |         """
 31 |         mode: Either "training" or "inference"
 32 |         config: A Sub-class of the Config class
 33 |         model_dir: Directory to save training logs and trained weights
 34 |         """
 35 |         assert mode in ['training', 'inference']
 36 |         assert config.BACKBONE in ['resnet50', 'resnet101', 'resnet50_dilated', 'resnet101_dilated']
 37 | 
 38 |         self.mode = mode
 39 |         self.config = config
 40 |         self.model_dir = model_dir
 41 |         self.set_log_dir()
 42 |         self.keras_model = self.build(mode=mode, config=config)
 43 | 
 44 |     def build(self, mode, config):
 45 |         assert mode in ['training', 'inference']
 46 | 
 47 |         h, w = config.IMAGE_SHAPE[:2]
 48 |         if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
 49 |             raise Exception("Image size must be dividable by 2 at least 6 times "
 50 |                             "to avoid fractions when downscaling and upscaling."
 51 |                             "For example, use 256, 320, 384, 448, 512, ... etc. ")
 52 |         # Inputs
 53 |         input_image = KL.Input(
 54 |             shape=config.IMAGE_SHAPE.tolist(), name="input_image")
 55 |         input_image_meta = KL.Input(shape=[None], name="input_image_meta")
 56 |         if mode == "training":
 57 |             # RPN GT
 58 |             input_rpn_match = KL.Input(
 59 |                 shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
 60 |             input_rpn_bbox = KL.Input(
 61 |                 shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)
 62 | 
 63 |             # Detection GT (class IDs, bounding boxes)
 64 |             # 1. GT Class IDs (zero padded)
 65 |             input_gt_class_ids = KL.Input(
 66 |                 shape=[None], name="input_gt_class_ids", dtype=tf.int32)
 67 |             # 2. GT Boxes in pixels (zero padded)
 68 |             # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
 69 |             input_gt_boxes = KL.Input(
 70 |                 shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
 71 |             # Normalize coordinates
 72 |             h, w = K.shape(input_image)[1], K.shape(input_image)[2]
 73 |             image_scale = K.cast(K.stack([h, w, h, w], axis=0), tf.float32)
 74 |             gt_boxes = KL.Lambda(lambda x: x / image_scale)(input_gt_boxes)
 75 | 
 76 |         if config.BACKBONE in ['resnet50', 'resnet101']:
 77 |             P2, P3, P4, P5, P6 = ResNet(input_image, architecture=config.BACKBONE).output_layers
 78 |         else:
 79 |             P2, P3, P4, P5, P6 = ResNet_dilated(input_image, architecture=config.BACKBONE).output_layers
 80 | 
 81 |         # Note that P6 is used in RPN, but not in the classifier heads.
 82 |         rpn_feature_maps = [P2, P3, P4, P5, P6]
 83 |         mrcnn_feature_maps = [P2, P3, P4, P5]
 84 | 
 85 |         ### RPN ###
 86 |         rpn = self.build_rpn_model(config.RPN_ANCHOR_STRIDE,
 87 |                               len(config.RPN_ANCHOR_RATIOS), 256)
 88 |         # Loop through pyramid layers
 89 |         layer_outputs = []  # list of lists
 90 |         for p in rpn_feature_maps:
 91 |             layer_outputs.append(rpn([p]))
 92 |         # Concatenate layer outputs
 93 |         # Convert from list of lists of level outputs to list of lists
 94 |         # of outputs across levels.
 95 |         # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
 96 |         output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
 97 |         outputs = list(zip(*layer_outputs))
 98 |         outputs = [KL.Concatenate(axis=1, name=n)(list(o))
 99 |                    for o, n in zip(outputs, output_names)]
100 | 
101 |         rpn_class_logits, rpn_class, rpn_bbox = outputs
102 | 
103 |         self.anchors = KerasRFCN.Utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
104 |                                               config.RPN_ANCHOR_RATIOS,
105 |                                               config.BACKBONE_SHAPES,
106 |                                               config.BACKBONE_STRIDES,
107 |                                               config.RPN_ANCHOR_STRIDE)
108 |         # window size K and total classed num C
109 |         # Example: For coco, C = 80+1
110 |         scoreMapSize = config.K * config.K
111 |         ScoreMaps_classify = []
112 |         for feature_map_count, feature_map in enumerate(mrcnn_feature_maps):
113 |             # [W * H * class_num] * k^2
114 |             ScoreMap = KL.Conv2D(config.C * scoreMapSize, kernel_size=(1,1), name="score_map_class_{}".format(feature_map_count), padding='valid')(feature_map)
115 |             ScoreMaps_classify.append(ScoreMap)
116 | 
117 |         ScoreMaps_regr = []
118 |         for feature_map_count, feature_map in enumerate(mrcnn_feature_maps):
119 |             # [W * H * 4] * k^2 ==> 4 = (x,y,w,h)
120 |             ScoreMap = KL.Conv2D(4 * scoreMapSize, kernel_size=(1,1), name="score_map_regr_{}".format(feature_map_count), padding='valid')(feature_map)
121 |             ScoreMaps_regr.append(ScoreMap)
122 | 
123 |         # Generate proposals
124 |         # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
125 |         # and zero padded.
126 |         proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\
127 |             else config.POST_NMS_ROIS_INFERENCE
128 |         rpn_rois = ProposalLayer(proposal_count=proposal_count,
129 |                                  nms_threshold=config.RPN_NMS_THRESHOLD,
130 |                                  name="ROI",
131 |                                  anchors=self.anchors,
132 |                                  config=config)([rpn_class, rpn_bbox])
133 | 
134 |         if mode == "training":
135 |             # Class ID mask to mark class IDs supported by the dataset the image
136 |             # came from.
137 |             _, _, _, active_class_ids = KL.Lambda(lambda x: parse_image_meta_graph(x))(input_image_meta)
138 | 
139 |             # Generate detection targets
140 |             # Subsamples proposals and generates target outputs for training
141 |             # Note that proposal class IDs, gt_boxes, and gt_masks are zero
142 |             # padded. Equally, returned rois and targets are zero padded.
143 |             rois, target_class_ids, target_bbox =\
144 |                 DetectionTargetLayer(config, name="proposal_targets")([
145 |                     rpn_rois, input_gt_class_ids, gt_boxes])
146 | 
147 |             # size = [batch, num_rois, class_num]
148 |             classify_vote = VotePooling(config.TRAIN_ROIS_PER_IMAGE, config.C, config.K, config.POOL_SIZE, config.BATCH_SIZE, config.IMAGE_SHAPE, name="classify_vote")([rois] + ScoreMaps_classify)
149 |             classify_output = KL.TimeDistributed(KL.Activation('softmax'),name="classify_output")(classify_vote)
150 | 
151 |             # 4 k^2 rather than 4k^2*C
152 |             regr_vote = VotePooling(config.TRAIN_ROIS_PER_IMAGE, 4, config.K, config.POOL_SIZE, config.BATCH_SIZE, config.IMAGE_SHAPE, name="regr_vote")([rois] + ScoreMaps_regr)
153 |             regr_output = KL.TimeDistributed(KL.Activation('linear'),name="regr_output")(regr_vote)
154 | 
155 |             rpn_class_loss = KL.Lambda(lambda x: KerasRFCN.Losses.rpn_class_loss_graph(*x), name="rpn_class_loss")(
156 |                 [input_rpn_match, rpn_class_logits])
157 |             rpn_bbox_loss = KL.Lambda(lambda x: KerasRFCN.Losses.rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
158 |                 [input_rpn_bbox, input_rpn_match, rpn_bbox])
159 |             class_loss = KL.Lambda(lambda x: KerasRFCN.Losses.mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
160 |                 [target_class_ids, classify_vote, active_class_ids])
161 |             bbox_loss = KL.Lambda(lambda x: KerasRFCN.Losses.mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
162 |                 [target_bbox, target_class_ids, regr_output])
163 | 
164 |             inputs = [input_image, input_image_meta,
165 |                       input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes]
166 | 
167 |             outputs = [rpn_class_logits, rpn_class, rpn_bbox,
168 |                        classify_vote, classify_output, regr_output,
169 |                        rpn_rois, rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss]
170 | 
171 |             keras_model = KM.Model(inputs, outputs, name='rfcn_train')
172 |         else: # inference
173 | 
174 |             # Network Heads
175 |             # Proposal classifier and BBox regressor heads
176 |             # size = [batch, num_rois, class_num]
177 |             classify_vote = VotePooling(proposal_count, config.C, config.K, config.POOL_SIZE, config.BATCH_SIZE, config.IMAGE_SHAPE, name="classify_vote")([rpn_rois] + ScoreMaps_classify)
178 |             classify_output = KL.TimeDistributed(KL.Activation('softmax'),name="classify_output")(classify_vote)
179 | 
180 |             # 4 k^2 rather than 4k^2*C
181 |             regr_vote = VotePooling(proposal_count, 4, config.K, config.POOL_SIZE, config.BATCH_SIZE, config.IMAGE_SHAPE, name="regr_vote")([rpn_rois] + ScoreMaps_regr)
182 |             regr_output = KL.TimeDistributed(KL.Activation('linear'),name="regr_output")(regr_vote)
183 | 
184 |             # Detections
185 |             # output is [batch, num_detections, (y1, x1, y2, x2, score)] in image coordinates
186 |             detections = DetectionLayer(config, name="mrcnn_detection")(
187 |                 [rpn_rois, classify_output, regr_output, input_image_meta])
188 | 
189 |             keras_model = KM.Model([input_image, input_image_meta],
190 |                              [detections, classify_output, regr_output, rpn_rois, rpn_class, rpn_bbox],
191 |                              name='rfcn_inference')
192 |         return keras_model
193 |             
194 |     def build_rpn_model(self, anchor_stride, anchors_per_location, depth):
195 |         """Builds a Keras model of the Region Proposal Network.
196 |         It wraps the RPN graph so it can be used multiple times with shared
197 |         weights.
198 | 
199 |         anchors_per_location: number of anchors per pixel in the feature map
200 |         anchor_stride: Controls the density of anchors. Typically 1 (anchors for
201 |                        every pixel in the feature map), or 2 (every other pixel).
202 |         depth: Depth of the backbone feature map.
203 | 
204 |         Returns a Keras Model object. The model outputs, when called, are:
205 |         rpn_logits: [batch, H, W, 2] Anchor classifier logits (before softmax)
206 |         rpn_probs: [batch, W, W, 2] Anchor classifier probabilities.
207 |         rpn_bbox: [batch, H, W, (dy, dx, log(dh), log(dw))] Deltas to be
208 |                     applied to anchors.
209 |         """
210 |         input_feature_map = KL.Input(shape=[None, None, depth],
211 |                                      name="input_rpn_feature_map")
212 |         outputs = self.rpn(input_feature_map, anchors_per_location, anchor_stride)
213 |         return KM.Model([input_feature_map], outputs, name="rpn_model")
214 | 
215 |     def rpn(self, feature_map, anchors_per_location, anchor_stride):
216 |         """Builds a Keras model of the Region Proposal Network.
217 |         It wraps the RPN graph so it can be used multiple times with shared
218 |         weights.
219 | 
220 |         anchors_per_location: number of anchors per pixel in the feature map
221 |         anchor_stride: Controls the density of anchors. Typically 1 (anchors for
222 |                        every pixel in the feature map), or 2 (every other pixel).
223 |         depth: Depth of the backbone feature map.
224 | 
225 |         Returns a Keras Model object. The model outputs, when called, are:
226 |         rpn_logits: [batch, H, W, 2] Anchor classifier logits (before softmax)
227 |         rpn_probs: [batch, W, W, 2] Anchor classifier probabilities.
228 |         rpn_bbox: [batch, H, W, (dy, dx, log(dh), log(dw))] Deltas to be
229 |                     applied to anchors.
230 |         """
231 | 
232 |         shared = KL.Conv2D(512, (3, 3), padding='same', activation='relu',
233 |                            strides=anchor_stride,
234 |                            name='rpn_conv_shared')(feature_map)
235 | 
236 |         # Anchor Score. [batch, height, width, anchors per location * 2].
237 |         x = KL.Conv2D(2 * anchors_per_location, (1, 1), padding='valid',
238 |                       activation='linear', name='rpn_class_raw')(shared)
239 | 
240 |         # Reshape to [batch, anchors, 2]
241 |         rpn_class_logits = KL.Lambda(
242 |             lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x)
243 | 
244 |         # Softmax on last dimension of BG/FG.
245 |         rpn_probs = KL.Activation(
246 |             "softmax", name="rpn_class_xxx")(rpn_class_logits)
247 | 
248 |         # Bounding box refinement. [batch, H, W, anchors per location, depth]
249 |         # where depth is [x, y, log(w), log(h)]
250 |         x = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid",
251 |                       activation='linear', name='rpn_bbox_pred')(shared)
252 | 
253 |         # Reshape to [batch, anchors, 4]
254 |         rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x)
255 | 
256 |         return rpn_class_logits, rpn_probs, rpn_bbox
257 | 
258 | 
259 | ############################################################
260 | #  Proposal Layer
261 | ############################################################
262 | 
263 | def apply_box_deltas_graph(boxes, deltas):
264 |     """Applies the given deltas to the given boxes.
265 |     boxes: [N, 4] where each row is y1, x1, y2, x2
266 |     deltas: [N, 4] where each row is [dy, dx, log(dh), log(dw)]
267 |     """
268 |     # Convert to y, x, h, w
269 |     height = boxes[:, 2] - boxes[:, 0]
270 |     width = boxes[:, 3] - boxes[:, 1]
271 |     center_y = boxes[:, 0] + 0.5 * height
272 |     center_x = boxes[:, 1] + 0.5 * width
273 |     # Apply deltas
274 |     center_y += deltas[:, 0] * height
275 |     center_x += deltas[:, 1] * width
276 |     height *= tf.exp(deltas[:, 2])
277 |     width *= tf.exp(deltas[:, 3])
278 |     # Convert back to y1, x1, y2, x2
279 |     y1 = center_y - 0.5 * height
280 |     x1 = center_x - 0.5 * width
281 |     y2 = y1 + height
282 |     x2 = x1 + width
283 |     result = tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out")
284 |     return result
285 | 
286 | 
287 | def clip_boxes_graph(boxes, window):
288 |     """
289 |     boxes: [N, 4] each row is y1, x1, y2, x2
290 |     window: [4] in the form y1, x1, y2, x2
291 |     """
292 |     # Split corners
293 |     wy1, wx1, wy2, wx2 = tf.split(window, 4)
294 |     y1, x1, y2, x2 = tf.split(boxes, 4, axis=1)
295 |     # Clip
296 |     y1 = tf.maximum(tf.minimum(y1, wy2), wy1)
297 |     x1 = tf.maximum(tf.minimum(x1, wx2), wx1)
298 |     y2 = tf.maximum(tf.minimum(y2, wy2), wy1)
299 |     x2 = tf.maximum(tf.minimum(x2, wx2), wx1)
300 |     clipped = tf.concat([y1, x1, y2, x2], axis=1, name="clipped_boxes")
301 |     return clipped
302 | 
303 | 
304 | class ProposalLayer(KE.Layer):
305 |     """Receives anchor scores and selects a subset to pass as proposals
306 |     to the second stage. Filtering is done based on anchor scores and
307 |     non-max suppression to remove overlaps. It also applies bounding
308 |     box refinment detals to anchors.
309 | 
310 |     Inputs:
311 |         rpn_probs: [batch, anchors, (bg prob, fg prob)]
312 |         rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
313 | 
314 |     Returns:
315 |         Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
316 |     """
317 | 
318 |     def __init__(self, proposal_count, nms_threshold, anchors,
319 |                  config=None, **kwargs):
320 |         """
321 |         anchors: [N, (y1, x1, y2, x2)] anchors defined in image coordinates
322 |         """
323 |         super(ProposalLayer, self).__init__(**kwargs)
324 |         self.config = config
325 |         self.proposal_count = proposal_count
326 |         self.nms_threshold = nms_threshold
327 |         self.anchors = anchors.astype(np.float32)
328 | 
329 |     def call(self, inputs):
330 |         # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
331 |         scores = inputs[0][:, :, 1]
332 |         # Box deltas [batch, num_rois, 4]
333 |         deltas = inputs[1]
334 |         deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4])
335 |         # Base anchors
336 |         anchors = self.anchors
337 | 
338 |         # Improve performance by trimming to top anchors by score
339 |         # and doing the rest on the smaller subset.
340 |         pre_nms_limit = min(6000, self.anchors.shape[0])
341 |         ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
342 |                          name="top_anchors").indices
343 |         scores = KerasRFCN.Utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y),
344 |                                    self.config.IMAGES_PER_GPU)
345 |         deltas = KerasRFCN.Utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y),
346 |                                    self.config.IMAGES_PER_GPU)
347 |         anchors = KerasRFCN.Utils.batch_slice(ix, lambda x: tf.gather(anchors, x),
348 |                                     self.config.IMAGES_PER_GPU,
349 |                                     names=["pre_nms_anchors"])
350 | 
351 |         # Apply deltas to anchors to get refined anchors.
352 |         # [batch, N, (y1, x1, y2, x2)]
353 |         boxes = KerasRFCN.Utils.batch_slice([anchors, deltas],
354 |                                   lambda x, y: apply_box_deltas_graph(x, y),
355 |                                   self.config.IMAGES_PER_GPU,
356 |                                   names=["refined_anchors"])
357 | 
358 |         # Clip to image boundaries. [batch, N, (y1, x1, y2, x2)]
359 |         height, width = self.config.IMAGE_SHAPE[:2]
360 |         window = np.array([0, 0, height, width]).astype(np.float32)
361 |         boxes = KerasRFCN.Utils.batch_slice(boxes,
362 |                                   lambda x: clip_boxes_graph(x, window),
363 |                                   self.config.IMAGES_PER_GPU,
364 |                                   names=["refined_anchors_clipped"])
365 | 
366 |         # Filter out small boxes
367 |         # According to Xinlei Chen's paper, this reduces detection accuracy
368 |         # for small objects, so we're skipping it.
369 | 
370 |         # Normalize dimensions to range of 0 to 1.
371 |         normalized_boxes = boxes / np.array([[height, width, height, width]])
372 | 
373 |         # Non-max suppression
374 |         def nms(normalized_boxes, scores):
375 |             indices = tf.image.non_max_suppression(
376 |                 normalized_boxes, scores, self.proposal_count,
377 |                 self.nms_threshold, name="rpn_non_max_suppression")
378 |             proposals = tf.gather(normalized_boxes, indices)
379 |             # Pad if needed
380 |             padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
381 |             proposals = tf.pad(proposals, [(0, padding), (0, 0)])
382 |             return proposals
383 |         proposals = KerasRFCN.Utils.batch_slice([normalized_boxes, scores], nms,
384 |                                       self.config.IMAGES_PER_GPU)
385 |         return proposals
386 | 
387 |     def compute_output_shape(self, input_shape):
388 |         return (None, self.proposal_count, 4)
389 | 
390 | ############################################################
391 | #  Detection Target Layer
392 | ############################################################
393 | 
394 | def overlaps_graph(boxes1, boxes2):
395 |     """Computes IoU overlaps between two sets of boxes.
396 |     boxes1, boxes2: [N, (y1, x1, y2, x2)].
397 |     """
398 |     # 1. Tile boxes2 and repeate boxes1. This allows us to compare
399 |     # every boxes1 against every boxes2 without loops.
400 |     # TF doesn't have an equivalent to np.repeate() so simulate it
401 |     # using tf.tile() and tf.reshape.
402 |     b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1),
403 |                             [1, 1, tf.shape(boxes2)[0]]), [-1, 4])
404 |     b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
405 |     # 2. Compute intersections
406 |     b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
407 |     b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
408 |     y1 = tf.maximum(b1_y1, b2_y1)
409 |     x1 = tf.maximum(b1_x1, b2_x1)
410 |     y2 = tf.minimum(b1_y2, b2_y2)
411 |     x2 = tf.minimum(b1_x2, b2_x2)
412 |     intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
413 |     # 3. Compute unions
414 |     b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
415 |     b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
416 |     union = b1_area + b2_area - intersection
417 |     # 4. Compute IoU and reshape to [boxes1, boxes2]
418 |     iou = intersection / union
419 |     overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
420 |     return overlaps
421 | 
422 | 
423 | def detection_targets_graph(proposals, gt_class_ids, gt_boxes, config):
424 |     """Generates detection targets for one image. Subsamples proposals and
425 |     generates target class IDs, bounding box deltas for each.
426 | 
427 |     Inputs:
428 |     proposals: [N, (y1, x1, y2, x2)] in normalized coordinates. Might
429 |                be zero padded if there are not enough proposals.
430 |     gt_class_ids: [MAX_GT_INSTANCES] int class IDs
431 |     gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
432 | 
433 |     Returns: Target ROIs and corresponding class IDs, bounding box shifts
434 |     rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
435 |     class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
436 |     deltas: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
437 |             Class-specific bbox refinments.
438 | 
439 |     Note: Returned arrays might be zero padded if not enough target ROIs.
440 |     """
441 |     # Assertions
442 |     asserts = [
443 |         tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals],
444 |                   name="roi_assertion"),
445 |     ]
446 |     with tf.control_dependencies(asserts):
447 |         proposals = tf.identity(proposals)
448 | 
449 |     # Remove zero padding
450 |     proposals, _ = trim_zeros_graph(proposals, name="trim_proposals")
451 |     gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes")
452 |     gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros,
453 |                                    name="trim_gt_class_ids")
454 | 
455 |     # Handle COCO crowds
456 |     # A crowd box in COCO is a bounding box around several instances. Exclude
457 |     # them from training. A crowd box is given a negative class ID.
458 |     crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
459 |     non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
460 |     crowd_boxes = tf.gather(gt_boxes, crowd_ix)
461 | 
462 |     gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
463 |     gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
464 | 
465 |     # Compute overlaps matrix [proposals, gt_boxes]
466 |     overlaps = overlaps_graph(proposals, gt_boxes)
467 | 
468 |     # Compute overlaps with crowd boxes [anchors, crowds]
469 |     crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
470 |     crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
471 |     no_crowd_bool = (crowd_iou_max < 0.001)
472 | 
473 |     # Determine postive and negative ROIs
474 |     roi_iou_max = tf.reduce_max(overlaps, axis=1)
475 |     # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
476 |     positive_roi_bool = (roi_iou_max >= 0.5)
477 |     positive_indices = tf.where(positive_roi_bool)[:, 0]
478 |     # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
479 |     negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]
480 | 
481 |     # Subsample ROIs. Aim for 33% positive
482 |     # Positive ROIs
483 |     positive_count = int(config.TRAIN_ROIS_PER_IMAGE *
484 |                          config.ROI_POSITIVE_RATIO)
485 |     positive_indices = tf.random_shuffle(positive_indices)[:positive_count]
486 |     positive_count = tf.shape(positive_indices)[0]
487 |     # Negative ROIs. Add enough to maintain positive:negative ratio.
488 |     r = 1.0 / config.ROI_POSITIVE_RATIO
489 |     negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count
490 |     negative_indices = tf.random_shuffle(negative_indices)[:negative_count]
491 |     # Gather selected ROIs
492 |     positive_rois = tf.gather(proposals, positive_indices)
493 |     negative_rois = tf.gather(proposals, negative_indices)
494 | 
495 |     # Assign positive ROIs to GT boxes.
496 |     positive_overlaps = tf.gather(overlaps, positive_indices)
497 |     roi_gt_box_assignment = tf.argmax(positive_overlaps, axis=1)
498 |     roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)
499 |     roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)
500 | 
501 |     # Compute bbox refinement for positive ROIs
502 |     deltas = KerasRFCN.Utils.box_refinement_graph(positive_rois, roi_gt_boxes)
503 |     deltas /= config.BBOX_STD_DEV
504 | 
505 |     # Append negative ROIs and pad bbox deltas and masks that
506 |     # are not used for negative ROIs with zeros.
507 |     rois = tf.concat([positive_rois, negative_rois], axis=0)
508 |     N = tf.shape(negative_rois)[0]
509 |     P = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
510 |     rois = tf.pad(rois, [(0, P), (0, 0)])
511 |     roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
512 |     roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
513 |     deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
514 | 
515 |     return rois, roi_gt_class_ids, deltas
516 | 
517 | def trim_zeros_graph(boxes, name=None):
518 |     """Often boxes are represented with matricies of shape [N, 4] and
519 |     are padded with zeros. This removes zero boxes.
520 | 
521 |     boxes: [N, 4] matrix of boxes.
522 |     non_zeros: [N] a 1D boolean mask identifying the rows to keep
523 |     """
524 |     non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
525 |     boxes = tf.boolean_mask(boxes, non_zeros, name=name)
526 |     return boxes, non_zeros
527 | 
528 | class DetectionTargetLayer(KE.Layer):
529 |     """Subsamples proposals and generates target box refinment, class_ids for each.
530 | 
531 |     Inputs:
532 |     proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might
533 |                be zero padded if there are not enough proposals.
534 |     gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
535 |     gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized
536 |               coordinates.
537 | 
538 |     Returns: Target ROIs and corresponding class IDs, bounding box shifts
539 |     rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized
540 |           coordinates
541 |     target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
542 |     target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, NUM_CLASSES,
543 |                     (dy, dx, log(dh), log(dw), class_id)]
544 |                    Class-specific bbox refinments.
545 | 
546 |     Note: Returned arrays might be zero padded if not enough target ROIs.
547 |     """
548 | 
549 |     def __init__(self, config, **kwargs):
550 |         super(DetectionTargetLayer, self).__init__(**kwargs)
551 |         self.config = config
552 | 
553 |     def call(self, inputs):
554 |         proposals = inputs[0]
555 |         gt_class_ids = inputs[1]
556 |         gt_boxes = inputs[2]
557 | 
558 |         # Slice the batch and run a graph for each slice
559 |         # TODO: Rename target_bbox to target_deltas for clarity
560 |         names = ["rois", "target_class_ids", "target_bbox"]
561 |         outputs = KerasRFCN.Utils.batch_slice(
562 |             [proposals, gt_class_ids, gt_boxes],
563 |             lambda w, x, y: detection_targets_graph(
564 |                 w, x, y, self.config),
565 |             self.config.IMAGES_PER_GPU, names=names)
566 |         return outputs
567 | 
568 |     def compute_output_shape(self, input_shape):
569 |         return [
570 |             (None, self.config.TRAIN_ROIS_PER_IMAGE, 4),  # rois
571 |             (None, 1),  # class_ids
572 |             (None, self.config.TRAIN_ROIS_PER_IMAGE, 4),  # deltas
573 |         ]
574 | 
575 | ############################################################
576 | #  ROI pooling on Muti Bins
577 | ############################################################
578 | 
579 | def log2_graph(x):
580 |     """Implementatin of Log2. TF doesn't have a native implemenation."""
581 |     return tf.log(x) / tf.log(2.0)
582 | 
583 | class VotePooling(KE.Layer):
584 |     def __init__(self, num_rois, channel_num, k, pool_shape, batch_size, image_shape, **kwargs):
585 |         super(VotePooling, self).__init__(**kwargs)
586 |         self.channel_num = channel_num
587 |         self.k = k
588 |         self.num_rois = num_rois
589 |         self.pool_shape = pool_shape
590 |         self.batch_size = batch_size
591 |         self.image_shape = image_shape
592 | 
593 |     def call(self, inputs):
594 |         boxes = inputs[0]
595 | 
596 |         # Feature Maps. List of feature maps from different level of the
597 |         # feature pyramid. Each is [batch, height, width, channels]
598 |         score_maps = inputs[1:]
599 | 
600 |         # Assign each ROI to a level in the pyramid based on the ROI area.
601 |         y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
602 |         h = y2 - y1
603 |         w = x2 - x1
604 |         # Equation 1 in the Feature Pyramid Networks paper. Account for
605 |         # the fact that our coordinates are normalized here.
606 |         # e.g. a 224x224 ROI (in pixels) maps to P4
607 |         image_area = tf.cast(
608 |             self.image_shape[0] * self.image_shape[1], tf.float32)
609 |         roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
610 |         roi_level = tf.minimum(5, tf.maximum(
611 |             2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
612 |         roi_level = tf.squeeze(roi_level, 2)
613 | 
614 |         # Loop through levels and apply ROI pooling to each. P2 to P5.
615 |         pooled = []
616 |         box_to_level = []
617 |         for i, level in enumerate(range(2, 6)):
618 |             ix = tf.where(tf.equal(roi_level, level))
619 |             level_boxes = tf.gather_nd(boxes, ix)
620 | 
621 |             # Box indicies for crop_and_resize.
622 |             box_indices = tf.cast(ix[:, 0], tf.int32)
623 | 
624 |             # Keep track of which box is mapped to which level
625 |             box_to_level.append(ix)
626 | 
627 |             # Stop gradient propogation to ROI proposals
628 |             level_boxes = tf.stop_gradient(level_boxes)
629 |             box_indices = tf.stop_gradient(box_indices)
630 | 
631 |             # Here we use the simplified approach of a single value per bin,
632 |             # which is how it's done in tf.crop_and_resize()
633 |             # Result: [batch * num_boxes, pool_height, pool_width, channels]
634 |             pooled.append(tf.image.crop_and_resize(
635 |                 score_maps[i], level_boxes, box_indices, [self.pool_shape * self.k, self.pool_shape * self.k],
636 |                 method="bilinear"))
637 | 
638 |         # Pack pooled features into one tensor
639 |         pooled = tf.concat(pooled, axis=0)
640 | 
641 |         # position-sensitive ROI pooling + classify
642 |         score_map_bins = []
643 |         for channel_step in range(self.k*self.k):
644 |             bin_x = K.variable( int(channel_step % self.k) * self.pool_shape, dtype='int32')
645 |             bin_y = K.variable( int(channel_step / self.k) * self.pool_shape, dtype='int32')
646 |             channel_indices = K.variable(list(range(channel_step*self.channel_num, (channel_step+1)*self.channel_num)), dtype='int32')
647 |             croped = tf.image.crop_to_bounding_box(
648 |                 tf.gather( pooled, indices=channel_indices, axis=-1), bin_y, bin_x, self.pool_shape, self.pool_shape)
649 |             # [pool_shape, pool_shape, channel_num] ==> [1,1,channel_num] ==> [1, channel_num]
650 |             croped_mean = K.pool2d(croped, (self.pool_shape, self.pool_shape), strides=(1, 1), padding='valid', data_format="channels_last", pool_mode='avg')
651 |             # [batch * num_rois, 1,1,channel_num] ==> [batch * num_rois, 1, channel_num] 
652 |             croped_mean = K.squeeze(croped_mean, axis=1)
653 |             score_map_bins.append(croped_mean)
654 |         # [batch * num_rois, k^2, channel_num]
655 |         score_map_bins = tf.concat(score_map_bins, axis=1)
656 |         # [batch * num_rois, k*k, channel_num] ==> [batch * num_rois,channel_num]
657 |         # because "keepdims=False", the axis 1 will not keep. else will be [batch * num_rois,1,channel_num]
658 |         pooled = K.sum(score_map_bins, axis=1)
659 | 
660 |         # Pack box_to_level mapping into one array and add another
661 |         # column representing the order of pooled boxes
662 |         box_to_level = tf.concat(box_to_level, axis=0)
663 |         box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
664 |         box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
665 |                                  axis=1)
666 | 
667 |         # Rearrange pooled features to match the order of the original boxes
668 |         # Sort box_to_level by batch then box index
669 |         # TF doesn't have a way to sort by two columns, so merge them and sort.
670 |         sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
671 |         ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
672 |             box_to_level)[0]).indices[::-1]
673 |         ix = tf.gather(box_to_level[:, 2], ix)
674 |         pooled = tf.gather(pooled, ix)
675 | 
676 |         # Re-add the batch dimension
677 |         pooled = tf.expand_dims(pooled, 0)
678 |         
679 |         return pooled
680 | 
681 |     def compute_output_shape(self, input_shape):
682 |         return None, self.num_rois, self.channel_num
683 | 
684 | ############################################################
685 | #  Detection Layer
686 | ############################################################
687 | 
688 | def clip_to_window(window, boxes):
689 |     """
690 |     window: (y1, x1, y2, x2). The window in the image we want to clip to.
691 |     boxes: [N, (y1, x1, y2, x2)]
692 |     """
693 |     boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], window[2]), window[0])
694 |     boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], window[3]), window[1])
695 |     boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], window[2]), window[0])
696 |     boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], window[3]), window[1])
697 |     return boxes
698 | 
699 | 
700 | def refine_detections_graph(rois, probs, deltas, window, config):
701 |     """Refine classified proposals and filter overlaps and return final
702 |     detections.
703 |     
704 |     Inputs:
705 |         rois: [N, (y1, x1, y2, x2)] in normalized coordinates
706 |         probs: [N, num_classes]. Class probabilities.
707 |         deltas: [N, (dy, dx, log(dh), log(dw))]. Class-specific
708 |                 bounding box deltas.
709 |         window: (y1, x1, y2, x2) in image coordinates. The part of the image
710 |             that contains the image excluding the padding.
711 | 
712 |     Returns detections shaped: [N, (y1, x1, y2, x2, class_id, score)] where
713 |         coordinates are in image domain.
714 |     """
715 | 
716 |     # Class IDs per ROI
717 |     class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
718 |     # Class probability of the top class of each ROI
719 |     indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1)
720 |     class_scores = tf.gather_nd(probs, indices)
721 |     # Class-specific bounding box deltas
722 |     # deltas_specific = tf.gather_nd(deltas, indices)
723 |     # Apply bounding box deltas
724 |     # Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
725 |     refined_rois = apply_box_deltas_graph(
726 |         rois, deltas * config.BBOX_STD_DEV)
727 |     # Convert coordiates to image domain
728 |     # TODO: better to keep them normalized until later
729 |     height, width = config.IMAGE_SHAPE[:2]
730 |     refined_rois *= tf.constant([height, width, height, width], dtype=tf.float32)
731 |     # Clip boxes to image window
732 |     refined_rois = clip_boxes_graph(refined_rois, window)
733 |     # Round and cast to int since we're deadling with pixels now
734 |     refined_rois = tf.to_int32(tf.rint(refined_rois))
735 | 
736 |     # TODO: Filter out boxes with zero area
737 | 
738 |     # Filter out background boxes
739 |     keep = tf.where(class_ids > 0)[:, 0]
740 |     # Filter out low confidence boxes
741 |     if config.DETECTION_MIN_CONFIDENCE:
742 |         conf_keep = tf.where(class_scores >= config.DETECTION_MIN_CONFIDENCE)[:, 0]
743 |         keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
744 |                                         tf.expand_dims(conf_keep, 0))
745 |         keep = tf.sparse_tensor_to_dense(keep)[0]
746 | 
747 |     # Apply per-class NMS
748 |     # 1. Prepare variables
749 |     pre_nms_class_ids = tf.gather(class_ids, keep)
750 |     pre_nms_scores = tf.gather(class_scores, keep)
751 |     pre_nms_rois = tf.gather(refined_rois,   keep)
752 |     unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]
753 | 
754 |     def nms_keep_map(class_id):
755 |         """Apply Non-Maximum Suppression on ROIs of the given class."""
756 |         # Indices of ROIs of the given class
757 |         ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
758 |         # Apply NMS
759 |         class_keep = tf.image.non_max_suppression(
760 |                 tf.to_float(tf.gather(pre_nms_rois, ixs)),
761 |                 tf.gather(pre_nms_scores, ixs),
762 |                 max_output_size=config.DETECTION_MAX_INSTANCES,
763 |                 iou_threshold=config.DETECTION_NMS_THRESHOLD)
764 |         # Map indicies
765 |         class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
766 |         # Pad with -1 so returned tensors have the same shape
767 |         gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0]
768 |         class_keep = tf.pad(class_keep, [(0, gap)],
769 |                             mode='CONSTANT', constant_values=-1)
770 |         # Set shape so map_fn() can infer result shape
771 |         class_keep.set_shape([config.DETECTION_MAX_INSTANCES])
772 |         return class_keep
773 | 
774 |     # 2. Map over class IDs
775 |     nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids,
776 |                          dtype=tf.int64)
777 |     # 3. Merge results into one list, and remove -1 padding
778 |     nms_keep = tf.reshape(nms_keep, [-1])
779 |     nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
780 |     # 4. Compute intersection between keep and nms_keep
781 |     keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
782 |                                     tf.expand_dims(nms_keep, 0))
783 |     keep = tf.sparse_tensor_to_dense(keep)[0]
784 |     # Keep top detections
785 |     roi_count = config.DETECTION_MAX_INSTANCES
786 |     class_scores_keep = tf.gather(class_scores, keep)
787 |     num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
788 |     top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
789 |     keep = tf.gather(keep, top_ids)
790 | 
791 |     # Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
792 |     # Coordinates are in image domain.
793 |     detections = tf.concat([
794 |         tf.to_float(tf.gather(refined_rois, keep)),
795 |         tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis],
796 |         tf.gather(class_scores, keep)[..., tf.newaxis]
797 |         ], axis=1)
798 | 
799 |     # Pad with zeros if detections < DETECTION_MAX_INSTANCES
800 |     gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
801 |     detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
802 |     return detections
803 | 
804 | def parse_image_meta_graph(meta):
805 |     """Parses a tensor that contains image attributes to its components.
806 |     See compose_image_meta() for more details.
807 | 
808 |     meta: [batch, meta length] where meta length depends on NUM_CLASSES
809 |     """
810 |     image_id = meta[:, 0]
811 |     image_shape = meta[:, 1:4]
812 |     window = meta[:, 4:8]
813 |     active_class_ids = meta[:, 8:]
814 |     return [image_id, image_shape, window, active_class_ids]
815 | 
816 | class DetectionLayer(KE.Layer):
817 |     """Takes classified proposal boxes and their bounding box deltas and
818 |     returns the final detection boxes.
819 | 
820 |     Returns:
821 |     [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] where
822 |     coordinates are in image domain
823 |     """
824 | 
825 |     def __init__(self, config=None, **kwargs):
826 |         super(DetectionLayer, self).__init__(**kwargs)
827 |         self.config = config
828 | 
829 |     def call(self, inputs):
830 |         rois = inputs[0]
831 |         mrcnn_class = inputs[1]
832 |         mrcnn_bbox = inputs[2]
833 |         image_meta = inputs[3]
834 | 
835 |         # Run detection refinement graph on each item in the batch
836 |         _, _, window, _ = parse_image_meta_graph(image_meta)
837 |         detections_batch = KerasRFCN.Utils.batch_slice(
838 |             [rois, mrcnn_class, mrcnn_bbox, window],
839 |             lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config),
840 |             self.config.IMAGES_PER_GPU)
841 | 
842 |         # Reshape output
843 |         # [batch, num_detections, (y1, x1, y2, x2, class_score)] in pixels
844 |         return tf.reshape(
845 |             detections_batch,
846 |             [self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6])
847 | 
848 |     def compute_output_shape(self, input_shape):
849 |         return (None, self.config.DETECTION_MAX_INSTANCES, 6)
850 | 


--------------------------------------------------------------------------------
/KerasRFCN/Model/ResNet.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | '''
  9 | This is Backbone of RFCN Model
 10 | ResNet50 or 101
 11 | '''
 12 | 
 13 | import keras.layers as KL
 14 | 
 15 | class ResNet(object):
 16 |     """docstring for ResNet101"""
 17 |     def __init__(self, input_tensor, architecture='resnet50'):
 18 |         self.keras_model = ""
 19 |         self.input_tensor = input_tensor
 20 |         self.output_layers = ""
 21 |         assert architecture in ['resnet50', 'resnet101'], 'architecture must be resnet50 or resnet101!'
 22 |         self.architecture = architecture
 23 |         self.construct_graph(input_tensor)
 24 |         
 25 |     def construct_graph(self, input_tensor, stage5=True):
 26 |         assert self.input_tensor is not None, "input_tensor can not be none!"
 27 |         # Stage 1
 28 |         x = KL.ZeroPadding2D((3, 3))(input_tensor)
 29 |         x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x)
 30 |         x = BatchNorm(axis=3, name='bn_conv1')(x)
 31 |         x = KL.Activation('relu')(x)
 32 |         C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
 33 |         # Stage 2
 34 |         x = self.conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
 35 |         x = self.identity_block(x, 3, [64, 64, 256], stage=2, block='b')
 36 |         C2 = x = self.identity_block(x, 3, [64, 64, 256], stage=2, block='c')
 37 |         # Stage 3
 38 |         x = self.conv_block(x, 3, [128, 128, 512], stage=3, block='a')
 39 |         x = self.identity_block(x, 3, [128, 128, 512], stage=3, block='b')
 40 |         x = self.identity_block(x, 3, [128, 128, 512], stage=3, block='c')
 41 |         C3 = x = self.identity_block(x, 3, [128, 128, 512], stage=3, block='d')
 42 |         # Stage 4
 43 |         x = self.conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
 44 |         block_count = {"resnet50": 5, "resnet101": 22}[self.architecture]
 45 |         for i in range(block_count):
 46 |             x = self.identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i))
 47 |         C4 = x
 48 |         # Stage 5
 49 |         if stage5:
 50 |             x = self.conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
 51 |             x = self.identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
 52 |             C5 = x = self.identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
 53 |         else:
 54 |             C5 = None
 55 | 
 56 |         P5 = KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5)
 57 |         P4 = KL.Add(name="fpn_p4add")([
 58 |             KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
 59 |             KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4)])
 60 |         P3 = KL.Add(name="fpn_p3add")([
 61 |             KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
 62 |             KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3)])
 63 |         P2 = KL.Add(name="fpn_p2add")([
 64 |             KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
 65 |             KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2)])
 66 | 
 67 |         # Attach 3x3 conv to all P layers to get the final feature maps.
 68 |         P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2)
 69 |         P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3)
 70 |         P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4)
 71 |         P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5)
 72 |         # P6 is used for the 5th anchor scale in RPN. Generated by
 73 |         # subsampling from P5 with stride of 2.
 74 |         P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)
 75 | 
 76 |         self.output_layers = [P2, P3, P4, P5, P6]
 77 | 
 78 |     def conv_block(self, input_tensor, kernel_size, filters, stage, block,
 79 |                strides=(2, 2), use_bias=True):
 80 |         """conv_block is the block that has a conv layer at shortcut
 81 |         # Arguments
 82 |             input_tensor: input tensor
 83 |             kernel_size: defualt 3, the kernel size of middle conv layer at main path
 84 |             filters: list of integers, the nb_filters of 3 conv layer at main path
 85 |             stage: integer, current stage label, used for generating layer names
 86 |             block: 'a','b'..., current block label, used for generating layer names
 87 |         Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
 88 |         And the shortcut should have subsample=(2,2) as well
 89 |         """
 90 |         nb_filter1, nb_filter2, nb_filter3 = filters
 91 |         conv_name_base = 'res' + str(stage) + block + '_branch'
 92 |         bn_name_base = 'bn' + str(stage) + block + '_branch'
 93 | 
 94 |         x = KL.Conv2D(nb_filter1, (1, 1), strides=strides,
 95 |                       name=conv_name_base + '2a', use_bias=use_bias)(input_tensor)
 96 |         x = BatchNorm(axis=3, name=bn_name_base + '2a')(x)
 97 |         x = KL.Activation('relu')(x)
 98 | 
 99 |         x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
100 |                       name=conv_name_base + '2b', use_bias=use_bias)(x)
101 |         x = BatchNorm(axis=3, name=bn_name_base + '2b')(x)
102 |         x = KL.Activation('relu')(x)
103 | 
104 |         x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base +
105 |                       '2c', use_bias=use_bias)(x)
106 |         x = BatchNorm(axis=3, name=bn_name_base + '2c')(x)
107 | 
108 |         shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides,
109 |                              name=conv_name_base + '1', use_bias=use_bias)(input_tensor)
110 |         shortcut = BatchNorm(axis=3, name=bn_name_base + '1')(shortcut)
111 | 
112 |         x = KL.Add()([x, shortcut])
113 |         x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
114 |         return x
115 | 
116 |     def identity_block(self, input_tensor, kernel_size, filters, stage, block,
117 |                    use_bias=True):
118 |         """The identity_block is the block that has no conv layer at shortcut
119 |         # Arguments
120 |             input_tensor: input tensor
121 |             kernel_size: defualt 3, the kernel size of middle conv layer at main path
122 |             filters: list of integers, the nb_filters of 3 conv layer at main path
123 |             stage: integer, current stage label, used for generating layer names
124 |             block: 'a','b'..., current block label, used for generating layer names
125 |         """
126 |         nb_filter1, nb_filter2, nb_filter3 = filters
127 |         conv_name_base = 'res' + str(stage) + block + '_branch'
128 |         bn_name_base = 'bn' + str(stage) + block + '_branch'
129 | 
130 |         x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a',
131 |                       use_bias=use_bias)(input_tensor)
132 |         x = BatchNorm(axis=3, name=bn_name_base + '2a')(x)
133 |         x = KL.Activation('relu')(x)
134 | 
135 |         x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
136 |                       name=conv_name_base + '2b', use_bias=use_bias)(x)
137 |         x = BatchNorm(axis=3, name=bn_name_base + '2b')(x)
138 |         x = KL.Activation('relu')(x)
139 | 
140 |         x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c',
141 |                       use_bias=use_bias)(x)
142 |         x = BatchNorm(axis=3, name=bn_name_base + '2c')(x)
143 | 
144 |         x = KL.Add()([x, input_tensor])
145 |         x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
146 |         return x
147 | 
148 | class BatchNorm(KL.BatchNormalization):
149 |     """Batch Normalization class. Subclasses the Keras BN class and
150 |     hardcodes training=False so the BN layer doesn't update
151 |     during training.
152 | 
153 |     Batch normalization has a negative effect on training if batches are small
154 |     so we disable it here.
155 |     """
156 | 
157 |     def call(self, inputs, training=None):
158 |         return super(self.__class__, self).call(inputs, training=False)


--------------------------------------------------------------------------------
/KerasRFCN/Model/ResNet_dilated.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | '''
  9 | This is Backbone of RFCN Model
 10 | Dilated ResNet50 or 101
 11 | Paper: DetNet: A Backbone network for Object Detection
 12 | https://arxiv.org/abs/1804.06215
 13 | '''
 14 | 
 15 | import keras.layers as KL
 16 | 
 17 | class ResNet_dilated(object):
 18 |     """docstring for ResNet101"""
 19 |     def __init__(self, input_tensor, architecture='resnet50'):
 20 |         self.keras_model = ""
 21 |         self.input_tensor = input_tensor
 22 |         self.output_layers = ""
 23 |         assert architecture in ['resnet50', 'resnet101'], 'architecture must be resnet50 or resnet101!'
 24 |         self.architecture = architecture
 25 |         self.construct_graph(input_tensor)
 26 |         
 27 |     def construct_graph(self, input_tensor, stage5=True):
 28 |         assert self.input_tensor is not None, "input_tensor can not be none!"
 29 |         # Stage 1
 30 |         x = KL.ZeroPadding2D((3, 3))(input_tensor)
 31 |         x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x)
 32 |         x = BatchNorm(axis=3, name='bn_conv1')(x)
 33 |         x = KL.Activation('relu')(x)
 34 |         C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
 35 |         # Stage 2
 36 |         x = self.conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
 37 |         x = self.identity_block(x, 3, [64, 64, 256], stage=2, block='b')
 38 |         C2 = x = self.identity_block(x, 3, [64, 64, 256], stage=2, block='c')
 39 |         # Stage 3
 40 |         x = self.conv_block(x, 3, [128, 128, 512], stage=3, block='a')
 41 |         x = self.identity_block(x, 3, [128, 128, 512], stage=3, block='b')
 42 |         x = self.identity_block(x, 3, [128, 128, 512], stage=3, block='c')
 43 |         C3 = x = self.identity_block(x, 3, [128, 128, 512], stage=3, block='d')
 44 |         # Stage 4
 45 |         x = self.conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
 46 |         block_count = {"resnet50": 5, "resnet101": 22}[self.architecture]
 47 |         for i in range(block_count):
 48 |             x = self.identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i))
 49 |         C4 = x
 50 |         # Stage 5
 51 |         x = self.conv_block(x, 3, [256, 256, 256], stage=5, block='a', dilated=2, strides=(1, 1))
 52 |         x = self.identity_block(x, 3, [256, 256, 256], stage=5, block='b', dilated=2)
 53 |         C5 = x = self.identity_block(x, 3, [256, 256, 256], stage=5, block='c', dilated=2)
 54 |         # Stage 6
 55 |         x = self.conv_block(x, 3, [256, 256, 256], stage=6, block='a', dilated=2, strides=(1, 1))
 56 |         x = self.identity_block(x, 3, [256, 256, 256], stage=6, block='b', dilated=2)
 57 |         C6 = x = self.identity_block(x, 3, [256, 256, 256], stage=6, block='c', dilated=2)
 58 | 
 59 |         P6 = KL.Conv2D(256, (1, 1), name='fpn_c6p6')(C6)
 60 |         P5 = KL.Add(name="fpn_p5add")([P6, KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5)])
 61 |         P4 = KL.Add(name="fpn_p4add")([P5, KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4)])
 62 |         P3 = KL.Add(name="fpn_p3add")([
 63 |             KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
 64 |             KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3)])
 65 |         P2 = KL.Add(name="fpn_p2add")([
 66 |             KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
 67 |             KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2)])
 68 | 
 69 |         # Attach 3x3 conv to all P layers to get the final feature maps.
 70 |         P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2)
 71 |         P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3)
 72 |         P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4)
 73 |         P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5)
 74 |         # P6 is used for the 5th anchor scale in RPN. Generated by
 75 |         # subsampling from P5 with stride of 2.
 76 |         P6 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p6")(P6)
 77 | 
 78 |         self.output_layers = [P2, P3, P4, P5, P6]
 79 | 
 80 |     def conv_block(self, input_tensor, kernel_size, filters, stage, block,
 81 |                strides=(2, 2), use_bias=True, dilated=1):
 82 |         """conv_block is the block that has a conv layer at shortcut
 83 |         # Arguments
 84 |             input_tensor: input tensor
 85 |             kernel_size: defualt 3, the kernel size of middle conv layer at main path
 86 |             filters: list of integers, the nb_filters of 3 conv layer at main path
 87 |             stage: integer, current stage label, used for generating layer names
 88 |             block: 'a','b'..., current block label, used for generating layer names
 89 |         Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
 90 |         And the shortcut should have subsample=(2,2) as well
 91 |         """
 92 |         nb_filter1, nb_filter2, nb_filter3 = filters
 93 |         conv_name_base = 'res' + str(stage) + block + '_branch'
 94 |         bn_name_base = 'bn' + str(stage) + block + '_branch'
 95 | 
 96 |         x = KL.Conv2D(nb_filter1, (1, 1), strides=strides,
 97 |                       name=conv_name_base + '2a', use_bias=use_bias)(input_tensor)
 98 |         x = BatchNorm(axis=3, name=bn_name_base + '2a')(x)
 99 |         x = KL.Activation('relu')(x)
100 | 
101 |         x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
102 |                       name=conv_name_base + '2b', use_bias=use_bias, dilation_rate=dilated)(x)
103 |         x = BatchNorm(axis=3, name=bn_name_base + '2b')(x)
104 |         x = KL.Activation('relu')(x)
105 | 
106 |         x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base +
107 |                       '2c', use_bias=use_bias)(x)
108 |         x = BatchNorm(axis=3, name=bn_name_base + '2c')(x)
109 | 
110 |         shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides,
111 |                              name=conv_name_base + '1', use_bias=use_bias)(input_tensor)
112 |         shortcut = BatchNorm(axis=3, name=bn_name_base + '1')(shortcut)
113 | 
114 |         x = KL.Add()([x, shortcut])
115 |         x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
116 |         return x
117 | 
118 |     def identity_block(self, input_tensor, kernel_size, filters, stage, block,
119 |                    use_bias=True, dilated=1):
120 |         """The identity_block is the block that has no conv layer at shortcut
121 |         # Arguments
122 |             input_tensor: input tensor
123 |             kernel_size: defualt 3, the kernel size of middle conv layer at main path
124 |             filters: list of integers, the nb_filters of 3 conv layer at main path
125 |             stage: integer, current stage label, used for generating layer names
126 |             block: 'a','b'..., current block label, used for generating layer names
127 |         """
128 |         nb_filter1, nb_filter2, nb_filter3 = filters
129 |         conv_name_base = 'res' + str(stage) + block + '_branch'
130 |         bn_name_base = 'bn' + str(stage) + block + '_branch'
131 | 
132 |         x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a',
133 |                       use_bias=use_bias)(input_tensor)
134 |         x = BatchNorm(axis=3, name=bn_name_base + '2a')(x)
135 |         x = KL.Activation('relu')(x)
136 | 
137 |         x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
138 |                       name=conv_name_base + '2b', use_bias=use_bias, dilation_rate=dilated)(x)
139 |         x = BatchNorm(axis=3, name=bn_name_base + '2b')(x)
140 |         x = KL.Activation('relu')(x)
141 | 
142 |         x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c',
143 |                       use_bias=use_bias)(x)
144 |         x = BatchNorm(axis=3, name=bn_name_base + '2c')(x)
145 | 
146 |         x = KL.Add()([x, input_tensor])
147 |         x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
148 |         return x
149 | 
150 | class BatchNorm(KL.BatchNormalization):
151 |     """Batch Normalization class. Subclasses the Keras BN class and
152 |     hardcodes training=False so the BN layer doesn't update
153 |     during training.
154 | 
155 |     Batch normalization has a negative effect on training if batches are small
156 |     so we disable it here.
157 |     """
158 | 
159 |     def call(self, inputs, training=None):
160 |         return super(self.__class__, self).call(inputs, training=False)


--------------------------------------------------------------------------------
/KerasRFCN/Utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras RFCN
  3 | Copyright (c) 2018
  4 | Licensed under the MIT License (see LICENSE for details)
  5 | Written by parap1uie-s@github.com
  6 | """
  7 | 
  8 | import sys
  9 | import os
 10 | import math
 11 | import random
 12 | import numpy as np
 13 | import tensorflow as tf
 14 | import scipy.misc
 15 | import skimage.color
 16 | import skimage.io
 17 | import urllib.request
 18 | import shutil
 19 | 
 20 | ############################################################
 21 | #  Bounding Boxes
 22 | ############################################################
 23 | 
 24 | # def extract_bboxes(mask):
 25 | #     """Compute bounding boxes.
 26 | #     mask: [height, width, num_instances]. Mask pixels are either 1 or 0.
 27 | 
 28 | #     Returns: bbox array [num_instances, (y1, x1, y2, x2)].
 29 | #     """
 30 | #     boxes = np.zeros([mask.shape[-1], 4], dtype=np.int32)
 31 | #     for i in range(mask.shape[-1]):
 32 | #         m = mask[:, :, i]
 33 | #         # Bounding box.
 34 | #         horizontal_indicies = np.where(np.any(m, axis=0))[0]
 35 | #         vertical_indicies = np.where(np.any(m, axis=1))[0]
 36 | #         if horizontal_indicies.shape[0]:
 37 | #             x1, x2 = horizontal_indicies[[0, -1]]
 38 | #             y1, y2 = vertical_indicies[[0, -1]]
 39 | #             # x2 and y2 should not be part of the box. Increment by 1.
 40 | #             x2 += 1
 41 | #             y2 += 1
 42 | #         else:
 43 | #             # No mask for this instance. Might happen due to
 44 | #             # resizing or cropping. Set bbox to zeros
 45 | #             x1, x2, y1, y2 = 0, 0, 0, 0
 46 | #         boxes[i] = np.array([y1, x1, y2, x2])
 47 | #     return boxes.astype(np.int32)
 48 | 
 49 | 
 50 | def compute_iou(box, boxes, box_area, boxes_area):
 51 |     """Calculates IoU of the given box with the array of the given boxes.
 52 |     box: 1D vector [y1, x1, y2, x2]
 53 |     boxes: [boxes_count, (y1, x1, y2, x2)]
 54 |     box_area: float. the area of 'box'
 55 |     boxes_area: array of length boxes_count.
 56 | 
 57 |     Note: the areas are passed in rather than calculated here for
 58 |           efficency. Calculate once in the caller to avoid duplicate work.
 59 |     """
 60 |     # Calculate intersection areas
 61 |     y1 = np.maximum(box[0], boxes[:, 0])
 62 |     y2 = np.minimum(box[2], boxes[:, 2])
 63 |     x1 = np.maximum(box[1], boxes[:, 1])
 64 |     x2 = np.minimum(box[3], boxes[:, 3])
 65 |     intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
 66 |     union = box_area + boxes_area[:] - intersection[:]
 67 |     iou = intersection / union
 68 |     return iou
 69 | 
 70 | 
 71 | def compute_overlaps(boxes1, boxes2):
 72 |     """Computes IoU overlaps between two sets of boxes.
 73 |     boxes1, boxes2: [N, (y1, x1, y2, x2)].
 74 | 
 75 |     For better performance, pass the largest set first and the smaller second.
 76 |     """
 77 |     # Areas of anchors and GT boxes
 78 |     area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
 79 |     area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
 80 | 
 81 |     # Compute overlaps to generate matrix [boxes1 count, boxes2 count]
 82 |     # Each cell contains the IoU value.
 83 |     overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0]))
 84 |     for i in range(overlaps.shape[1]):
 85 |         box2 = boxes2[i]
 86 |         overlaps[:, i] = compute_iou(box2, boxes1, area2[i], area1)
 87 |     return overlaps
 88 | 
 89 | 
 90 | def non_max_suppression(boxes, scores, threshold):
 91 |     """Performs non-maximum supression and returns indicies of kept boxes.
 92 |     boxes: [N, (y1, x1, y2, x2)]. Notice that (y2, x2) lays outside the box.
 93 |     scores: 1-D array of box scores.
 94 |     threshold: Float. IoU threshold to use for filtering.
 95 |     """
 96 |     assert boxes.shape[0] > 0
 97 |     if boxes.dtype.kind != "f":
 98 |         boxes = boxes.astype(np.float32)
 99 | 
100 |     # Compute box areas
101 |     y1 = boxes[:, 0]
102 |     x1 = boxes[:, 1]
103 |     y2 = boxes[:, 2]
104 |     x2 = boxes[:, 3]
105 |     area = (y2 - y1) * (x2 - x1)
106 | 
107 |     # Get indicies of boxes sorted by scores (highest first)
108 |     ixs = scores.argsort()[::-1]
109 | 
110 |     pick = []
111 |     while len(ixs) > 0:
112 |         # Pick top box and add its index to the list
113 |         i = ixs[0]
114 |         pick.append(i)
115 |         # Compute IoU of the picked box with the rest
116 |         iou = compute_iou(boxes[i], boxes[ixs[1:]], area[i], area[ixs[1:]])
117 |         # Identify boxes with IoU over the threshold. This
118 |         # returns indicies into ixs[1:], so add 1 to get
119 |         # indicies into ixs.
120 |         remove_ixs = np.where(iou > threshold)[0] + 1
121 |         # Remove indicies of the picked and overlapped boxes.
122 |         ixs = np.delete(ixs, remove_ixs)
123 |         ixs = np.delete(ixs, 0)
124 |     return np.array(pick, dtype=np.int32)
125 | 
126 | 
127 | def apply_box_deltas(boxes, deltas):
128 |     """Applies the given deltas to the given boxes.
129 |     boxes: [N, (y1, x1, y2, x2)]. Note that (y2, x2) is outside the box.
130 |     deltas: [N, (dy, dx, log(dh), log(dw))]
131 |     """
132 |     boxes = boxes.astype(np.float32)
133 |     # Convert to y, x, h, w
134 |     height = boxes[:, 2] - boxes[:, 0]
135 |     width = boxes[:, 3] - boxes[:, 1]
136 |     center_y = boxes[:, 0] + 0.5 * height
137 |     center_x = boxes[:, 1] + 0.5 * width
138 |     # Apply deltas
139 |     center_y += deltas[:, 0] * height
140 |     center_x += deltas[:, 1] * width
141 |     height *= np.exp(deltas[:, 2])
142 |     width *= np.exp(deltas[:, 3])
143 |     # Convert back to y1, x1, y2, x2
144 |     y1 = center_y - 0.5 * height
145 |     x1 = center_x - 0.5 * width
146 |     y2 = y1 + height
147 |     x2 = x1 + width
148 |     return np.stack([y1, x1, y2, x2], axis=1)
149 | 
150 | 
151 | def box_refinement_graph(box, gt_box):
152 |     """Compute refinement needed to transform box to gt_box.
153 |     box and gt_box are [N, (y1, x1, y2, x2)]
154 |     """
155 |     box = tf.cast(box, tf.float32)
156 |     gt_box = tf.cast(gt_box, tf.float32)
157 | 
158 |     height = box[:, 2] - box[:, 0]
159 |     width = box[:, 3] - box[:, 1]
160 |     center_y = box[:, 0] + 0.5 * height
161 |     center_x = box[:, 1] + 0.5 * width
162 | 
163 |     gt_height = gt_box[:, 2] - gt_box[:, 0]
164 |     gt_width = gt_box[:, 3] - gt_box[:, 1]
165 |     gt_center_y = gt_box[:, 0] + 0.5 * gt_height
166 |     gt_center_x = gt_box[:, 1] + 0.5 * gt_width
167 | 
168 |     dy = (gt_center_y - center_y) / height
169 |     dx = (gt_center_x - center_x) / width
170 |     dh = tf.log(gt_height / height)
171 |     dw = tf.log(gt_width / width)
172 | 
173 |     result = tf.stack([dy, dx, dh, dw], axis=1)
174 |     return result
175 | 
176 | 
177 | def box_refinement(box, gt_box):
178 |     """Compute refinement needed to transform box to gt_box.
179 |     box and gt_box are [N, (y1, x1, y2, x2)]. (y2, x2) is
180 |     assumed to be outside the box.
181 |     """
182 |     box = box.astype(np.float32)
183 |     gt_box = gt_box.astype(np.float32)
184 | 
185 |     height = box[:, 2] - box[:, 0]
186 |     width = box[:, 3] - box[:, 1]
187 |     center_y = box[:, 0] + 0.5 * height
188 |     center_x = box[:, 1] + 0.5 * width
189 | 
190 |     gt_height = gt_box[:, 2] - gt_box[:, 0]
191 |     gt_width = gt_box[:, 3] - gt_box[:, 1]
192 |     gt_center_y = gt_box[:, 0] + 0.5 * gt_height
193 |     gt_center_x = gt_box[:, 1] + 0.5 * gt_width
194 | 
195 |     dy = (gt_center_y - center_y) / height
196 |     dx = (gt_center_x - center_x) / width
197 |     dh = np.log(gt_height / height)
198 |     dw = np.log(gt_width / width)
199 | 
200 |     return np.stack([dy, dx, dh, dw], axis=1)
201 | 
202 | 
203 | ############################################################
204 | #  Dataset
205 | ############################################################
206 | 
207 | class Dataset(object):
208 |     """The base class for dataset classes.
209 |     To use it, create a new class that adds functions specific to the dataset
210 |     you want to use. For example:
211 | 
212 |     class CatsAndDogsDataset(Dataset):
213 |         def load_cats_and_dogs(self):
214 |             ...
215 |         def load_bbox(self, image_id):
216 |             ...
217 |         def image_reference(self, image_id):
218 |             ...
219 | 
220 |     See COCODataset and ShapesDataset as examples.
221 |     """
222 | 
223 |     def __init__(self, class_map=None):
224 |         self._image_ids = []
225 |         self.image_info = []
226 |         # Background is always the first class
227 |         self.class_info = [{"source": "", "id": 0, "name": "BG"}]
228 |         self.source_class_ids = {}
229 | 
230 |     def add_class(self, source, class_id, class_name):
231 |         assert "." not in source, "Source name cannot contain a dot"
232 |         # Does the class exist already?
233 |         for info in self.class_info:
234 |             if info['source'] == source and info["id"] == class_id:
235 |                 # source.class_id combination already available, skip
236 |                 return
237 |         # Add the class
238 |         self.class_info.append({
239 |             "source": source,
240 |             "id": class_id,
241 |             "name": class_name,
242 |         })
243 | 
244 |     def add_image(self, source, image_id, path, **kwargs):
245 |         image_info = {
246 |             "id": image_id,
247 |             "source": source,
248 |             "path": path,
249 |         }
250 |         image_info.update(kwargs)
251 |         self.image_info.append(image_info)
252 | 
253 |     def image_reference(self, image_id):
254 |         """Return a link to the image in its source Website or details about
255 |         the image that help looking it up or debugging it.
256 | 
257 |         Override for your dataset, but pass to this function
258 |         if you encounter images not in your dataset.
259 |         """
260 |         return ""
261 | 
262 |     def prepare(self, class_map=None):
263 |         """Prepares the Dataset class for use.
264 | 
265 |         TODO: class map is not supported yet. When done, it should handle mapping
266 |               classes from different datasets to the same class ID.
267 |         """
268 |         def clean_name(name):
269 |             """Returns a shorter version of object names for cleaner display."""
270 |             return ",".join(name.split(",")[:1])
271 | 
272 |         # Build (or rebuild) everything else from the info dicts.
273 |         self.num_classes = len(self.class_info)
274 |         self.class_ids = np.arange(self.num_classes)
275 |         self.class_names = [clean_name(c["name"]) for c in self.class_info]
276 |         self.num_images = len(self.image_info)
277 |         self._image_ids = np.arange(self.num_images)
278 | 
279 |         self.class_from_source_map = {"{}.{}".format(info['source'], info['id']): id
280 |                                       for info, id in zip(self.class_info, self.class_ids)}
281 | 
282 |         # Map sources to class_ids they support
283 |         self.sources = list(set([i['source'] for i in self.class_info]))
284 |         self.source_class_ids = {}
285 |         # Loop over datasets
286 |         for source in self.sources:
287 |             self.source_class_ids[source] = []
288 |             # Find classes that belong to this dataset
289 |             for i, info in enumerate(self.class_info):
290 |                 # Include BG class in all datasets
291 |                 if i == 0 or source == info['source']:
292 |                     self.source_class_ids[source].append(i)
293 | 
294 |     def map_source_class_id(self, source_class_id):
295 |         """Takes a source class ID and returns the int class ID assigned to it.
296 | 
297 |         For example:
298 |         dataset.map_source_class_id("coco.12") -> 23
299 |         """
300 |         return self.class_from_source_map[source_class_id]
301 | 
302 |     def get_source_class_id(self, class_id, source):
303 |         """Map an internal class ID to the corresponding class ID in the source dataset."""
304 |         info = self.class_info[class_id]
305 |         assert info['source'] == source
306 |         return info['id']
307 | 
308 |     def append_data(self, class_info, image_info):
309 |         self.external_to_class_id = {}
310 |         for i, c in enumerate(self.class_info):
311 |             for ds, id in c["map"]:
312 |                 self.external_to_class_id[ds + str(id)] = i
313 | 
314 |         # Map external image IDs to internal ones.
315 |         self.external_to_image_id = {}
316 |         for i, info in enumerate(self.image_info):
317 |             self.external_to_image_id[info["ds"] + str(info["id"])] = i
318 | 
319 |     @property
320 |     def image_ids(self):
321 |         return self._image_ids
322 | 
323 |     def source_image_link(self, image_id):
324 |         """Returns the path or URL to the image.
325 |         Override this to return a URL to the image if it's availble online for easy
326 |         debugging.
327 |         """
328 |         return self.image_info[image_id]["path"]
329 | 
330 |     def load_image(self, image_id):
331 |         """Load the specified image and return a [H,W,3] Numpy array.
332 |         """
333 |         # Load image
334 |         image = skimage.io.imread(self.image_info[image_id]['path'])
335 |         # If grayscale. Convert to RGB for consistency.
336 |         if image.ndim != 3:
337 |             image = skimage.color.gray2rgb(image)
338 |         return image
339 | 
340 |     def load_bbox(self, image_id):
341 |         """Load instance bbox for the given image.
342 | 
343 |         Different datasets use different ways to store bbox. Override this
344 |         method to load instance bbox and return them in the form of am
345 |         array of binary bbox of shape [height, width, instances].
346 | 
347 |         Returns:
348 |             bbox: A bool array of shape [height, width, instance count] with
349 |                 a binary bbox per instance.
350 |             class_ids: a 1D array of class IDs of the instance bbox.
351 |         """
352 |         # Override this function to load a bbox from your dataset.
353 |         # Otherwise, it returns an empty bbox.
354 |         bbox = np.empty([0, 0, 0])
355 |         class_ids = np.empty([0], np.int32)
356 |         return bbox, class_ids
357 | 
358 | 
359 | def resize_image(image, min_dim=None, max_dim=None, padding=False):
360 |     """
361 |     Resizes an image keeping the aspect ratio.
362 | 
363 |     min_dim: if provided, resizes the image such that it's smaller
364 |         dimension == min_dim
365 |     max_dim: if provided, ensures that the image longest side doesn't
366 |         exceed this value.
367 |     padding: If true, pads image with zeros so it's size is max_dim x max_dim
368 | 
369 |     Returns:
370 |     image: the resized image
371 |     window: (y1, x1, y2, x2). If max_dim is provided, padding might
372 |         be inserted in the returned image. If so, this window is the
373 |         coordinates of the image part of the full image (excluding
374 |         the padding). The x2, y2 pixels are not included.
375 |     scale: The scale factor used to resize the image
376 |     padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
377 |     """
378 |     # Default window (y1, x1, y2, x2) and default scale == 1.
379 |     h, w = image.shape[:2]
380 |     window = (0, 0, h, w)
381 |     scale = 1
382 | 
383 |     # Scale?
384 |     if min_dim:
385 |         # Scale up but not down
386 |         scale = max(1, min_dim / min(h, w))
387 |     # Does it exceed max dim?
388 |     if max_dim:
389 |         image_max = max(h, w)
390 |         if round(image_max * scale) > max_dim:
391 |             scale = max_dim / image_max
392 |     # Resize image and mask
393 |     if scale != 1:
394 |         image = scipy.misc.imresize(
395 |             image, (round(h * scale), round(w * scale)))
396 |     # Need padding?
397 |     if padding:
398 |         # Get new height and width
399 |         h, w = image.shape[:2]
400 |         top_pad = (max_dim - h) // 2
401 |         bottom_pad = max_dim - h - top_pad
402 |         left_pad = (max_dim - w) // 2
403 |         right_pad = max_dim - w - left_pad
404 |         padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
405 |         image = np.pad(image, padding, mode='constant', constant_values=0)
406 |         window = (top_pad, left_pad, h + top_pad, w + left_pad)
407 |     return image, window, scale, padding
408 | 
409 | 
410 | def resize_bbox(boxes, scale, padding):
411 |     """Resizes a bbox using the given scale and padding.
412 |     Typically, you get the scale and padding from resize_image() to
413 |     ensure both, the image and the bbox, are resized consistently.
414 | 
415 |     scale: bbox scaling factor
416 |     padding: Padding to add to the bbox in the form
417 |             [(top, bottom), (left, right), (0, 0)]
418 |     """
419 |     top_pad = padding[0][0]
420 |     left_pad = padding[1][0]
421 | 
422 |     resized_boxes = []
423 |     for box in boxes:
424 |         temp_new_box = box * scale
425 |         y1 = temp_new_box[0] + top_pad
426 |         x1 = temp_new_box[1] + left_pad
427 |         y2 = temp_new_box[2] + top_pad
428 |         x2 = temp_new_box[3] + left_pad
429 |         resized_boxes.append((y1,x1,y2,x2))
430 |     return np.array(resized_boxes)
431 | 
432 | ############################################################
433 | #  Anchors
434 | ############################################################
435 | 
436 | def generate_anchors(scales, ratios, shape, feature_stride, anchor_stride):
437 |     """
438 |     scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
439 |     ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
440 |     shape: [height, width] spatial shape of the feature map over which
441 |             to generate anchors.
442 |     feature_stride: Stride of the feature map relative to the image in pixels.
443 |     anchor_stride: Stride of anchors on the feature map. For example, if the
444 |         value is 2 then generate anchors for every other feature map pixel.
445 |     """
446 |     # Get all combinations of scales and ratios
447 |     scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
448 |     scales = scales.flatten()
449 |     ratios = ratios.flatten()
450 | 
451 |     # Enumerate heights and widths from scales and ratios
452 |     heights = scales / np.sqrt(ratios)
453 |     widths = scales * np.sqrt(ratios)
454 | 
455 |     # Enumerate shifts in feature space
456 |     shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
457 |     shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
458 |     shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
459 | 
460 |     # Enumerate combinations of shifts, widths, and heights
461 |     box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
462 |     box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
463 | 
464 |     # Reshape to get a list of (y, x) and a list of (h, w)
465 |     box_centers = np.stack(
466 |         [box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
467 |     box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
468 | 
469 |     # Convert to corner coordinates (y1, x1, y2, x2)
470 |     boxes = np.concatenate([box_centers - 0.5 * box_sizes,
471 |                             box_centers + 0.5 * box_sizes], axis=1)
472 |     return boxes
473 | 
474 | 
475 | def generate_pyramid_anchors(scales, ratios, feature_shapes, feature_strides,
476 |                              anchor_stride):
477 |     """Generate anchors at different levels of a feature pyramid. Each scale
478 |     is associated with a level of the pyramid, but each ratio is used in
479 |     all levels of the pyramid.
480 | 
481 |     Returns:
482 |     anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
483 |         with the same order of the given scales. So, anchors of scale[0] come
484 |         first, then anchors of scale[1], and so on.
485 |     """
486 |     # Anchors
487 |     # [anchor_count, (y1, x1, y2, x2)]
488 |     anchors = []
489 |     for i in range(len(scales)):
490 |         anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i],
491 |                                         feature_strides[i], anchor_stride))
492 |     return np.concatenate(anchors, axis=0)
493 | 
494 | 
495 | ############################################################
496 | #  Miscellaneous
497 | ############################################################
498 | 
499 | def trim_zeros(x):
500 |     """It's common to have tensors larger than the available data and
501 |     pad with zeros. This function removes rows that are all zeros.
502 | 
503 |     x: [rows, columns].
504 |     """
505 |     assert len(x.shape) == 2
506 |     return x[~np.all(x == 0, axis=1)]
507 | 
508 | 
509 | def compute_ap(gt_boxes, gt_class_ids,
510 |                pred_boxes, pred_class_ids, pred_scores,
511 |                iou_threshold=0.5):
512 |     """Compute Average Precision at a set IoU threshold (default 0.5).
513 | 
514 |     Returns:
515 |     mAP: Mean Average Precision
516 |     precisions: List of precisions at different class score thresholds.
517 |     recalls: List of recall values at different class score thresholds.
518 |     overlaps: [pred_boxes, gt_boxes] IoU overlaps.
519 |     """
520 |     # Trim zero padding and sort predictions by score from high to low
521 |     # TODO: cleaner to do zero unpadding upstream
522 |     gt_boxes = trim_zeros(gt_boxes)
523 |     pred_boxes = trim_zeros(pred_boxes)
524 |     pred_scores = pred_scores[:pred_boxes.shape[0]]
525 |     indices = np.argsort(pred_scores)[::-1]
526 |     pred_boxes = pred_boxes[indices]
527 |     pred_class_ids = pred_class_ids[indices]
528 |     pred_scores = pred_scores[indices]
529 | 
530 |     # Compute IoU overlaps [pred_boxes, gt_boxes]
531 |     overlaps = compute_overlaps(pred_boxes, gt_boxes)
532 | 
533 |     # Loop through ground truth boxes and find matching predictions
534 |     match_count = 0
535 |     pred_match = np.zeros([pred_boxes.shape[0]])
536 |     gt_match = np.zeros([gt_boxes.shape[0]])
537 |     for i in range(len(pred_boxes)):
538 |         # Find best matching ground truth box
539 |         sorted_ixs = np.argsort(overlaps[i])[::-1]
540 |         for j in sorted_ixs:
541 |             # If ground truth box is already matched, go to next one
542 |             if gt_match[j] == 1:
543 |                 continue
544 |             # If we reach IoU smaller than the threshold, end the loop
545 |             iou = overlaps[i, j]
546 |             if iou < iou_threshold:
547 |                 break
548 |             # Do we have a match?
549 |             if pred_class_ids[i] == gt_class_ids[j]:
550 |                 match_count += 1
551 |                 gt_match[j] = 1
552 |                 pred_match[i] = 1
553 |                 break
554 | 
555 |     # Compute precision and recall at each prediction box step
556 |     precisions = np.cumsum(pred_match) / (np.arange(len(pred_match)) + 1)
557 |     recalls = np.cumsum(pred_match).astype(np.float32) / len(gt_match)
558 | 
559 |     # Pad with start and end values to simplify the math
560 |     precisions = np.concatenate([[0], precisions, [0]])
561 |     recalls = np.concatenate([[0], recalls, [1]])
562 | 
563 |     # Ensure precision values decrease but don't increase. This way, the
564 |     # precision value at each recall threshold is the maximum it can be
565 |     # for all following recall thresholds, as specified by the VOC paper.
566 |     for i in range(len(precisions) - 2, -1, -1):
567 |         precisions[i] = np.maximum(precisions[i], precisions[i + 1])
568 | 
569 |     # Compute mean AP over recall range
570 |     indices = np.where(recalls[:-1] != recalls[1:])[0] + 1
571 |     mAP = np.sum((recalls[indices] - recalls[indices - 1]) *
572 |                  precisions[indices])
573 | 
574 |     return mAP, precisions, recalls, overlaps
575 | 
576 | 
577 | def compute_recall(pred_boxes, gt_boxes, iou):
578 |     """Compute the recall at the given IoU threshold. It's an indication
579 |     of how many GT boxes were found by the given prediction boxes.
580 | 
581 |     pred_boxes: [N, (y1, x1, y2, x2)] in image coordinates
582 |     gt_boxes: [N, (y1, x1, y2, x2)] in image coordinates
583 |     """
584 |     # Measure overlaps
585 |     overlaps = compute_overlaps(pred_boxes, gt_boxes)
586 |     iou_max = np.max(overlaps, axis=1)
587 |     iou_argmax = np.argmax(overlaps, axis=1)
588 |     positive_ids = np.where(iou_max >= iou)[0]
589 |     matched_gt_boxes = iou_argmax[positive_ids]
590 | 
591 |     recall = len(set(matched_gt_boxes)) / gt_boxes.shape[0]
592 |     return recall, positive_ids
593 | 
594 | 
595 | # ## Batch Slicing
596 | # Some custom layers support a batch size of 1 only, and require a lot of work
597 | # to support batches greater than 1. This function slices an input tensor
598 | # across the batch dimension and feeds batches of size 1. Effectively,
599 | # an easy way to support batches > 1 quickly with little code modification.
600 | # In the long run, it's more efficient to modify the code to support large
601 | # batches and getting rid of this function. Consider this a temporary solution
602 | def batch_slice(inputs, graph_fn, batch_size, names=None):
603 |     """Splits inputs into slices and feeds each slice to a copy of the given
604 |     computation graph and then combines the results. It allows you to run a
605 |     graph on a batch of inputs even if the graph is written to support one
606 |     instance only.
607 | 
608 |     inputs: list of tensors. All must have the same first dimension length
609 |     graph_fn: A function that returns a TF tensor that's part of a graph.
610 |     batch_size: number of slices to divide the data into.
611 |     names: If provided, assigns names to the resulting tensors.
612 |     """
613 |     if not isinstance(inputs, list):
614 |         inputs = [inputs]
615 | 
616 |     outputs = []
617 |     for i in range(batch_size):
618 |         inputs_slice = [x[i] for x in inputs]
619 |         output_slice = graph_fn(*inputs_slice)
620 |         if not isinstance(output_slice, (tuple, list)):
621 |             output_slice = [output_slice]
622 |         outputs.append(output_slice)
623 |     # Change outputs from a list of slices where each is
624 |     # a list of outputs to a list of outputs and each has
625 |     # a list of slices
626 |     outputs = list(zip(*outputs))
627 | 
628 |     if names is None:
629 |         names = [None] * len(outputs)
630 | 
631 |     result = [tf.stack(o, axis=0, name=n)
632 |               for o, n in zip(outputs, names)]
633 |     if len(result) == 1:
634 |         result = result[0]
635 | 
636 |     return result
637 | 
638 | ############################################################
639 | #  Data Formatting
640 | ############################################################
641 | 
642 | def compose_image_meta(image_id, image_shape, window, active_class_ids):
643 |     """Takes attributes of an image and puts them in one 1D array. Use
644 |     parse_image_meta() to parse the values back.
645 | 
646 |     image_id: An int ID of the image. Useful for debugging.
647 |     image_shape: [height, width, channels]
648 |     window: (y1, x1, y2, x2) in pixels. The area of the image where the real
649 |             image is (excluding the padding)
650 |     active_class_ids: List of class_ids available in the dataset from which
651 |         the image came. Useful if training on images from multiple datasets
652 |         where not all classes are present in all datasets.
653 |     """
654 |     meta = np.array(
655 |         [image_id] +            # size=1
656 |         list(image_shape) +     # size=3
657 |         list(window) +          # size=4 (y1, x1, y2, x2) in image cooredinates
658 |         list(active_class_ids)  # size=num_classes
659 |     )
660 |     return meta
661 | 
662 | 
663 | # Two functions (for Numpy and TF) to parse image_meta tensors.
664 | def parse_image_meta(meta):
665 |     """Parses an image info Numpy array to its components.
666 |     See compose_image_meta() for more details.
667 |     """
668 |     image_id = meta[:, 0]
669 |     image_shape = meta[:, 1:4]
670 |     window = meta[:, 4:8]   # (y1, x1, y2, x2) window of image in in pixels
671 |     active_class_ids = meta[:, 8:]
672 |     return image_id, image_shape, window, active_class_ids
673 | 
674 | 
675 | def parse_image_meta_graph(meta):
676 |     """Parses a tensor that contains image attributes to its components.
677 |     See compose_image_meta() for more details.
678 | 
679 |     meta: [batch, meta length] where meta length depends on NUM_CLASSES
680 |     """
681 |     image_id = meta[:, 0]
682 |     image_shape = meta[:, 1:4]
683 |     window = meta[:, 4:8]
684 |     active_class_ids = meta[:, 8:]
685 |     return [image_id, image_shape, window, active_class_ids]
686 | 
687 | 
688 | def mold_image(images, config):
689 |     """Takes RGB images with 0-255 values and subtraces
690 |     the mean pixel and converts it to float. Expects image
691 |     colors in RGB order.
692 |     """
693 |     return images.astype(np.float32) - config.MEAN_PIXEL
694 | 
695 | 
696 | def unmold_image(normalized_images, config):
697 |     """Takes a image normalized with mold() and returns the original."""
698 |     return (normalized_images + config.MEAN_PIXEL).astype(np.uint8)


--------------------------------------------------------------------------------
/KerasRFCN/__init__.py:
--------------------------------------------------------------------------------
1 | # __init.py__
2 | __all__ = ['Config', 'Data_generator', 'Losses', 'Utils','Model.BaseModel','Model.Model','Model.ResNet_dilated','Model.ResNet']


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 parap1uie-s
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Keras-RFCN
 2 | RFCN implement based on Keras&amp;Tensorflow
 3 | 
 4 | This is an implementation of [Region-based Fully Convolutional Networks](https://arxiv.org/pdf/1605.06409v2.pdf) on Python 3, Keras, and TensorFlow. The model generates bounding boxes for each instance of an object in the image. It's based on Feature Pyramid Network (FPN) and a [ResNet50](https://arxiv.org/abs/1512.03385) or ResNet101 backbone.
 5 | 
 6 | The repository includes:
 7 | 
 8 | * Source code of RFCN built on FPN and ResNet50/101.
 9 | * Training code for [DeepFashion Dataset](http://mmlab.ie.cuhk.edu.hk/projects/DeepFashion.html) with 46 clothes classes.
10 | * Pre-trained weights for [DeepFashion Dataset](http://mmlab.ie.cuhk.edu.hk/projects/DeepFashion.html) - See release
11 | * Example of training on your own dataset&nbsp;-&nbsp;see Fashion_Train.py and Fashion_Test.py
12 | 
13 | 
14 | # Introduction
15 | 
16 | Thanks to the [Mask-RCNN implement by matterport](https://github.com/matterport/Mask_RCNN), we have the great framework so that we don't have the needs to generate bounding box and implement the Non-Maximum-Suppression algorithm.
17 | 
18 | If you are already fimilar with matterport's framework, this repository is easy to understand and use. What I have done is remove the mask head in the framework, which makes it be a Faster-RCNN, and implement a position sensitive ROI pooling layer and a VOTE layer. For more details, please read the [paper](https://arxiv.org/pdf/1605.06409v2.pdf).
19 | 
20 | ![position sensitive ROI](ReadmeImages/1.png)
21 | 
22 | # Getting Started
23 | 
24 | ## Train on your own dataset
25 | 
26 | As you can see in **Fashion_Train.py**, for a specific dataset, all you need is listed below:
27 | 
28 | * A **XXConfig** class inherit from the **Config** base class in the framework.
29 | * A **XXDataset** class inherit from the **Utils.Dataset**.
30 | * Implement **load_image(image_id)** which returns a numpy array I with I.shape == (Img_h, Img_w, 3) for the specific image id. You don't have the needs to resize image, or take data augment. The framework will do all of them automatically. Just keep the data what it is.
31 | * Implement **load_bbox(image_id)** which returns a tuple with 2 numpy array for the specific image id, the first one is boundbox coordinates (y<sub>1</sub>,x<sub>1</sub>,y<sub>2</sub>,x<sub>2</sub>), and class ID for the boundbox in order.
32 | * Define your training schedule.
33 | 
34 | ## Predict on your own dataset
35 | 
36 | See **Fashion_Test.py** as a demo.
37 | More details in writing.
38 | 
39 | # Framework
40 | 
41 | This RFCN framework consists of FIVE parts:
42 | 
43 | * Config.py - the base class of config for a specific dataset.
44 | * Data_generator.py - generate data for a object detection model, like background box, and true box. The images are resized in this script.
45 | * Losses.py - define the l1_smooth loss for box regression and cross-entropy loss for box classification.
46 | * Utils.py - all the auxiliary functions, like compute_iou, non-maximum-suppression, etc.
47 | * BaseModel.py - the base class of our keras model, contains the auxiliary functions for the model, like load weights, save checkpoints, decode the training schedule.
48 | * **Model.py** - the CORE script of our framework, contains the RPN, ScoreMap, position sensitive ROI pooling layer, etc.
49 | * ResNet.py - The resnet backbone, you can choose resnet50 or resnet101.
50 | * ResNet_dilated.py - Take the stages 4+ have a dilate ratio = 2, named DetNet, which is the latest research results in this [paper](https://arxiv.org/abs/1804.06215).
51 | 
52 | # Experiment Result
53 | 
54 | To make sure the framework work normally, we have trained the model 240 epochs with DeepFashion dataset. And the detection result might useful:
55 | 
56 | <figure class="half">
57 | 	<img src="ReadmeImages/result_1.jpg" width="300">
58 | 	<img src="ReadmeImages/result_2.jpg" width="300">
59 | </figure>
60 | 
61 | # TODO
62 | 
63 | * ~~Complete the load_weights function of model.(Done)~~
64 | * Add the callback for eval mAP after each batch end
65 | * Train on MSCOCO


--------------------------------------------------------------------------------
/ReadmeImages/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parap1uie-s/Keras-RFCN/5466d0da653041cf2e1ae74d26fb4c126a95330d/ReadmeImages/1.png


--------------------------------------------------------------------------------
/ReadmeImages/result_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parap1uie-s/Keras-RFCN/5466d0da653041cf2e1ae74d26fb4c126a95330d/ReadmeImages/result_1.jpg


--------------------------------------------------------------------------------
/ReadmeImages/result_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parap1uie-s/Keras-RFCN/5466d0da653041cf2e1ae74d26fb4c126a95330d/ReadmeImages/result_2.jpg


--------------------------------------------------------------------------------
/data.pk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parap1uie-s/Keras-RFCN/5466d0da653041cf2e1ae74d26fb4c126a95330d/data.pk


--------------------------------------------------------------------------------