├── test
    ├── __init__.py
    ├── frcnn_test
    │   ├── __init__.py
    │   └── util_test
    │   │   ├── __init__.py
    │   │   ├── test_bbox.py
    │   │   └── test_anchor.py
    └── generate_random_bbox.py
├── xrcnn
    ├── __init__.py
    ├── util
    │   ├── __init__.py
    │   ├── misc.py
    │   ├── log.py
    │   ├── image.py
    │   ├── voc_dataset.py
    │   ├── bbox.py
    │   ├── coco_dataset.py
    │   └── anchor.py
    ├── batchnorm.py
    ├── roi_align_layer.py
    ├── config.py
    ├── region_proposal_layer.py
    ├── loss.py
    ├── frcnn.py
    └── mrcnn.py
├── model.png
├── resource
    ├── result_01.png
    ├── result_02.png
    ├── result_03.png
    ├── result_04.png
    ├── result_05.png
    ├── result_06.png
    ├── result_07.png
    ├── result_08.png
    ├── result_09.png
    ├── result_10.png
    ├── result_11.png
    ├── result_12.png
    ├── result_13.png
    └── network_summary.png
├── LICENSE
├── .gitignore
├── image_test.py
├── confirm_gt_anchor.py
├── README.md
├── train_mrcnn.py
└── predict_mrcnn.py


/test/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/xrcnn/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/xrcnn/util/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/test/frcnn_test/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/test/frcnn_test/util_test/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/model.png


--------------------------------------------------------------------------------
/resource/result_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_01.png


--------------------------------------------------------------------------------
/resource/result_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_02.png


--------------------------------------------------------------------------------
/resource/result_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_03.png


--------------------------------------------------------------------------------
/resource/result_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_04.png


--------------------------------------------------------------------------------
/resource/result_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_05.png


--------------------------------------------------------------------------------
/resource/result_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_06.png


--------------------------------------------------------------------------------
/resource/result_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_07.png


--------------------------------------------------------------------------------
/resource/result_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_08.png


--------------------------------------------------------------------------------
/resource/result_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_09.png


--------------------------------------------------------------------------------
/resource/result_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_10.png


--------------------------------------------------------------------------------
/resource/result_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_11.png


--------------------------------------------------------------------------------
/resource/result_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_12.png


--------------------------------------------------------------------------------
/resource/result_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/result_13.png


--------------------------------------------------------------------------------
/resource/network_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shtamura/maskrcnn/HEAD/resource/network_summary.png


--------------------------------------------------------------------------------
/xrcnn/util/misc.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
 3 | 
 4 | def stack_each_number(max_num, stack_count):
 5 |     """
 6 |         0からmax_num-1までの整数を数字毎にstack_count回積み上げたリストを得る。
 7 |         例:
 8 |             max_num=3
 9 |             stack_count=2
10 |             return=[0,0,1,1,2,2]
11 |     """
12 |     v = tf.range(max_num)
13 |     v = tf.reshape(v, [-1, 1])
14 |     v = tf.tile(v, [1, stack_count])
15 |     v = tf.reshape(v, [-1])
16 |     return v
17 | 


--------------------------------------------------------------------------------
/xrcnn/batchnorm.py:
--------------------------------------------------------------------------------
 1 | import keras.layers as KL
 2 | 
 3 | 
 4 | class BatchNorm(KL.BatchNormalization):
 5 |     # https://github.com/matterport/Mask_RCNN/
 6 |     # より。
 7 |     """Batch Normalization class. Subclasses the Keras BN class and
 8 |     hardcodes training=False so the BN layer doesn't update
 9 |     during training.
10 | 
11 |     Batch normalization has a negative effect on training if batches are small
12 |     so we disable it here.
13 |     """
14 | 
15 |     def call(self, inputs, training=None):
16 |         return super(self.__class__, self).call(inputs, training=False)
17 | 


--------------------------------------------------------------------------------
/xrcnn/util/log.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import tensorflow as tf
 3 | 
 4 | stop = False
 5 | out_name_pattern = ".*"
 6 | 
 7 | 
 8 | def tfprint(tensor, prefix=None, summarize=256):
 9 |     """tf.Printのショートカットメソッド
10 |         Return:
11 |             tf.Print(tensor, [tensor],・・・・)
12 |             の復帰値をそのまま返す。
13 |     """
14 |     if stop:
15 |         # stopならprint無し
16 |         return tensor
17 | 
18 |     if prefix is None:
19 |         prefix = tensor.name
20 | 
21 |     if not re.match(out_name_pattern, prefix):
22 |         return tensor
23 | 
24 |     return tf.Print(tensor, [tensor], prefix + ": ",
25 |                     summarize=summarize)
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Shoichi Tamura
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/test/generate_random_bbox.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from keras import backend as K
 3 | 
 4 | 
 5 | def generate_random_bbox(n, img_size, min_length, max_length):
 6 |     """Generate valid bounding boxes with random position and shape.
 7 | 
 8 |     Args:
 9 |         n (int): The number of bounding boxes.
10 |         img_size (tuple): A tuple of length 2. The height and the width
11 |             of the image on which bounding boxes locate.
12 |         min_length (float): The minimum length of edges of bounding boxes.
13 |         max_length (float): The maximum length of edges of bounding boxes.
14 | 
15 |     Return:
16 |         Keras.variable:
17 |         Coordinates of bounding boxes. Its shape is :math:`(R, 4)`. \
18 |         Here, :math:`R` equals :obj:`n`.
19 |         The second axis contains :math:`y_{min}, x_{min}, y_{max}, x_{max}`,
20 |         where
21 |         :math:`min\_length \\leq y_{max} - y_{min} < max\_length`.
22 |         and
23 |         :math:`min\_length \\leq x_{max} - x_{min} < max\_length`
24 | 
25 |     """
26 |     H, W = img_size
27 |     y_min = np.random.uniform(0, H - max_length, size=(n,))
28 |     x_min = np.random.uniform(0, W - max_length, size=(n,))
29 |     y_max = y_min + np.random.uniform(min_length, max_length, size=(n,))
30 |     x_max = x_min + np.random.uniform(min_length, max_length, size=(n,))
31 |     bbox = np.stack((y_min, x_min, y_max, x_max), axis=1).astype(np.float32)
32 |     return K.variable(bbox)
33 | 


--------------------------------------------------------------------------------
/test/frcnn_test/util_test/test_bbox.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from keras import backend as K
 4 | import numpy as np
 5 | import xrcnn.util.bbox as bbox
 6 | from test.generate_random_bbox import generate_random_bbox
 7 | 
 8 | 
 9 | class TestBbox(unittest.TestCase):
10 |     def setUp(self):
11 |         self.src_bbox = generate_random_bbox(8, (64, 32), 4, 16)
12 |         self.dst_bbox = self.src_bbox + 1
13 | 
14 |     def test_restore_bbox(self):
15 |         offset = bbox.get_offset(self.src_bbox, self.dst_bbox)
16 |         out_raw_bbox = bbox.get_bbox(self.src_bbox, offset)
17 | 
18 |         np.testing.assert_almost_equal(
19 |             K.get_value(out_raw_bbox), K.get_value(self.dst_bbox), decimal=5)
20 | 
21 |     def test_get_iou(self):
22 |         gtbox = K.variable([[1, 1, 3, 3], [2, 2, 4, 4]])
23 |         anchor = K.variable([
24 |             [1, 1, 3, 3],  # gtbox[0]とは完全に一致。つまりIoU=1。
25 |             # gtbox[1]とは1/4重なる。つまりIoU=1/7。
26 |             [1, 0, 3, 2],  # gtbox[0]とは半分重なる。つまりIoU=1/3。
27 |             [2, 2, 4, 4],  # gtbox[0]とは1/4重なる。つまりIoU=1/7。gtbox[1]とは一致。
28 |             [0, 3, 2, 5],  # gtbox[0]とは隣接。
29 |             [4, 3, 6, 5],  # gtbox[0]とは接点無し。
30 |         ])
31 |         expected = np.array([
32 |             [1, 1 / 7],
33 |             [1 / 3, 0],
34 |             [1 / 7, 1],
35 |             [0, 0],
36 |             [0, 0],
37 |         ])
38 |         iou = K.get_value(bbox.get_iou(anchor, gtbox))
39 |         np.testing.assert_almost_equal(iou, expected, decimal=5)
40 | 


--------------------------------------------------------------------------------
/test/frcnn_test/util_test/test_anchor.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import math
 3 | 
 4 | import numpy as np
 5 | import xrcnn.util.anchor as anchor
 6 | 
 7 | from xrcnn import config
 8 | 
 9 | 
class TestAnchor(unittest.TestCase):
    """Tests for anchor generation and GT offset assignment."""

    def test_generate_anchors(self):
        """Anchor centers must match the expected per-cell centers."""
        generator = anchor.Anchor(config.Config())
        anchors = generator._generate_anchors((2, 2))

        # Center = first corner + half of the box extent.
        first_corner = anchors[:, :2]
        second_corner = anchors[:, 2:]
        centers = (second_corner - first_corner) / 2 + first_corner

        # Each of the four centers is expected 9 times in a row
        # (presumably one per anchor shape of the default Config — confirm).
        cell_centers = np.array([[8, 8], [8, 24], [24, 8], [24, 24]])
        expected_centers = np.repeat(cell_centers, 9, axis=0)
        np.testing.assert_almost_equal(centers, expected_centers, decimal=5)

    def test_generate_gt_offsets(self):
        """Shifting the first GT box changes the number of positive anchors."""
        root2 = math.sqrt(2)
        conf = config.Config()
        conf.anchor_box_aspect_ratios = [
            (1. / root2, 2. / root2),
            (1., 1.),
            (2. / root2, 1. / root2)]
        conf.anchor_box_scales = [4, 8, 16]
        conf.backbone_shape = [64, 64]
        conf.stride_per_base_nn_feature = 2
        anc = anchor.Anchor(conf)

        # The two box sets differ only in their first box.
        boxes_a = np.array([[1,  1,  5,  5], [1,  3,  9,  10],
                            [3,  6,  12,  12], [9, 9, 13, 13]])
        boxes_b = np.array([[2,  2,  6,  6], [1,  3,  9,  10],
                            [3,  6,  12,  12], [9, 9, 13, 13]])

        # Only the class output is checked; the offsets are discarded.
        _, clazz = anc.generate_gt_offsets(
            boxes_a, (16, 16), n_max_sample=128)
        _, clazz2 = anc.generate_gt_offsets(
            boxes_b, (16, 16), n_max_sample=128)

        self.assertEqual(np.count_nonzero(clazz >= 1), 5)
        self.assertEqual(np.count_nonzero(clazz2 >= 1), 4)
41 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | .static_storage/
 57 | .media/
 58 | local_settings.py
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # Environments
 86 | .env
 87 | .venv
 88 | env/
 89 | venv/
 90 | ENV/
 91 | env.bak/
 92 | venv.bak/
 93 | 
 94 | # Spyder project settings
 95 | .spyderproject
 96 | .spyproject
 97 | 
 98 | # Rope project settings
 99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/
106 | 
107 | .tags*
108 | 
109 | # add
110 | tb_log
111 | out
112 | 


--------------------------------------------------------------------------------
/image_test.py:
--------------------------------------------------------------------------------
 1 | if __name__ == '__main__':
 2 |     import argparse
 3 |     import matplotlib.pyplot as plt
 4 |     import matplotlib.patches as patches
 5 |     import xrcnn.util.dataset as dataset
 6 |     import xrcnn.util.image as FI
 7 |     from xrcnn.config import Config
 8 | 
 9 |     config = Config()
10 | 
11 |     def add_rect(dest_ax, bbox):
12 |         rect = patches.Rectangle((bbox[1], bbox[0]),
13 |                                  bbox[3] - bbox[1], bbox[2] - bbox[0],
14 |                                  linewidth=1, edgecolor='r', facecolor='none',)
15 |         dest_ax.add_patch(rect)
16 | 
17 |     argparser = argparse.ArgumentParser()
18 |     argparser.add_argument('--path', type=str,
19 |                            required=True)
20 |     args = argparser.parse_args()
21 | 
22 |     images, _ = dataset.load_pascal_voc_traindata(args.path, 4)
23 |     for image in images:
24 |         # original
25 |         img = FI.load_image_as_ndarray(image['image_path'])
26 |         fig, ax = plt.subplots(1)
27 |         ax.imshow(img)
28 |         for obj in image['objects']:
29 |             bbox = obj['bbox']
30 |             add_rect(ax, bbox)
31 |         plt.show()
32 |         plt.close()
33 | 
34 |         # resize
35 |         resized_image, window, scale = FI.resize_with_padding(
36 |             img, config.image_min_size, config.image_max_size)
37 |         fig, ax = plt.subplots(1)
38 |         ax.imshow(resized_image)
39 |         resized_bbox = []
40 |         for obj in image['objects']:
41 |             bbox = obj['bbox']
42 |             resized = FI.resize_bbox(bbox, window[:2], scale)
43 |             resized_bbox.append(resized)
44 |             add_rect(ax, resized)
45 |         plt.show()
46 |         plt.close()
47 | 
48 |         # flip
49 |         flipped_image, x_flip, y_flip = FI.random_flip(resized_image)
50 |         fig, ax = plt.subplots(1)
51 |         ax.imshow(flipped_image)
52 |         for bbox in resized_bbox:
53 |             flipped = FI.flip_bbox(bbox,
54 |                                    flipped_image.shape[:2], x_flip, y_flip)
55 |             add_rect(ax, flipped)
56 |         plt.show()
57 |         plt.close()
58 | 


--------------------------------------------------------------------------------
/xrcnn/roi_align_layer.py:
--------------------------------------------------------------------------------
 1 | from keras import layers as KL
 2 | from keras import backend as K
 3 | import tensorflow as tf
 4 | import logging
 5 | 
 6 | from xrcnn.util import log
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
class RoiAlignLayer(KL.Layer):
    """Perform RoI Align.

    Applies the RoIs to the feature map and converts every RoI into a
    fixed-size feature map.

    Args:
        out_shape: [height, width] of every cropped feature map. Lists and
            tuples are both accepted.
        config: config; only ``config.batch_size`` is read here.

    Inputs:
        features: output feature map of the backbone
            (output of the VGG16 convolution layers, up to just before the
            5th pooling). Should be (N, 64, 64, 512) for an input image size
            of 1024.
        rois: RoI
            (N, n_rois, 4)
            The last axis holds coordinates normalized to 0..1
            (y1,x1,y2,x2)

    Outputs:
        (N, n_rois, config.roi_align_out_size,
            config.roi_align_out_size, channels)
    """

    def __init__(self, out_shape, config, **kwargs):
        super(RoiAlignLayer, self).__init__(**kwargs)
        # call() builds the reshape target by list concatenation, so keep a
        # list even when the caller passes a tuple (list + tuple would raise
        # TypeError).
        self.out_shape = list(out_shape)
        self.batch_size = config.batch_size

    def call(self, inputs):
        """Crop and resize ``features`` for every RoI in ``rois``.

        Args:
            inputs: [features, rois] as described on the class.

        Returns:
            Tensor reshaped to
            (batch_size, n_rois, out_shape[0], out_shape[1], channels).
        """
        features = inputs[0]
        rois = inputs[1]
        n_roi_boxes = K.shape(rois)[1]

        # rois also contains [0, 0, 0, 0] boxes; they are processed as they
        # are so that every batch entry keeps the same number of elements.

        # Preparation for crop_and_resize:
        # flatten rois over axis 0 (the batch axis).
        roi_unstack = K.concatenate(tf.unstack(rois), axis=0)
        # Index of the batch entry each flattened RoI belongs to.
        batch_pos = K.flatten(
            K.repeat(K.reshape(K.arange(self.batch_size), [-1, 1]),
                     n_roi_boxes))
        # crop_and_resize is used in place of RoI Align; it appears to do
        # bilinear interpolation internally, so the algorithm should be
        # roughly the same.
        crop_boxes = tf.image.crop_and_resize(features,
                                              roi_unstack, batch_pos,
                                              self.out_shape)

        # Convert
        # (N * n_rois, out_size, out_size, channels)
        # to
        # (N, n_rois, out_size, out_size, channels)
        crop_boxes = K.reshape(crop_boxes,
                               [self.batch_size, n_roi_boxes]
                               + self.out_shape + [-1])
        # tfprint returns the tensor wrapped in a print op; the print only
        # becomes part of the graph if that returned tensor is the one used.
        crop_boxes = log.tfprint(crop_boxes, "crop_boxes: ")
        return crop_boxes

    def compute_output_shape(self, input_shape):
        """Return (N, n_rois, out_height, out_width, channels).

        N and n_rois come from the rois shape (``input_shape[1]``), channels
        from the last axis of the features shape (``input_shape[0]``).
        """
        return (input_shape[1][0], input_shape[1][1],
                self.out_shape[0], self.out_shape[1], input_shape[0][-1])
70 | 


--------------------------------------------------------------------------------
/confirm_gt_anchor.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import logging
 3 | import numpy as np
 4 | import xrcnn.util.anchor as anchor
 5 | import xrcnn.util.bbox as bbox
 6 | import xrcnn.config as config
 7 | import xrcnn.util.dataset as dataset
 8 | import matplotlib.pyplot as plt
 9 | import matplotlib.patches as patches
10 | import cv2
11 | 
# Log line layout: timestamp, level, thread id, message.
FORMAT = '%(asctime)-15s %(levelname)s #[%(thread)d] %(message)s'
logging.basicConfig(format=FORMAT, level=logging.DEBUG)

logger = logging.getLogger(__name__)
logger.info("---start---")

# Command line: --path is the directory holding the VOC dataset, --prefix is
# handed to the data generator as is.
argparser = argparse.ArgumentParser()
argparser.add_argument('--path', type=str,
                       required=True, help="VOCデータセットが配置してあるディレクトリ")
argparser.add_argument('--prefix', type=str,
                       required=True)
args = argparser.parse_args()

conf = config.Config()
anc = anchor.Anchor(conf)

# Take only the first item produced by the training data generator.
di = dataset.pascal_voc_data_generator(args.path, anc, conf, train_val='train',
                                       n_max=3, prefix=args.prefix)
data, _ = next(di)
# print(data)

# Per the notes at the end of this script, data is presumably
# [images, rpn_offsets, rpn_fbs, bboxes, labels] — confirm against the
# generator.
img = data[0]
rpn_offset = data[1]
# Scale the offsets in place by bbox_refinement_std (presumably undoing a
# normalization applied by the generator — confirm).
rpn_offset *= np.array(conf.bbox_refinement_std)
rpn_fbs = data[2]
# Positions where rpn_fbs equals 1; pos_idx[0] is used below as the first
# index into rpn_offset and pos_idx[1] as the anchor index.
pos_idx = np.where(rpn_fbs == 1)
pos_anchor = anc.anchors[pos_idx[1]]
pos_offset = rpn_offset[pos_idx[0], pos_idx[1]]
# Apply the offsets to the selected anchors to get boxes, then truncate to
# integers for drawing.
box = bbox.get_bbox(pos_anchor, pos_offset)
box = box.astype('int32')
print(box)
# Drop the leading axis (np.squeeze raises unless it has length 1).
img = np.squeeze(img, axis=0)
print(img.shape)

fig, ax = plt.subplots(1)
ax.imshow(img)
48 | 
49 | 
def add_rect(dest_ax, bbox):
    """Draw ``bbox`` on ``dest_ax`` as a red, unfilled rectangle.

    Args:
        dest_ax: matplotlib axes the patch is added to.
        bbox: box indexed as (y1, x1, y2, x2).
    """
    y1, x1, y2, x2 = bbox[0], bbox[1], bbox[2], bbox[3]
    # Rectangle takes the (x, y) anchor corner first, then width and height.
    outline = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                linewidth=1, edgecolor='r', facecolor='none',)
    dest_ax.add_patch(outline)
55 | 
56 | 
# Overlay every box computed above on the image and show the figure;
# plt.show() blocks until the window is closed.
for b in box:
    add_rect(ax, b)
plt.show()
plt.close()
61 | 
62 | 
def add_rect_cv(dest_img, box, color):
    """Draw ``box`` onto ``dest_img`` in place with OpenCV.

    Args:
        dest_img: image array that cv2.rectangle draws into.
        box: box indexed as (y1, x1, y2, x2).
        color: color passed to cv2.rectangle unchanged.
    """
    # OpenCV points are (x, y), hence the swapped indices.
    first_corner = (box[1], box[0])
    opposite_corner = (box[3], box[2])
    cv2.rectangle(dest_img, first_corner, opposite_corner, color)
67 | 
68 | 
# Draw every box directly into the image array and write it to disk.
# The enumerate() index was never used, so iterate over the boxes directly.
for b in box:
    add_rect_cv(img, b, (255, 0, 0))

cv2.imwrite('./check_gt.png', img)
73 | 
74 | 
75 | #  確認
76 | # get_bboxする値の尺度がGTと予測結果でズレてそう。。。
77 | #   →これは大丈夫だった。。。子要素しか取得しないのでOK
78 | # XMLファイルにある矩形以上の情報が得られている。。。
79 | #
80 | #  GTの値に全てのBBOXが含まれていない？2つあるはずのGTBBOXが1つになっている。。。
81 | #  data_generator
82 | #   矩形がネストしているケースあり。。。
83 | #       人->顔、手、脚、的な。。。
84 | #   座標がおかしい。。。x1=x2、y1=y2になっている。。。
85 | # [np.array(images), np.array(rpn_offsets),
86 | #        np.array(rpn_fbs), np.array(bboxes),
87 | #        np.array(labels)], []
88 | # ↑確認済み
89 | #
90 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # MaskRCNN
  2 | 物体検出、セグメンテーションの手法である MaskRCNN を Keras で実装しています。   
  3 | 
  4 | MaskRCNN は ICCV'17 Best Paper に選出された手法です。  
  5 | （参考資料[^1]より。ICCV＝International Conference on Computer Vision）  
  6 | MaskRCNN は元となる FasterRCNN にインスタンス検出ネットワーク(mask head)を追加した以下のようなネットワーク構造になります。  
  7 | ![](resource/network_summary.png)
  8 | MaskRCNN論文より抜粋[^2]  
  9 | 
*class box* が FasterRCNN のクラス識別、バウンディングボックス検出を行うネットワークで、その下の畳込み2層（本実装では4層+逆畳み込み1層としている）がインスタンス検出ネットワークです。
 11 | 
 12 | # 環境
 13 | - Python 3
 14 | - TensorFlow 1.4 (TensorFlow-gpu)
 15 | - Keras 2.1
 16 | - OpenCV
 17 | - COCO API
 18 | 
 19 | ## GCPのDatalabインスタンスの場合
 20 | 当方ではGCPのDatalabインスタンスを利用し検証しました。
 21 | その際のパッケージインストールコマンドは以下の通りです。
 22 | 参考まで。
 23 | ```
 24 | apt-get update 
 25 | apt-get install -y --allow-unauthenticated graphviz 
 26 | apt-get install -y --allow-unauthenticated python-opencv 
 27 | apt-get install -y --allow-unauthenticated vim 
 28 | apt-get install -y --allow-unauthenticated python3 
 29 | apt-get install -y --allow-unauthenticated python3-pip 
 30 | apt-get install -y --allow-unauthenticated python3-tk 
 31 | pip3 install --upgrade pip 
 32 | pip3 install cython 
 33 | pip3 install numpy 
 34 | pip3 install tensorflow-gpu 
 35 | pip3 install scikit-image 
 36 | pip3 install scikit-learn 
 37 | pip3 install keras 
 38 | pip3 install h5py 
 39 | pip3 install pydot 
 40 | pip3 install pydot3 
 41 | pip3 install pydot-ng 
 42 | pip3 install graphviz 
 43 | pip3 install opencv-python 
 44 | ```
 45 | 
 46 | # 学習に利用したデータセット
 47 | COCO  
 48 | http://cocodataset.org/  
 49 | ```
 50 | mkdir /path/to/dataset
 51 | cd /path/to/dataset
 52 | wget http://images.cocodataset.org/zips/train2017.zip
 53 | wget http://images.cocodataset.org/zips/val2017.zip
 54 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
 55 | unzip train2017.zip
 56 | unzip val2017.zip
 57 | unzip annotations_trainval2017.zip
 58 | ```
 59 | 
 60 | # 論文と異なる点、制約事項
学習時間、コスト削減のため、以下のような制限を設けた。  
 62 | (GPUインスタンス費用がきついため。。。)
 63 | - バックボーンネットワークにはKerasの学習済みのVGGを利用。
 64 |   - 論文ではResNetかFPN。
 65 | - 検出オブジェクトを 人(ラベル:person) に限定。
 66 | - 入力画像サイズを224＊224ピクセルに縮小。
 67 |   - 学習済みVGGの入力サイズに合わせる。
 68 | - 高さ、または幅が23ピクセル以下のオブジェクトは学習対象外。
 69 | 
 70 | # 使い方
 71 | ## 学習
 72 | 3ステージに分けて学習する。
 73 | - stage1  
 74 | RPNのみの学習。
 75 | ```
 76 | python3 train_mrcnn.py --data_path /path/to/dataset --stage 1
 77 | ```
 78 | - stage2  
 79 | Headのみの学習。
 80 | ```
 81 | python3 train_mrcnn.py --weights_path ./model/maskrcnn.h5 --data_path /path/to/dataset --stage 2
 82 | ```
 83 | - stage3  
 84 | RPN+Headの学習。
 85 | ```
 86 | python3 train_mrcnn.py --weights_path ./model/maskrcnn.h5 --data_path /path/to/dataset --stage 3
 87 | ```
 88 | 
 89 | train_mrcnn.pyで指定する各学習のイテレーションは少なめなので、実行環境や許容されるコストに合わせて調整してください。
 90 | 
 91 | 
 92 | ## テスト
 93 | ```
 94 | python3 predict_mrcnn.py --weights_path ./model/maskrcnn.h5 --input_path /path/to/testdata
 95 | ```
 96 | 
 97 | ### 結果
 98 | 以下のように学習した結果。
 99 | - stage1: 10万イテレーション
100 | - stage2: 4万イテレーション
101 | - stage3: 無し
102 | 
103 | ![](resource/result_02.png)
104 | ![](resource/result_03.png)
105 | ![](resource/result_05.png)
106 | ![](resource/result_06.png)
107 | ![](resource/result_07.png)
108 | ![](resource/result_09.png)
109 | ![](resource/result_12.png)
110 | ![](resource/result_13.png)
111 | 
112 | # 課題
113 | - マスクの精度が低い。
114 |   - 更なる学習が必要。
115 | - 画像の周辺部の検出精度が低い？
116 |   - 周辺部のアンカーを残す？
117 | - サイズの小さいオブジェクトの検出
118 |   - 除外条件「高さ、**または**幅が23ピクセル以下」の影響か。
119 | ![](resource/result_01.png)
120 | ![](resource/result_04.png)
121 | ![](resource/result_08.png)
122 | ![](resource/result_10.png)
123 | ![](resource/result_11.png)
124 | 
125 | 
126 | # 参考資料
127 | - https://arxiv.org/abs/1703.06870  
128 | - https://engineer.dena.jp/2017/12/chainercvmask-r-cnn.html  
129 | - https://qiita.com/yu4u/items/5cbe9db166a5d72f9eb8
130 | - https://github.com/matterport/Mask_RCNN
131 | - https://github.com/chainer/chainercv
132 | 
133 | [^1]: https://engineer.dena.jp/2017/12/chainercvmask-r-cnn.html  
134 | [^2]: https://arxiv.org/abs/1703.06870  
135 | 


--------------------------------------------------------------------------------
/xrcnn/util/image.py:
--------------------------------------------------------------------------------
  1 | import skimage.io
  2 | import scipy.misc
  3 | import numpy as np
  4 | import random
  5 | 
  6 | from logging import getLogger
  7 | 
  8 | logger = getLogger(__name__)
  9 | 
 10 | 
 11 | def load_image_as_ndarray(image_path):
 12 |     """画像を読み込んで、[h,w,channel(RGB)]形式のnumpy配列として取得する。
 13 |         Args:
 14 |             image_path: 画像ファイルのパス
 15 |         Returns:
 16 |             [h,w,channel(RGB)]形式
 17 |     """
 18 |     image = skimage.io.imread(image_path)
 19 |     return image
 20 | 
 21 | 
 22 | def resize_with_padding(image_array, min_size, max_size):
 23 |     """アスペクト比を維持したままリサイズする。
 24 |     高さ、または幅の小さい方がmin_sizeとなるようリサイズする。
 25 |     リサイズの結果、高さ、または幅の大きい方がmax_sizeを超える場合は、高さ、または幅の大きい方をmax_sizeとする。
 26 |     リサイズ後画像を max_size*max_size の枠の中央に配置し、周辺を0でPaddingする。
 27 | 
 28 |     Args:
 29 |         image_array: [h,w,3]の配列
 30 |         min_size:
 31 |         max_size:
 32 | 
 33 |     Returns:
 34 |         resized_image: リサイズ後の画像
 35 |         window: (y1, x1, y2, x2). リサイズ後の画像が画像全体のどの位置にあるかを示す座標
 36 |         scale: 元画像に対してのスケール
 37 |     """
 38 |     h, w = image_array.shape[:2]
 39 |     window = (0, 0, h, w)
 40 |     scale = 1
 41 | 
 42 |     scale = max(1, min_size / min(h, w))
 43 | 
 44 |     # max_sizeを超えるないよう調整
 45 |     image_max = max(h, w)
 46 |     if round(image_max * scale) > max_size:
 47 |         scale = max_size / image_max
 48 | 
 49 |     if scale != 1:
 50 |         image_array = scipy.misc.imresize(image_array,
 51 |                                           (round(h * scale), round(w * scale)))
 52 |     # Padding
 53 |     h, w = image_array.shape[:2]
 54 |     top_pad = (max_size - h) // 2
 55 |     bottom_pad = max_size - h - top_pad
 56 |     left_pad = (max_size - w) // 2
 57 |     right_pad = max_size - w - left_pad
 58 |     padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
 59 |     image_array = np.pad(image_array, padding,
 60 |                          mode='constant', constant_values=0)
 61 |     window = (top_pad, left_pad, h + top_pad, w + left_pad)
 62 | 
 63 |     return image_array, window, scale
 64 | 
 65 | 
 66 | def resize_mask(mask, padding_top_left, scale):
 67 |     """
 68 |         Args:
 69 |             mask: バイナリマスク
 70 |                 [height, width]
 71 |     """
 72 |     # [height, width] -> [height, width, 3]
 73 |     mask = np.dstack([mask, mask, mask])
 74 |     mask, _, _ = resize_with_padding(mask, padding_top_left, scale)
 75 |     # [height, width, 3] -> [height, width]
 76 |     mask = np.reshape(mask[:, :, 0], mask.shape[:2])
 77 |     return mask
 78 | 
 79 | 
 80 | def resize_bbox(bbox, padding_top_left, scale):
 81 |     logger.debug("resize_bbox:in: %s %s %s",
 82 |                  bbox, padding_top_left, scale)
 83 |     # top_left(y, x)　をscaleだけ大きくした矩形に足すことでPadding分ずらす
 84 |     bbox = bbox * scale + np.tile(padding_top_left, 2)
 85 |     logger.debug("resize_bbox:out: %s", bbox)
 86 |     return bbox
 87 | 
 88 | 
 89 | def random_flip(image_array, force_flip=False):
 90 |     x_flip = random.choice([True, False]) | force_flip
 91 |     # 上下逆転は結果の精度を落とすっぽい。
 92 |     y_flip = False  # random.choice([True, False]) | force_flip
 93 | 
 94 |     img = image_array.copy()
 95 |     if y_flip:
 96 |         img = np.flip(img, axis=0)
 97 |     if x_flip:
 98 |         img = np.flip(img, axis=1)
 99 |     return img, x_flip, y_flip
100 | 
101 | 
def flip_mask(mask, x_flip, y_flip):
    """Mirror a copy of a binary mask along the requested axes.

    Args:
        mask: binary mask, [height, width].
        x_flip: when true, mirror left-right (axis 1).
        y_flip: when true, mirror top-bottom (axis 0).

    Returns:
        The flipped copy; the input is left untouched.
    """
    flipped = mask.copy()
    for wanted, axis in ((y_flip, 0), (x_flip, 1)):
        if wanted:
            flipped = np.flip(flipped, axis=axis)
    return flipped
114 | 
115 | 
def flip_bbox(bbox, image_size, x_flip, y_flip):
    """Mirror a box inside an image of the given size.

    Args:
        bbox: box indexed as [top, left, bottom, right].
        image_size: (height, width) of the image.
        x_flip: when true, mirror left-right.
        y_flip: when true, mirror top-bottom.

    Returns:
        A copy of ``bbox`` with the mirrored coordinates.
    """
    logger.debug("flip_bbox:in: %s %s %s %s",
                 bbox, image_size, x_flip, y_flip)
    height, width = image_size
    mirrored = bbox.copy()
    if y_flip:
        # The old bottom edge becomes the new top edge and vice versa.
        mirrored[0], mirrored[2] = height - bbox[2], height - bbox[0]
    if x_flip:
        # The old right edge becomes the new left edge and vice versa.
        mirrored[1], mirrored[3] = width - bbox[3], width - bbox[1]
    logger.debug("flip_bbox:out: %s", mirrored)
    return mirrored
132 | 


--------------------------------------------------------------------------------
/xrcnn/config.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | 
  4 | class Config:
  5 | 
  6 |     train_data_path = '../dataset/VOCdevkit/VOC2007'
  7 | 
  8 |     # data augumentation setting
  9 |     # 教師データ加工(data augmentation)の有無
 10 |     use_horizontal_flips = False
 11 |     use_vertical_flips = False
 12 |     rot_90 = False
 13 | 
 14 |     # anchor box ratios
 15 |     # アンカーボックスのアスペクト比
 16 |     # [(h, w), ...]
 17 |     # 面積は固定
 18 |     anchor_box_aspect_ratios = [
 19 |         (1. / math.sqrt(2), 2. / math.sqrt(2)),
 20 |         (1., 1.),
 21 |         (2. / math.sqrt(2), 1. / math.sqrt(2))
 22 |     ]
 23 | 
 24 |     # bboxに適用する精度向上の為のパラメータ
 25 |     # TODO 値の根拠について調査。ひとまずは参考とする実装にあるパラメータを指定する。
 26 |     # bbox_refinement_std = [1.0, 1.0, 1.0, 1.0]
 27 |     bbox_refinement_std = [0.1, 0.1, 0.2, 0.2]
 28 | 
 29 |     # 学習時に利用するオブジェクトの最大数（画像1つ当たり）
 30 |     n_max_gt_objects_per_image = 100
 31 | 
 32 |     # non-non_maximum_suppression(NMS)の閾値。
 33 |     nms_thresh = 0.7
 34 | 
 35 |     # NMS前にこの閾値まで領域数を削減する。
 36 |     # トレーニングモードで利用されるパラメータ。
 37 |     n_train_pre_nms = 12000
 38 | 
 39 |     # NMS後にこの閾値まで領域数を削減する。
 40 |     # トレーニングモードで利用されるパラメータ。
 41 |     n_train_post_nms = 2000
 42 | 
 43 |     # NMS前にこの閾値まで領域数を削減する。
 44 |     # テストモードで利用されるパラメータ。
 45 |     n_test_pre_nms = 6000
 46 | 
 47 |     # NMS後にこの閾値まで領域数を削減する。
 48 |     # テストモードで利用されるパラメータ。
 49 |     n_test_post_nms = 300
 50 | 
 51 |     # バッチサイズ
 52 |     # 予測時は1にすること。
 53 |     batch_size = 2
 54 | 
 55 |     # RoIAlignの出力サイズ
 56 |     roi_align_pool_shape = [7, 7]
 57 |     mask_roi_align_pool_shape = [14, 14]
 58 | 
 59 |     # データセットに含まれるラベルの種類（背景を示すラベルも含む）
 60 |     n_dataset_labels = 21
 61 | 
 62 |     # Trueであればトレーニングモード。
 63 |     training = True
 64 |     # rpn_only でRPNのみのトレーニング。
 65 |     # 全体でトレーニングするとRPNのOffsetの損失がNaNになるため、
 66 |     # まずはRPNからトレーニング。
 67 |     # training_mode = 'all' | 'rpn_only' | 'head_only'
 68 |     training_mode = 'all'
 69 | 
 70 |     # バックボーンネットワークの種類
 71 |     # vgg, resnet
 72 |     backbone_nn_type = 'vgg'
 73 | 
 74 |     # GPU利用の場合は0以上（利用可能なGPU数を指定）
 75 |     gpu_count = 0
 76 | 
 77 |     # NNトレーニング時の学習率
 78 |     learning_rate = 0.001
 79 | 
 80 |     # 検出時に行うnon-non_maximum_suppression(NMS)の閾値。
 81 |     # 同一のオブジェクトに対するbbox予測の重複を排除する。
 82 |     detect_nms_thresh = 0.3
 83 |     # 予測結果として採用するラベル確率
 84 |     detect_label_prob = 0.7
 85 |     # 予測結果として得られる件数
 86 |     detect_max_instances = 100
 87 | 
 88 |     def __init__(self):
 89 |         # Number of pixels per pixel on base network feature map
 90 |         # ベースネットワークの特徴マップ1ピクセル当たりの入力画像におけるピクセル数
 91 |         if self.backbone_nn_type == 'vgg':
 92 |             # VGG16をベースにするため16ピクセルになる（stride=2*2の畳み込みが4回行われるため、サイズが元の1/16になる。）
 93 |             self.stride_per_base_nn_feature = 16
 94 |             self.image_max_size = 224  # Kerasの学習済みモデルのInputに合わせて224
 95 |         else:
 96 |             # ResNet50をベースにするため場合は32ピクセルになる（stride=2*2の畳み込みが5回行われるため、サイズが元の1/32になる。）
 97 |             self.stride_per_base_nn_feature = 32
 98 |             self.image_max_size = 224  # Kerasの学習済みモデルのInputに合わせて224
 99 | 
100 |         # # リサイズ後の最小サイズ
101 |         # image_min_size = 600
102 |         # # リサイズ後の最大サイズ
103 |         # image_max_size = 1024
104 |         # imagenet画像でpretrainしたバックボーンネットワークを使う場合のサイズ
105 |         self.image_min_size = 150
106 | 
107 |         # 入力画像のサイズを基準としたアンカーボックスのピクセル数
108 |         # 縦横同一サイズ
109 |         # anchor_box_scales = [128, 256, 512]
110 |         # Kerasの学習済みモデル（入力が224*224）を使う場合のサイズ
111 |         self.anchor_box_scales = [32, 64, self.image_max_size // 2]
112 | 
113 |         self.n_anchor = len(self.anchor_box_scales) * \
114 |             len(self.anchor_box_aspect_ratios)
115 | 
116 |         # Input image size
117 |         # (h, w, 3)
118 |         self.image_shape = [self.image_max_size, self.image_max_size, 3]
119 |         # # backboneネットワークの出力サイズ
120 |         self.backbone_shape = [
121 |             int(math.ceil(self.image_shape[0] /
122 |                           self.stride_per_base_nn_feature)),
123 |             int(math.ceil(self.image_shape[1] /
124 |                           self.stride_per_base_nn_feature))
125 |         ]
126 | 
127 |         # 評価対象外にするBBOXのサイズ
128 |         # 特徴抽出が困難と思われる小さなBoxを除外する
129 |         self.ignore_box_size = self.image_max_size // 20
130 | 
131 |         # maskネットワークの出力サイズは入力時のプーリングサイズの2倍
132 |         self.mask_out_shape = [self.mask_roi_align_pool_shape[0] * 2,
133 |                                self.mask_roi_align_pool_shape[1] * 2]
134 | 


--------------------------------------------------------------------------------
/train_mrcnn.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import logging
  3 | import tensorflow as tf
  4 | from keras import backend as K
  5 | import keras.callbacks
  6 | from keras import utils
  7 | from xrcnn.config import Config
  8 | from xrcnn.mrcnn import MaskRCNN
  9 | from xrcnn.util.anchor import Anchor
 10 | import xrcnn.util.coco_dataset as coco_dataset
 11 | 
 12 | 
 13 | from tensorflow.python import debug as tf_debug
 14 | from xrcnn.util import log
 15 | 
 16 | 
 17 | def name_filter(datum, tensor):
 18 |     print(datum.tensor_name)
 19 |     return "sample_gt_mask" in datum.tensor_name
 20 | 
 21 | 
 22 | def set_debugger_session():
 23 |     sess = K.get_session()
 24 |     sess = tf_debug.LocalCLIDebugWrapperSession(sess)
 25 |     sess.add_tensor_filter('name_filter', name_filter)
 26 |     K.set_session(sess)
 27 | 
 28 | 
# Training entry point: configures logging and the TensorFlow session, parses
# the command line, builds the COCO generators and the model, then trains.

# Log record layout: timestamp, level, thread id, message.
FORMAT = '%(asctime)-15s %(levelname)s #[%(thread)d] %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)

logger = logging.getLogger(__name__)
logger.info("---start---")

# Create a TensorFlow session with the GPU option allow_growth enabled and
# hand it to Keras.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
session = tf.Session(config=config)
K.set_session(session)
# set_debugger_session()

# `config` is rebound here: from now on it is the project Config, not the
# TensorFlow ConfigProto created above.
config = Config()
anchor = Anchor(config)

argparser = argparse.ArgumentParser(description="FasterRCNNのトレーニング")
argparser.add_argument('--data_path', type=str,
                       required=True, help="COCOデータセットが配置してあるディレクトリ")
argparser.add_argument('--stage', type=int,
                       required=True,
                       help="トレーニングステージ.1:RPNのみ, 2:HEADのみ only, 3:両方")
argparser.add_argument('--max_sample', type=int,
                       required=False, help="利用するVOCデータの上限")
argparser.add_argument('--weights_path', type=str,
                       required=False, help="モデルの重みファイルのパス")
args = argparser.parse_args()

# Negative sample limits are turned into None.  A limit of 0 is falsy, so it
# does not enter the branch and is passed on unchanged.
n_max = args.max_sample
if n_max and n_max <= 0:
    n_max = None

print(args)

logger.info("use coco dataset.")
# Restrict the dataset to the 'person' category.
gen = coco_dataset.Generator(config, args.data_path,
                             data_type='train2017',
                             target_category_names=['person'])
train_data_generator = gen.generate(anchor, n_max=n_max)
labels = gen.get_labels()
# Same setup for the validation split; `gen` and `labels` are rebound, so
# the labels used below are the ones reported by the validation generator.
gen = coco_dataset.Generator(config, args.data_path,
                             data_type='val2017',
                             target_category_names=['person'])
val_data_generator = gen.generate(anchor, n_max=n_max)
labels = gen.get_labels()

# Override the defaults of Config for this training run.  Note that the
# anchors and both generators were created before these overrides.
config.n_dataset_labels = len(labels)
config.training = True
config.gpu_count = 1
config.batch_size = 2
config.learning_rate = 0.001
# --stage selects what is trained: 1 = RPN only, 2 = head only, anything
# else = both.
if args.stage == 1:
    config.training_mode = 'rpn_only'
    steps_per_epoch = 200
    epochs = 500
elif args.stage == 2:
    config.training_mode = 'head_only'
    steps_per_epoch = 100
    epochs = 500
else:
    config.training_mode = 'all'
    steps_per_epoch = 100
    epochs = 500
# Name pattern handed to the project's log module (names ending in "_loss").
log.out_name_pattern = ".+_loss$"

mrcnn = MaskRCNN(anchor.anchors, config)
model = mrcnn.compiled_model()
# NOTE(review): if summary() prints the table itself and returns None, as in
# Keras, this additionally prints "None" -- confirm.
print(model.summary())

# Optionally start from previously saved weights, matched by layer name.
if args.weights_path:
    model.load_weights(args.weights_path, by_name=True)

utils.plot_model(model, './model.png', True, True)

# List every layer with its trainable flag; for TimeDistributed wrappers the
# wrapped layer is reported instead of the wrapper.
for i, layer in enumerate(model.layers):
    if layer.__class__.__name__ == 'TimeDistributed':
        name = layer.layer.name
        trainable = layer.layer.trainable
    else:
        name = layer.name
        trainable = layer.trainable
    print('layer:', i, ':', name, trainable)

# Callbacks: stop on NaN, TensorBoard logging, weight checkpoints (best
# only) and a learning-rate reduction driven by val_loss.
# NOTE(review): the checkpoint and the final weights are written below
# ./model -- presumably the directory has to exist beforehand; confirm.
callbacks = [keras.callbacks.TerminateOnNaN(),
             keras.callbacks.TensorBoard(log_dir='./tb_log',
                                         histogram_freq=0,
                                         write_graph=True,
                                         write_images=False),
             keras.callbacks.ModelCheckpoint(filepath='./model/maskrcnn.h5',
                                             verbose=1,
                                             save_weights_only=True,
                                             save_best_only=True),
             keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                               verbose=1,
                                               factor=0.7,
                                               patience=10,
                                               min_lr=config.learning_rate
                                               / 30)]
model.fit_generator(train_data_generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=epochs,
                    verbose=1,
                    workers=4,
                    max_queue_size=10,
                    use_multiprocessing=True,
                    callbacks=callbacks,
                    validation_data=val_data_generator,
                    validation_steps=20)
# Always keep the weights of the last epoch next to the best checkpoint.
model.save_weights('./model/maskrcnn-latest.h5')
137 | 


--------------------------------------------------------------------------------
/xrcnn/region_proposal_layer.py:
--------------------------------------------------------------------------------
  1 | from keras import backend as K
  2 | from keras import layers as KL
  3 | import tensorflow as tf
  4 | import xrcnn.util.bbox as bbox
  5 | import logging
  6 | 
  7 | logger = logging.getLogger(__name__)
  8 | 
  9 | 
class RegionProposalLayer(KL.Layer):
    """Final layer of the RPN.

    Turns the offsets (rpn_offsets) and objectness scores (rpn_objects)
    produced by the preceding layers into region proposals for every item of
    the batch.

    Per batch item the config.n_train_pre_nms (config.n_test_pre_nms when not
    training) best-scoring candidates are kept, overlapping regions are
    removed by non-maximum suppression with config.nms_thresh as the IoU
    threshold (higher scores are preferred), and at most
    config.n_train_post_nms (config.n_test_post_nms) regions are returned.

    Inputs:
        feature_map: [N, c, h, w]  (inputs[0]; not read by call)
        rpn_offsets: [N, R, 4]
            The last axis is (dy, dx, dh, dw).
        rpn_objects: [N, R, 2]
            call reads index 1 of the last axis as the foreground score.
            NOTE(review): the original documentation stated [N, R, 1], which
            does not fit that indexing -- confirm against the RPN head.

        N: config.batch_size
        R: presumably the number of anchors, since the same indices are used
            to gather from self.anchors -- TODO confirm.

    Returns:
        Region proposals: [N, n_post_nms, (y1, x1, y2, x2)]
            Coordinates are the ones returned by bbox.normalize_bbox
            (normalised to 0-1 according to the original documentation).
    """

    def __init__(self,
                 anchors,
                 config,
                 **kwargs):
        """Store the anchors and copy the settings needed from config.

        Args:
            anchors: anchor boxes; call reads anchors.shape[0] as the number
                of anchors and reshapes them to 4 values per anchor.
            config: object providing image_shape, nms_thresh, training,
                bbox_refinement_std, batch_size and the n_*_pre_nms /
                n_*_post_nms limits.
            **kwargs: forwarded to the Keras Layer constructor.
        """
        super(RegionProposalLayer, self).__init__(**kwargs)
        self.input_h = config.image_shape[0]
        self.input_w = config.image_shape[1]
        self.anchors = anchors
        self.nms_thresh = config.nms_thresh
        self.training = config.training
        self.bbox_refinement_std = config.bbox_refinement_std
        # The pre/post NMS limits differ between training and test mode.
        if self.training:
            self.n_pre_nms = config.n_train_pre_nms
            self.n_post_nms = config.n_train_post_nms
        else:
            self.n_pre_nms = config.n_test_pre_nms
            self.n_post_nms = config.n_test_post_nms
        self.batch_size = config.batch_size

    def call(self, inputs):
        """Build the proposals; see the class docstring for the inputs.

        Returns:
            Tensor with one block of self.n_post_nms boxes per batch item
            (zero-padded by _nms when fewer boxes survive).
        """
        rpn_offsets = inputs[1]
        # Scaling taken over from existing implementations to improve accuracy.
        rpn_offsets *= self.bbox_refinement_std
        rpn_objects = inputs[2]
        fg_scores = rpn_objects[:, :, 1]
        n_anchors = self.anchors.shape[0]

        # Keep only the best-scoring candidates.
        # Worked example of the index construction used below:
        # r = np.repeat(range(3), 2)
        # >>> i = K.get_value(I)
        # array([[2, 1],
        #        [2, 1],
        #        [0, 1]], dtype=int32)
        #  >>> np.stack((r, i.flatten()), axis=1)
        # array([[0, 2],
        #        [0, 1],
        #        [1, 2],
        #        [1, 1],
        #        [2, 0],
        #        [2, 1]])
        # AI = K.variable(ai)
        # K.eval(tf.reshape(tf.gather_nd(T, AI), (3,2)))
        pre_nms_limit = min(self.n_pre_nms, n_anchors)
        # Indices of the top-N scores, taken separately for every batch item.
        top_k_idx = tf.nn.top_k(fg_scores, pre_nms_limit, sorted=True).indices

        # top_k_idx holds pre_nms_limit indices per batch item.
        # To use it with tf.gather_nd it is turned into pairs of
        #   [[index of the batch item, index of the score], ...]
        n_batch = self.batch_size
        # The author originally wanted K.shape(feature_map)[0] here, but that
        # is a dynamic shape and could not be passed to the tensorflow
        # slice/join functions, so the fixed value from config is used
        # (the MaskRCNN implementation is said to do the same).

        rn = K.flatten(
            K.repeat(K.reshape(K.arange(n_batch), [-1, 1]),
                     pre_nms_limit))
        pos = K.stack((rn, K.flatten(top_k_idx)), axis=1)
        # Pick the scores, offsets and anchors at the top-scoring indices.
        fg_scores = K.reshape(tf.gather_nd(fg_scores, pos),
                              [n_batch, pre_nms_limit])
        rpn_offsets = K.reshape(tf.gather_nd(rpn_offsets, pos),
                                [n_batch, pre_nms_limit, 4])
        # The kept anchors (pos) differ per batch item, so the anchors are
        # stacked once per batch item before gathering.
        anchors = K.reshape(K.tile(self.anchors, [n_batch, 1]),
                            [n_batch, n_anchors, 4])
        anchors = K.reshape(tf.gather_nd(anchors, pos),
                            [n_batch, pre_nms_limit, 4])

        # Get the boxes from anchors and offsets.
        # Anchors and offsets of all batch items are stacked to (N*R, 4) so
        # that bbox.get_bbox handles them in one call.
        stacked_anchors_per_batch = K.reshape(anchors,
                                              [n_batch * pre_nms_limit, 4])
        stacked_offsets_per_batch = K.reshape(rpn_offsets,
                                              [n_batch * pre_nms_limit, 4])
        bboxes = bbox.get_bbox(stacked_anchors_per_batch,
                               stacked_offsets_per_batch)

        # Clip boxes that stick out of the image to the image area.
        bboxes = K.clip(bboxes, [0, 0, 0, 0],
                        [self.input_h, self.input_w,
                         self.input_h, self.input_w])

        # Back to the original layout (N, R, 4).
        bboxes = K.reshape(
            bboxes, [K.shape(rpn_offsets)[0], K.shape(rpn_offsets)[1], 4])

        # Small boxes are kept so that small objects can be detected too.
        # According to the author, the chainercv implementation that follows
        # the Faster R-CNN paper removes small boxes, while the later
        # https://github.com/matterport/Mask_RCNN keeps them.

        # NMS reduces the boxes to at most self.n_post_nms.
        # NMS is run per batch item: tf.split cuts bboxes and fg_scores into
        # n_batch pieces of size 1 along axis 0, and _nms reshapes each piece.
        split_size = tf.tile([1], [n_batch])
        proposal_boxes = K.stack([self._nms(box, score, pre_nms_limit)
                                  for box, score
                                  in zip(tf.split(bboxes, split_size, axis=0),
                                         tf.split(
                                             fg_scores, split_size, axis=0)
                                         )])

        return proposal_boxes

    def _nms(self, bboxes, scores, dim1):
        """Run NMS for one batch item and pad the result to n_post_nms rows.

        The returned coordinates stay in the form produced by
        bbox.normalize_bbox (normalised to 0-1 according to the original
        documentation).

        Args:
            bboxes: boxes of one batch item; reshaped to [dim1, 4].
            scores: scores of the same boxes; reshaped to [dim1].
            dim1: number of boxes handed in.

        Returns:
            The selected boxes, zero-padded along the first axis when fewer
            than self.n_post_nms boxes were selected.
        """
        # tf.split leaves the leading axis of size 1, so reshape to (R, X).
        bboxes = tf.reshape(bboxes, [dim1, 4])
        scores = tf.reshape(scores, [dim1])
        # tensorflow's non-max-suppression is used, so the box coordinates
        # are normalised first.
        normalized_boxes = bbox.normalize_bbox(bboxes,
                                               self.input_h, self.input_w)
        indices = tf.image.non_max_suppression(
            normalized_boxes, scores, self.n_post_nms,
            iou_threshold=self.nms_thresh)
        # Keep the boxes selected by NMS.
        # The coordinates remain normalised.
        boxes = tf.gather(normalized_boxes, indices)
        # When fewer than self.n_post_nms boxes remain, pad with zeros so
        # that every batch item has self.n_post_nms rows and the results can
        # be stacked into one tensor.
        # According to the author, dataset.py#data_generator relies on this
        # size.
        padding = tf.maximum(self.n_post_nms - tf.shape(boxes)[0], 0)
        boxes = tf.pad(boxes, [(0, padding), (0, 0)])
        return boxes

    def compute_output_shape(self, input_shape):
        """Return the fixed output shape (batch_size, n_post_nms, 4)."""
        return (self.batch_size, self.n_post_nms, 4)
166 | 


--------------------------------------------------------------------------------
/xrcnn/loss.py:
--------------------------------------------------------------------------------
  1 | from keras import backend as K
  2 | import tensorflow as tf
  3 | import logging
  4 | 
  5 | from xrcnn.util import log
  6 | 
  7 | logger = logging.getLogger(__name__)
  8 | 
"""
    Loss function of the RPN
        J(p, u, v, t) = J_cls(p, u) + lambda * [u >= 1] * J_loc(v, t)
        (p, v and t are vectors)

        J_cls(p, u) = -log(p_u)

        J_loc(v, t) = sum over i in {x, y, w, h} of smoothL1(t_i - v_i)
            smoothL1(x) = {
                0.5 * x^2   if |x| < 1
                |x| - 0.5   otherwise
                }
"""
 21 | 
 22 | 
 23 | def sparse_categorical_crossentropy(gt_ids, pred_one_hot_post_softmax):
 24 |     """
 25 |     K.sparse_categorical_crossentropyだと結果がNaNになる。。。
 26 |     0割り算が発生しているかも。
 27 |     https://qiita.com/4Ui_iUrz1/items/35a8089ab0ebc98061c1
 28 |     対策として、微少値を用いてlog(0)にならないよう調整した本関数を作成。
 29 |     """
 30 |     gt_ids = log.tfprint(gt_ids, "cross:gt_ids:")
 31 |     pred_one_hot_post_softmax = log.tfprint(pred_one_hot_post_softmax,
 32 |                                             "cross:pred_one_hot_post_softmax:")
 33 | 
 34 |     gt_one_hot = K.one_hot(gt_ids, K.shape(pred_one_hot_post_softmax)[-1])
 35 |     gt_one_hot = log.tfprint(gt_one_hot, "cross:gt_one_hot:")
 36 | 
 37 |     epsilon = K.epsilon()  # 1e-07
 38 |     loss = -K.sum(
 39 |         gt_one_hot * K.log(
 40 |             tf.clip_by_value(pred_one_hot_post_softmax, epsilon, 1 - epsilon)),
 41 |         axis=-1)
 42 |     loss = log.tfprint(loss, "cross:loss:")
 43 |     return loss
 44 | 
 45 | 
 46 | def smooth_l1(gt, pred):
 47 |     # https://qiita.com/GushiSnow/items/8c946208de0d6a4e31e7
 48 |     diff = K.abs(gt - pred)
 49 |     less_than_one = K.cast(K.less(diff, 1.0), "float32")
 50 |     # difffが1より小さい場合、less_than_one==1
 51 |     loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
 52 |     return loss
 53 | 
 54 | 
 55 | def offsets_loss(gt_offsets, pred_offsets, dump=False):
 56 |     """オフセット回帰の損失関数
 57 |     positive（gt_fg > 0）データのみ評価対象とする
 58 | 
 59 |     Args:
 60 |         gt_offsets: 正解オフセット
 61 |             [R, 4]
 62 |             3軸目は領域提案とアンカーのオフセット（中心、幅、高さ）。
 63 |                 (tx, ty, th, tw)
 64 |         pred_offsets: 予測値
 65 |             [R, 4].
 66 | 
 67 |     Note:
 68 |         この関数の呼び出し元はrpn_offsets_lossとhead_offsets_loss。
 69 |         RPNでのRoI予測が外れると全てNegativeなBBoxとなり、結果的にhead_offsets_lossへ渡される正解データのラベルが全てNegativeとなる。
 70 |         その場合、head_offsets_lossで得られる損失は0となるが、rpn_offsets_lossで得られる損失は大きくなるはずなので、
 71 |         損失全体(rpn_offsets_loss + head_offsets_loss)で評価すれば適切な損失になるはず。
 72 |     """
 73 |     loss = K.switch(tf.size(gt_offsets) > 0,
 74 |                     smooth_l1(gt_offsets, pred_offsets), tf.constant(0.0))
 75 |     loss = K.mean(loss)
 76 |     return loss
 77 | 
 78 | 
 79 | def rpn_offsets_loss(gt_offsets, gt_fg, pred_offsets):
 80 |     """RPNのオフセット回帰の損失関数
 81 |     positive（gt_fg > 0）データのみ評価対象とする
 82 | 
 83 |     gt_offsets: 正解オフセット
 84 |         [N, R, 4]
 85 |         3軸目は領域提案とアンカーのオフセット（中心、幅、高さ）。
 86 |             (tx, ty, th, tw)
 87 |     gt_fg: 正解データの前景／背景
 88 |         [N, R]
 89 |     pred_offsets: 予測値
 90 |         [N, R, 4].
 91 |     """
 92 |     pos_idx = tf.where(gt_fg > 0)
 93 |     gt_offsets = tf.gather_nd(gt_offsets, pos_idx)
 94 |     pred_offsets = tf.gather_nd(pred_offsets, pos_idx)
 95 |     # FasterRCNNの論文上は、RPNのオフセット回帰には係数10を乗ずることでオブジェクト分類損失とのバランスを取ることになっている。
 96 |     # が、rpnの損失の全損失に占める割合が高すぎるようなら係数調整
 97 |     p = 1.
 98 |     loss = p * offsets_loss(gt_offsets, pred_offsets)
 99 |     loss = log.tfprint(loss, "rpn_offsets_loss")
100 |     return loss
101 | 
102 | 
def head_offsets_loss(gt_offsets, gt_labels, pred_offsets):
    """Loss of the offset regression of the head.

    Only positive entries (gt_labels > 0) are evaluated.

    Args:
        gt_offsets: ground-truth offsets, [N, R, 4].
        gt_labels: ground-truth label ids, [N, R].
        pred_offsets: predicted offsets per label, [N, R, n_labels, 4].
    """
    # Only the offsets predicted for the ground-truth label are evaluated.
    # The paper suggests that the bbox of the true class is sufficient:
    # The second task loss, Lloc, is defined over a tuple of true bounding-box
    # regression targets for class u, v = (vx, vy, vw, vh), and a predicted
    # tuple tu = (tux , tuy , tuw, tuh ), again for class u.
    positive = tf.where(gt_labels > 0)
    batch_idx = K.cast(positive[:, 0], tf.int32)
    roi_idx = K.cast(positive[:, 1], tf.int32)
    label_idx = K.cast(tf.gather_nd(gt_labels, positive), tf.int32)
    # (batch, roi, label) triples select one 4-vector per positive RoI.
    pred_index = K.stack((batch_idx, roi_idx, label_idx), axis=1)
    pos_pred_offsets = tf.gather_nd(pred_offsets, pred_index)
    pos_gt_offsets = tf.gather_nd(gt_offsets, positive)

    result = offsets_loss(pos_gt_offsets, pos_pred_offsets)
    return log.tfprint(result, "head_offsets_loss")
131 | 
132 | 
def labels_loss(gt, pred):
    """Loss of the label classification: mean cross entropy.

    Args:
        gt: ground truth, [N, R]; the second axis holds label ids.
        pred: predictions (after softmax), [N, R, labels].

    Returns:
        The mean cross entropy; a constant 0.0 is substituted for the loss
        when gt has no elements.
    """
    # Cross entropy error.
    # The mean over everything is used instead of a per-batch value.
    # The paper states:
    #    In our current implementation (as in the released code),
    #    the cls term in Eqn.(1) is normalized by the mini-batch size
    #    (i.e., Ncls = 256) and the reg term is normalized by the number of
    #    anchor locations (i.e., Nreg ∼ 2, 400).
    gt = K.cast(gt, 'int32')
    has_samples = tf.size(gt) > 0
    per_sample = K.switch(has_samples,
                          sparse_categorical_crossentropy(gt, pred),
                          K.constant(0.0))
    return K.mean(per_sample)
155 | 
156 | 
def rpn_objects_loss(gt, pred):
    """Loss of the object / non-object classification of the RPN.

    Args:
        gt: ground truth, [N, anchors], with the values
            positive=1, negative=0, neutral(exclude from eval)=-1
        pred: predictions (after softmax), [batch, anchors, 2].
            The last axis tells object from non-object.
    """
    # Drop the entries marked -1: they are excluded from the evaluation.
    evaluated = tf.where(gt > -1)
    kept_gt = tf.gather_nd(gt, evaluated)
    kept_pred = tf.gather_nd(pred, evaluated)

    # Cross entropy error, averaged over all kept entries (the averaging is
    # done by labels_loss).
    result = labels_loss(kept_gt, kept_pred)
    return log.tfprint(result, "rpn_objects_loss")
185 | 
186 | 
def head_labels_loss(gt, pred):
    """Loss of the label classification of the head.

    Args:
        gt: ground truth, [N, R]; the second axis holds label ids.
        pred: predictions (after softmax), [N, R, labels].
    """
    traced_gt = log.tfprint(gt, "head_labels_loss_val:gt", summarize=1024)
    traced_pred = log.tfprint(pred, "head_labels_loss_val:pred",
                              summarize=1024)
    result = labels_loss(traced_gt, traced_pred)
    return log.tfprint(result, "head_labels_loss")
201 | 
202 | 
def head_mask_loss(gt_masks, gt_labels, pred_masks):
    """Loss of the mask prediction: mean binary cross entropy.

    Args:
        gt_masks: ground-truth binary masks, [N, R, h, w].
            The mask data cropped to the bbox area and resized to
            config.mask_out_shape.
        gt_labels: ground-truth label ids, [N, R].
        pred_masks: predicted binary masks, [N, R, n_labels, h, w].

    h, w correspond to config.mask_out_shape.
    """
    # Only RoIs with a positive label are evaluated.
    positive = tf.where(gt_labels > 0)
    batch_idx = K.cast(positive[:, 0], tf.int32)
    roi_idx = K.cast(positive[:, 1], tf.int32)
    label_idx = K.cast(tf.gather_nd(gt_labels, positive), tf.int32)
    # (batch, roi, label) triples select the mask predicted for the
    # ground-truth label of every positive RoI.
    pred_index = K.stack((batch_idx, roi_idx, label_idx), axis=1)
    pos_pred_masks = tf.gather_nd(pred_masks, pred_index)
    pos_gt_masks = tf.gather_nd(gt_masks, positive)

    has_samples = tf.size(pos_gt_masks) > 0
    per_pixel = K.switch(has_samples,
                         K.binary_crossentropy(pos_gt_masks, pos_pred_masks),
                         tf.constant(0.0))
    return log.tfprint(K.mean(per_pixel), "head_mask_loss")
236 | 


--------------------------------------------------------------------------------
/predict_mrcnn.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import glob
  3 | import logging
  4 | import os
  5 | import re
  6 | import tensorflow as tf
  7 | from keras import backend as K
  8 | import numpy as np
  9 | import scipy.misc
 10 | from xrcnn.config import Config
 11 | from xrcnn.mrcnn import MaskRCNN
 12 | from xrcnn.util.anchor import Anchor
 13 | from xrcnn.util import bbox
 14 | from xrcnn.util import image
 15 | import cv2
 16 | import matplotlib.pyplot as plt
 17 | 
 18 | FORMAT = '%(asctime)-15s %(levelname)s #[%(thread)d] %(message)s'
 19 | logging.basicConfig(format=FORMAT, level=logging.INFO)
 20 | 
 21 | logger = logging.getLogger(__name__)
 22 | logger.info("---start---")
 23 | 
 24 | 
 25 | def add_rect(dest_img, box, color, thickness):
 26 |     cv2.rectangle(dest_img, (box[1], box[0]),
 27 |                   (box[3], box[2]),
 28 |                   color, thickness=thickness)
 29 | 
 30 | 
def add_mask(dest_img, mask, bbox, color, image_shape):
    """Blend a predicted mask, in the given colour, into dest_img in place.

    The mask is resized to the size of its bbox, binarised at 0.5, cropped
    to the part that lies inside the image and added onto dest_img.

    Args:
        dest_img: image that receives the overlay; it is also passed as the
            destination argument of cv2.addWeighted.
        mask: mask predicted for the box; resized to the bbox size here.
        bbox: (y1, x1, y2, x2) of the box in image coordinates.
        color: colour the mask is painted with.
        image_shape: [h, w] of the input image.  It is concatenated with
            [3] below -- presumably a list; TODO confirm.
    """
    threshold = 0.5
    y1, x1, y2, x2 = bbox
    h, w = y2 - y1, x2 - x1
    logger.debug("y1, x1, y2, x2: %s, h, w: %s", (y1, x1, y2, x2), (h, w))
    logger.debug("mask.shape: %s", mask.shape)
    mask = scipy.misc.imresize(mask, (h, w),
                               interp='bilinear').astype(np.float32)
    # The result of scipy.misc.imresize is scaled to 0-255, so bring it back
    # to 0-1.
    mask /= 255.0
    # Convert to 0 or 1.
    mask = np.where(mask >= threshold, 1, 0).astype(np.uint8)

    # Drop the part of the mask that lies outside 0..image_shape.
    # (_y1, _x1, _y2, _x2) is the box clipped to the image; the d_* values
    # are the differences between the clipped and the original edges and are
    # used to crop the mask accordingly.
    _y1, _x1, _y2, _x2 = max(0, y1), max(0, x1), min(image_shape[0], y2), \
        min(image_shape[1], x2)
    d_y1, d_x1, d_y2, d_x2 = _y1 - y1, _x1 - x1, _y2 - y2, _x2 - x2
    mask = mask[d_y1:h + d_y2, d_x1:w + d_x2]

    # Place the mask in the image. image_shape is [h, w] of the input image.
    fullsize_mask = np.zeros(image_shape, dtype=np.uint8)
    fullsize_mask[_y1:_y2, _x1:_x2] = mask

    logger.debug("mask.shape: %s, image_shape: %s, bbox: %s (%s) ",
                 mask.shape, image_shape, bbox, (y2 - y1, x2 - x1))
    logger.debug("d_y1, d_x1, d_y2, d_x2: %s, mask.shape: %s ",
                 (d_y1, d_x1, d_y2, d_x2), mask.shape)

    # Build an image filled with the colour and restrict it to the mask
    # area via the mask argument of cv2.bitwise_and.
    mask_image = np.zeros(image_shape + [3], dtype=np.uint8)
    mask_image[:, :] = color
    mask_image = cv2.bitwise_and(mask_image, mask_image, mask=fullsize_mask)
    # mask = np.dstack([mask, mask, mask])
    # mask[:, :, 0][mask[:, :, 0] == 1] = color[0]
    # mask[:, :, 1][mask[:, :, 1] == 1] = color[1]
    # mask[:, :, 2][mask[:, :, 2] == 1] = color[2]
    # Add the coloured mask (weight 1.5) onto dest_img (weight 1), writing
    # the result back into dest_img.
    cv2.addWeighted(mask_image, 1.5, dest_img, 1, 0, dest_img)
 68 | 
 69 | 
 70 | config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
 71 | session = tf.Session(config=config)
 72 | K.set_session(session)
 73 | # set_debugger_session()
 74 | 
 75 | config = Config()
 76 | anchor = Anchor(config)
 77 | config.training = False
 78 | config.batch_size = 1
 79 | # 学習時に利用したデータセットに含まれるラベル数を指定する。
 80 | config.n_dataset_labels = 1 + 1  # 背景 + people
 81 | logger.warn("指定されたラベル数: %s. 学習時のラベル数と異なる場合、エラーになります。",
 82 |             config.n_dataset_labels)
 83 | 
 84 | # dump tensor
 85 | # log.out_name_pattern = ".+debug$"
 86 | 
 87 | argparser = argparse.ArgumentParser(description="FasterRCNNで物体検出")
 88 | argparser.add_argument('--input_path', type=str,
 89 |                        required=True, help="処理対象の画像ファイルパス")
 90 | argparser.add_argument('--weights_path', type=str,
 91 |                        required=True, help="モデルの重みファイルのパス")
 92 | argparser.add_argument('--rpn', type=bool,
 93 |                        required=False, help="RPNの予測結果を表示")
 94 | args = argparser.parse_args()
 95 | 
 96 | mrcnn = MaskRCNN(anchor.anchors, config)
 97 | 
 98 | 
 99 | def pred(input_path):
100 |     logger.info("input_path: %s", input_path)
101 |     # 画像をnumpy配列として読み込む
102 |     img = image.load_image_as_ndarray(input_path)
103 |     img = img.astype(np.uint8)
104 |     logger.debug("img.shape: %s", img.shape)
105 |     # 学習時と同様にリサイズ
106 |     img, _, _ = image.resize_with_padding(
107 |         img,
108 |         config.image_min_size,
109 |         config.image_max_size)
110 | 
111 |     # バッチサイズの次元を追加
112 |     input_img = np.array([img])
113 |     logger.debug("input_img.shape.resized: %s", input_img.shape)
114 |     # logger.info("window: %s", window)
115 |     # logger.info("scale: %s", scale)
116 | 
117 |     # 表示用画像はopencvに合わせてRGBからBGRへ変換
118 |     img_org = img.copy()
119 |     img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
120 | 
121 |     bboxes, labels, scores, masks, rois, rpn_offsets, rpn_objects = \
122 |         model.predict([input_img], verbose=1,
123 |                       batch_size=config.batch_size)
124 | 
125 |     # バッチサイズを示す1次元目を削除
126 |     bboxes, labels, scores, masks, rois, rpn_offsets, rpn_objects = \
127 |         np.squeeze(bboxes, axis=0), \
128 |         np.squeeze(labels, axis=0), \
129 |         np.squeeze(scores, axis=0), \
130 |         np.squeeze(masks, axis=0), \
131 |         np.squeeze(rois, axis=0), \
132 |         np.squeeze(rpn_offsets, axis=0), \
133 |         np.squeeze(rpn_objects, axis=0)
134 | 
135 |     save_path_suffix = re.split('/|\.', input_path)
136 |     save_path_suffix = save_path_suffix[-2] + '.png'
137 |     if args.rpn:
138 |         # 前景のみ
139 |         rpn_obj_pos = rpn_objects[:, 0]
140 |         rpn_anchor = anchor.anchors
141 |         # スコア降順
142 |         idx_pos = rpn_obj_pos.argsort()[::-1]
143 |         rpn_obj_pos = rpn_obj_pos[idx_pos]
144 |         rpn_offsets = rpn_offsets[idx_pos]
145 |         rpn_anchor = rpn_anchor[idx_pos]
146 |         # 上位50件で
147 |         top = 10
148 |         rpn_obj_pos = rpn_obj_pos[:top]
149 |         rpn_offsets = rpn_offsets[:top]
150 |         rpn_anchor = rpn_anchor[:top]
151 | 
152 |         rpn_offsets *= np.array(config.bbox_refinement_std)
153 | 
154 |         print(rpn_obj_pos, rpn_offsets, rpn_anchor)
155 |         boxes = bbox.get_bbox(rpn_anchor,
156 |                               rpn_offsets)
157 |         boxes = boxes.clip(0,
158 |                            config.image_max_size).astype('int32')
159 |         for i, box in enumerate(boxes):
160 |             box = box.astype('int32')
161 |             add_rect(img, box, (0, 0, 255), 1)
162 | 
163 |         save_path = './out/pred_rpn_' + save_path_suffix
164 |     else:
165 |         # # ラベルに対応するマクスを残す
166 |         # masks = masks[np.arange(masks.shape[0]), ]
167 | 
168 |         # 背景,paddingは除く
169 |         idx_labels = np.where(labels > 0)
170 |         bboxes = bboxes[idx_labels]
171 |         labels = labels[idx_labels]
172 |         scores = scores[idx_labels]
173 |         masks = masks[idx_labels]
174 |         rois = rois[idx_labels]
175 | 
176 |         h, w = config.image_shape[0], config.image_shape[1]
177 |         rois *= [h, w, h, w]
178 |         logger.debug("rois.shape: %s", rois.shape)
179 |         logger.debug("rois: %s", rois)
180 |         logger.debug("bboxes.shape: %s", bboxes.shape)
181 |         logger.debug("bboxes: %s", bboxes)
182 |         logger.debug("labels.shape: %s", labels.shape)
183 |         # logger.debug("labels: %s", labels)
184 |         logger.debug("scores.shape: %s", scores.shape)
185 |         logger.debug("scores: %s", scores)
186 |         logger.debug("masks.shape: %s", masks.shape)
187 |         # logger.debug("masks: %s", masks)
188 | 
189 |         blue = [i for i in range(255)[::(255 // (bboxes.shape[0] + 1))]]
190 |         green = blue[::-1]
191 | 
192 |         # Proposal表示
193 |         for roi in rois:
194 |             add_rect(img, roi, (0, 0, 255), 1)
195 | 
196 |         # bbox表示
197 |         for box, mask, b, g in zip(bboxes, masks, blue, green):
198 | 
199 |             # bbox, mask表示
200 |             add_rect(img, box, (b, g, 0), 2)
201 |             add_mask(img, mask, box, (b, g, 0), config.image_shape[:2])
202 | 
203 |         save_path = './out/pred_' + save_path_suffix
204 | 
205 |     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
206 | 
207 |     def show_image(_img, _label, _num):
208 |         plt.subplot(1, 2, _num)
209 |         plt.imshow(_img)
210 |         # plt.axis('off')
211 |         plt.gca().get_xaxis().set_ticks_position('none')
212 |         plt.gca().get_yaxis().set_ticks_position('none')
213 |         plt.tick_params(labelbottom='off')
214 |         plt.tick_params(labelleft='off')
215 |         plt.xlabel(_label)
216 | 
217 |     plt.figure(figsize=(6, 3))
218 |     show_image(img_org, 'Input', 1)
219 |     show_image(img, 'Output', 2)
220 |     plt.savefig(save_path)
221 | 
222 | 
# Build the model, load its weights and run prediction for every input,
# all under the CPU device scope.
with tf.device('/cpu:0'):
    model = mrcnn.compiled_model()
    logger.debug("compile model.")

    # with tf.device('/gpu:1'):
    model.load_weights(args.weights_path, by_name=True)
    logger.debug("load_weights.")

    # A directory means "every *.jpg inside it"; anything else is treated as
    # a comma-separated list of file paths.
    paths = (glob.glob(os.path.join(args.input_path, '*.jpg'))
             if os.path.isdir(args.input_path)
             else args.input_path.split(','))
    for path in paths:
        pred(path)
237 | 


--------------------------------------------------------------------------------
/xrcnn/util/voc_dataset.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import logging
  3 | import random
  4 | import xml.etree.ElementTree as ET
  5 | import cv2
  6 | import numpy as np
  7 | 
  8 | import xrcnn.util.image as image
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
# Sub-directory names of a PASCAL VOC dataset, joined onto the dataset root.
_dir_voc_imageset = 'ImageSets'
_dir_voc_annotation = 'Annotations'
_dir_voc_image = 'JPEGImages'

# The labels provided by the VOC dataset plus a label for the background
# (non-object). It is placed at the head of the list so that background = 0.
_voc_labels = ['__background__', 'aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair',
               'cow', 'diningtable', 'dog', 'horse', 'motorbike',
               'person', 'pottedplant', 'sheep', 'sofa', 'train',
               'tvmonitor']
 23 | 
 24 | 
 25 | def get_pascal_voc_labels():
 26 |     return _voc_labels
 27 | 
 28 | 
 29 | def pascal_voc_data_generator(path, anchor, config, train_val='train',
 30 |                               n_max=None, prefix=None):
 31 |     """
 32 |     VOCイメージセットをkeras.model.fit_generatorで使用するgeneratorの形式で取得する。
 33 |         Args:
 34 |         Returns:
 35 | 
 36 |     """
 37 |     if prefix is not None:
 38 |         logger.info("load specific meta: %s", prefix)
 39 |         image_meta = load_image_meta(path, prefix)
 40 |         image_metas = [image_meta]
 41 |     else:
 42 |         if train_val == 'train':
 43 |             logger.info("load voc train data.")
 44 |             image_metas, _ = load_pascal_voc_traindata(path, n_max)
 45 |         else:
 46 |             logger.info("load voc validation data.")
 47 |             image_metas, _ = load_pascal_voc_validationdata(path, n_max)
 48 | 
 49 |     random.shuffle(image_metas)
 50 | 
 51 |     # 複数GPU利用の場合は入力データが各GPUに均等に配分される。
 52 |     # モデル内でconfig.batch_size＝バッチサイズとして実装しているところがあるので、
 53 |     # GPU数毎に入力データがconfig.batch_sizeとなるよう掛けておく。
 54 |     batch_size = config.batch_size * config.gpu_count
 55 |     batch_count = 0
 56 |     head_trainable = config.training_mode in ['head_only', 'all']
 57 |     while True:
 58 |         for meta in image_metas:
 59 |             if batch_count == 0:
 60 |                 images = []
 61 |                 rpn_offsets = []
 62 |                 rpn_fbs = []
 63 |                 bboxes = []
 64 |                 labels = []
 65 | 
 66 |             # 画像を規定のサイズにリサイズ。
 67 |             img = image.load_image_as_ndarray(meta['image_path'])
 68 |             img, window, scale = image.resize_with_padding(
 69 |                 img,
 70 |                 config.image_min_size,
 71 |                 config.image_max_size)
 72 |             logger.debug("window, scale: %s, %s", window, scale)
 73 |             # ランダムにflip
 74 |             img, flip_x, flip_y = image.random_flip(img)
 75 |             images.append(img)
 76 | 
 77 |             # 画像毎のオブジェクト数は固定にする。複数画像を1つのテンソルにおさめるため。
 78 |             bb = np.zeros([config.n_max_gt_objects_per_image, 4])
 79 |             bb_raw = []
 80 |             lb = np.zeros([config.n_max_gt_objects_per_image])
 81 |             # bboxもリサイズ＆flip
 82 |             for i, obj in enumerate(meta['objects']):
 83 |                 b = image.flip_bbox(
 84 |                     image.resize_bbox(obj['bbox'], window[:2], scale),
 85 |                     img.shape[:2], flip_x, flip_y)
 86 |                 bb_raw.append(b)
 87 |                 lb[i] = obj['label_id']
 88 |             logger.debug("bb_raw: %s", bb_raw)
 89 |             # RPN向けのGTをまとめる
 90 |             of, fb = anchor.generate_gt_offsets(np.array(bb_raw),
 91 |                                                 config.image_shape[:2])
 92 |             logger.debug("shapes: offset: %s, fb: %s", of.shape, fb.shape)
 93 | 
 94 |             # 有効なラベルが1つもないデータは無効なので返却しない
 95 |             if not np.any(lb > 0):
 96 |                 continue
 97 | 
 98 |             rpn_offsets.append(of)
 99 |             rpn_fbs.append(fb)
100 | 
101 |             bb[:len(bb_raw), :] = bb_raw
102 |             bboxes.append(bb)
103 |             labels.append(lb)
104 | 
105 |             logger.info("loaded image: %s", meta['image_path'])
106 | 
107 |             if np.any(np.argwhere(np.isnan(of))):
108 |                 raise ValueError("nanを含むオフセットを検出！")
109 | 
110 |             batch_count += 1
111 |             if batch_count >= batch_size:
112 |                 batch_count = 0
113 |                 inputs = [np.array(images), np.array(rpn_offsets),
114 |                           np.array(rpn_fbs)]
115 |                 if head_trainable:
116 |                     inputs += [np.array(bboxes), np.array(labels)]
117 |                 yield inputs, []
118 | 
119 | 
def load_image_meta(path, prefix):
    """Parse one VOC annotation file into a metadata dictionary.

    Args:
        path: Root directory of the VOC dataset.
        prefix: Annotation file name without the ``.xml`` extension.

    Returns:
        dict or None: ``None`` when the annotation has no ``object`` element.
        Otherwise a dict with ``image_path``, ``size`` ``(width, height)``,
        ``segmented`` and ``objects``; each object holds ``label``,
        ``label_id`` (index into ``_voc_labels``; 1-origin, 0 is background),
        ``truncated``, ``difficult`` and ``bbox`` as
        ``np.array([ymin, xmin, ymax, xmax])``.

    Raises:
        Exception: any error raised while reading or parsing the file is
            logged and re-raised unchanged.
    """
    annotation_path = os.path.join(
        path, _dir_voc_annotation, prefix + '.xml')

    def _flag(node, tag):
        # Read a 0/1 child element; a missing or empty one counts as False.
        text = node.findtext(tag)
        return bool(int(text)) if text and text.strip() else False

    try:
        # Parse the annotation file.
        root = ET.parse(annotation_path).getroot()
        objects = root.findall('object')

        if not objects:
            logger.warning("%s has no object.", annotation_path)
            return None

        image_path = os.path.join(
            path, _dir_voc_image, root.find('filename').text)
        size = root.find('size')
        width = int(size.find('width').text)
        height = int(size.find('height').text)
        data = {'image_path': image_path,
                'size': (width, height),
                'segmented': _flag(root, 'segmented'),
                'objects': []}

        for obj in objects:
            name = obj.find('name').text
            try:
                # _voc_labels already holds the background entry at index 0,
                # so the list index IS the 1-origin label id. Adding 1 shifted
                # every id by one and no longer matched
                # get_pascal_voc_labels(). Searching from index 1 keeps the
                # background entry itself from matching.
                name_id = _voc_labels.index(name, 1)
            except ValueError as e:
                # Unexpected label: skip this object.
                logger.warning(e)
                continue
            bndbox = obj.find('bndbox')
            xmin = int(round(float(bndbox.find('xmin').text)))
            ymin = int(round(float(bndbox.find('ymin').text)))
            xmax = int(round(float(bndbox.find('xmax').text)))
            ymax = int(round(float(bndbox.find('ymax').text)))

            data['objects'].append({
                'label': name,
                'label_id': name_id,
                'truncated': _flag(obj, 'truncated'),
                'difficult': _flag(obj, 'difficult'),
                'bbox': np.array([ymin, xmin, ymax, xmax])
            })

        logger.debug("load_image_meta: %s", data)
        return data

    except Exception as e:
        logger.warning(e)
        raise
176 | 
177 | 
def _load_pascal_voc(path, imagelist, n_max=None):
    """Load PASCAL VOC annotations listed in an image-set file.

    http://host.robots.ox.ac.uk/pascal/VOC/

    Args:
        path: Parent directory of the VOC dataset.
        imagelist: File name below ``ImageSets/Main`` listing one annotation
            prefix per line (e.g. ``'train.txt'``).
        n_max: Maximum number of lines to read (None = all).

    Returns:
        tuple: ``(images, labels)``.
        ``images`` is a list of dictionaries as returned by
        ``load_image_meta``:
            |- image_path
            |- size : (width, height)
            |- segmented
            |- objects
                |- label
                |- label_id (1-origin; 0 indicates the background)
                |- truncated
                |- difficult
                |- bbox : [ymin, xmin, ymax, xmax]
        ``labels`` is the set of label names found in those images.
    """
    images = []
    labels = set()
    target_data_list = os.path.join(path, _dir_voc_imageset, 'Main', imagelist)
    with open(target_data_list, 'r') as f:
        for i, line in enumerate(f):
            if n_max is not None and i >= n_max:
                break
            prefix = line.strip()
            if not prefix:
                # Blank line (e.g. at the end of the file).
                continue
            meta = load_image_meta(path, prefix)
            if meta is None:
                # The annotation has no objects; nothing to collect.
                continue
            images.append(meta)
            # Add whole label names. Iterating over obj['label'] walked the
            # characters of the string and filled the set with letters.
            labels.update(obj['label'] for obj in meta['objects'])

    return images, labels
214 | 
215 | 
def load_pascal_voc_traindata(path, n_max=None):
    """Load the annotations listed in ``ImageSets/Main/train.txt``."""
    train_list = 'train.txt'
    return _load_pascal_voc(path, train_list, n_max)
218 | 
219 | 
def load_pascal_voc_validationdata(path, n_max=None):
    """Load the annotations listed in ``ImageSets/Main/val.txt``."""
    validation_list = 'val.txt'
    return _load_pascal_voc(path, validation_list, n_max)
222 | 
223 | 
def show_voc_image(image):
    """Display an image with its bounding boxes and wait for a key press.

    Args:
        image: Metadata dictionary with ``image_path`` and ``objects``; each
            object's ``bbox`` is read as ``[ymin, xmin, ymax, xmax]``.
    """
    canvas = cv2.imread(image['image_path'])
    red = (0, 0, 255)
    for entry in image['objects']:
        box = entry['bbox']
        # OpenCV takes points as (x, y).
        top_left = (box[1], box[0])
        bottom_right = (box[3], box[2])
        cv2.rectangle(canvas, top_left, bottom_right, red)
    cv2.imshow('img', canvas)
    cv2.waitKey(0)
231 | 
232 | 
def show(path, prefix):
    """Load the annotation ``prefix`` under ``path`` and display it."""
    show_voc_image(load_image_meta(path, prefix))
236 | 
237 | # def resize(image, min_size, max_size):
238 | #     size = image['size']
239 | #     w_org = size[0]
240 | #     w, h = _get_resiezed_imagesize(size[0], size[1], min_size, max_size)
241 | #     image['size'] = (w, h)
242 | #
243 | #     scale = float(w) / float(w_org)
244 | #     for obj in image['objects']:
245 | #         bbox = obj['bbox']
246 | #         bbox[0][0] = int(bbox[0][0] * scale)
247 | #         bbox[0][1] = int(bbox[0][1] * scale)
248 | #         bbox[1][0] = int(bbox[1][0] * scale)
249 | #         bbox[1][1] = int(bbox[1][1] * scale)
250 | 


--------------------------------------------------------------------------------
/xrcnn/util/bbox.py:
--------------------------------------------------------------------------------
  1 | from keras import backend as K
  2 | import numpy as np
  3 | import tensorflow as tf
  4 | 
  5 | 
  6 | def normalize_bbox(input_bboxes, input_h, input_w):
  7 |     """入力データのbboxを0~1に正規化する。
  8 |     入力画像の縦横で割る。
  9 |     NNから得られる予測値として利用する。
 10 |     従って損失評価時にはGTをこの関数を通した値を利用する。
 11 |     """
 12 |     return input_bboxes / K.variable([input_h, input_w, input_h, input_w])
 13 | 
 14 | 
 15 | def get_bbox(src_bbox, offset):
 16 |     """src_bboxにoffsetを適用し、元の領域を復元する。
 17 |     RPNから得たoffset予測値をアンカーボックスに適用して提案領域を得る。といったケースで利用する。
 18 | 
 19 |     Args:
 20 |         src_bbox (tensor / ndarray): オフセットを適用するBoudingBox。
 21 |             Its shape is :math:`(R, 4)`.
 22 |             2軸目に以下の順でBBoxの座標を保持する。
 23 |             :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
 24 |         offset (tensor / ndarray): オフセット。
 25 |             形状はsrc_bboxに同じ。
 26 |             2軸目にオフセットの形状を保持数r。 :math:`t_y, t_x, t_h, t_w`.
 27 |                 tx =(x−xa)/wa, ty =(y−ya)/ha, tw = log(w/wa), th = log(h/ha)
 28 |                 ※それぞれ、アンカーからのオフセット
 29 |                 ※「x」は予測された領域の中心x、「xa」はアンカーの中心x。
 30 | 
 31 |     Returns:
 32 |         tensor:
 33 |         オフセットを適用したBoudingBox。
 34 |         形状はsrc_bboxに同じ。
 35 |         1軸目はsrc_bboxと同じ情報を示す。
 36 |         2軸目にはオフセットを適用した座標を保持する。
 37 |         :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin},
 38 |         \\hat{g}_{ymax}, \\hat{g}_{xmax}`.
 39 | 
 40 |     """
 41 |     if type(src_bbox) == np.ndarray and type(offset) == np.ndarray:
 42 |         xp = np
 43 |     else:
 44 |         xp = K
 45 | 
 46 |     if src_bbox.shape[0] == 0:
 47 |         return xp.zeros((0, 4), dtype=offset[:, 0].dtype)
 48 | 
 49 |     # src_bbox（anchorなど）の左上と右下の座標から、中心座標＋高さ＋幅の形式に変換する
 50 |     src_height = src_bbox[:, 2] - src_bbox[:, 0]
 51 |     src_width = src_bbox[:, 3] - src_bbox[:, 1]
 52 |     src_ctr_y = src_bbox[:, 0] + 0.5 * src_height
 53 |     src_ctr_x = src_bbox[:, 1] + 0.5 * src_width
 54 | 
 55 |     # オフセットを中心座標、高さ、幅毎にまとめる
 56 |     dy = offset[:, 0]
 57 |     dx = offset[:, 1]
 58 |     dh = offset[:, 2]
 59 |     dw = offset[:, 3]
 60 | 
 61 |     # 論文にあるオフセット算出式（以下）から逆算
 62 |     # tx =(x−xa)/wa, ty =(y−ya)/ha, tw = log(w/wa), th = log(h/ha)
 63 |     # ※それぞれ、アンカーからのオフセット
 64 |     # ※「x」は予測された領域の中心x、「xa」はアンカーの中心x。
 65 |     ctr_y = dy * src_height + src_ctr_y
 66 |     ctr_x = dx * src_width + src_ctr_x
 67 |     h = xp.exp(dh) * src_height
 68 |     w = xp.exp(dw) * src_width
 69 | 
 70 |     # 矩形の左上と右下の座標に変換
 71 |     ymin = ctr_y - 0.5 * h
 72 |     xmin = ctr_x - 0.5 * w
 73 |     ymax = ctr_y + 0.5 * h
 74 |     xmax = ctr_x + 0.5 * w
 75 |     bbox = xp.transpose(xp.stack((ymin, xmin, ymax, xmax), axis=0))
 76 |     return bbox
 77 | 
 78 | 
 79 | def restore_bbox(normalized_rois, normalized_offsets, input_h, input_w):
 80 |     """
 81 |     正規化されたRoIとオフセットから、入力画像にスケールアップしたBBOXを取得する。
 82 |     Args:
 83 |         normalized_rois (ndarray):
 84 |         normalized_offsets (ndarray):
 85 |     """
 86 |     is_numpy = type(normalized_rois) == np.ndarray \
 87 |         and type(normalized_offsets) == np.ndarray
 88 |     if is_numpy:
 89 |         xp = np
 90 |         box = np.array([input_h, input_w, input_h, input_w])
 91 |     else:
 92 |         xp = K
 93 |         box = K.variable([input_h, input_w, input_h, input_w])
 94 | 
 95 |     normalized_bboxes = get_bbox(normalized_rois, normalized_offsets)
 96 |     bboxes = normalized_bboxes * box
 97 |     bboxes = xp.round(bboxes)
 98 |     if is_numpy:
 99 |         bboxes = bboxes.astype(xp.int32)
100 |     else:
101 |         bboxes = K.cast(bboxes, tf.int32)
102 | 
103 |     return bboxes
104 | 
105 | 
def get_offset(src_bbox, dst_bbox):
    """Compute the offsets that turn ``src_bbox`` into ``dst_bbox``.

    ``get_bbox(src_bbox, offset)`` yields ``dst_bbox``.

    Args:
        src_bbox (ndarray): Reference boxes, shape :math:`(R, 4)`.
            The second axis holds
            :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
        dst_bbox (ndarray): Target boxes, same layout and shape.

    Returns:
        ndarray: Offsets of shape :math:`(R, 4)`; the second axis holds
        :math:`t_y, t_x, t_h, t_w`.
    """
    tiny = 1e-07

    def _to_center_form(boxes):
        # (ymin, xmin, ymax, xmax) -> (height, width, centre y, centre x)
        box_h = boxes[:, 2] - boxes[:, 0]
        box_w = boxes[:, 3] - boxes[:, 1]
        return (box_h, box_w,
                boxes[:, 0] + 0.5 * box_h,
                boxes[:, 1] + 0.5 * box_w)

    src_h, src_w, src_cy, src_cx = _to_center_form(src_bbox)
    dst_h, dst_w, dst_cy, dst_cx = _to_center_form(dst_bbox)

    # Guard the divisions below against zero-sized source boxes. The centres
    # above were computed from the unclamped sizes.
    src_h = np.maximum(src_h, tiny)
    src_w = np.maximum(src_w, tiny)

    # Offset formulas:
    # tx =(x−xa)/wa, ty =(y−ya)/ha, tw = log(w/wa), th = log(h/ha)
    components = ((dst_cy - src_cy) / src_h,
                  (dst_cx - src_cx) / src_w,
                  np.log(dst_h / src_h),
                  np.log(dst_w / src_w))

    # Stack into shape (R, 4).
    return np.transpose(np.stack(components, axis=0))
156 | 
157 | 
def get_offset_K(src_bbox, dst_bbox):
    """Keras-backend version of ``get_offset``.

    Computes the offsets that turn ``src_bbox`` into ``dst_bbox``, so that
    ``get_bbox(src_bbox, offset)`` yields ``dst_bbox``.

    Args:
        src_bbox (tensor): Reference boxes, shape :math:`(R, 4)`.
            The second axis holds
            :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
        dst_bbox (tensor): Target boxes, same layout and shape.

    Returns:
        tensor: Offsets of shape :math:`(R, 4)`; the second axis holds
        :math:`t_y, t_x, t_h, t_w`.
    """
    tiny = K.epsilon()

    # Source boxes: corner form -> size and centre.
    src_h = src_bbox[:, 2] - src_bbox[:, 0]
    src_w = src_bbox[:, 3] - src_bbox[:, 1]
    src_cy = src_bbox[:, 0] + 0.5 * src_h
    src_cx = src_bbox[:, 1] + 0.5 * src_w

    # Target boxes: same conversion.
    dst_h = dst_bbox[:, 2] - dst_bbox[:, 0]
    dst_w = dst_bbox[:, 3] - dst_bbox[:, 1]
    dst_cy = dst_bbox[:, 0] + 0.5 * dst_h
    dst_cx = dst_bbox[:, 1] + 0.5 * dst_w

    # Guard the divisions below against zero-sized source boxes.
    src_h = K.maximum(src_h, tiny)
    src_w = K.maximum(src_w, tiny)

    # Offset formulas:
    # tx =(x−xa)/wa, ty =(y−ya)/ha, tw = log(w/wa), th = log(h/ha)
    t_y = (dst_cy - src_cy) / src_h
    t_x = (dst_cx - src_cx) / src_w
    t_h = K.log(dst_h / src_h)
    t_w = K.log(dst_w / src_w)

    # Stack into shape (R, 4).
    stacked = K.stack((t_y, t_x, t_h, t_w), axis=0)
    return K.transpose(stacked)
207 | 
208 | 
def get_iou(bbox_base, bbox_target):
    """Compute the IoU (Intersection over Union) between two sets of boxes.

    https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/

    Every box of ``bbox_base`` is compared with every box of ``bbox_target``.

    Args:
        bbox_base (ndarray): Reference boxes, shape :math:`(N, 4)`.
            The second axis holds
            :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
        bbox_target (ndarray): Boxes to compare, shape :math:`(K, 4)`,
            same layout.

    Returns:
        ndarray: IoU values of shape :math:`(N, K)`.

    Raises:
        IndexError: if the second axis of either input is not of length 4.
    """
    if not (bbox_base.shape[1] == 4 and bbox_target.shape[1] == 4):
        raise IndexError

    # Inserting an axis into bbox_base makes broadcasting evaluate all
    # base/target pairs; both corner arrays have shape (N, K, 2).
    top_left = np.maximum(bbox_base[:, None, :2], bbox_target[:, :2])
    bottom_right = np.minimum(bbox_base[:, None, 2:], bbox_target[:, 2:])

    # (h, w) of the intersection; pairs that do not overlap
    # (bottom-right <= top-left) get an area of 0.
    overlaps = np.all(bottom_right > top_left, axis=2).astype('float32')
    intersection = np.prod(bottom_right - top_left, axis=2) * overlaps

    base_area = np.prod(bbox_base[:, 2:] - bbox_base[:, :2], axis=1)
    target_area = np.prod(bbox_target[:, 2:] - bbox_target[:, :2], axis=1)
    union = base_area[:, None] + target_area - intersection
    return intersection / union
250 | 
251 | 
def get_iou_K(bbox_base, bbox_target):
    """Keras-backend version of ``get_iou``.

    https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/

    Every box of ``bbox_base`` is compared with every box of ``bbox_target``.

    Args:
        bbox_base (tensor): Reference boxes, shape :math:`(N, 4)`.
            The second axis holds
            :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
        bbox_target (tensor): Boxes to compare, shape :math:`(K, 4)`,
            same layout.

    Returns:
        tensor: IoU values of shape :math:`(N, K)`.

    Raises:
        IndexError: if the second axis of either input is not of length 4.
    """
    if not (bbox_base.shape[1] == 4 and bbox_target.shape[1] == 4):
        raise IndexError

    # Inserting an axis into bbox_base makes broadcasting evaluate all
    # base/target pairs; both corner tensors have shape (N, K, 2).
    top_left = K.maximum(bbox_base[:, None, :2], bbox_target[:, :2])
    bottom_right = K.minimum(bbox_base[:, None, 2:], bbox_target[:, 2:])

    # (h, w) of the intersection; pairs that do not overlap
    # (bottom-right <= top-left) get an area of 0.
    overlaps = K.cast(K.all(bottom_right > top_left, axis=2), 'float32')
    intersection = K.prod(bottom_right - top_left, axis=2) * overlaps

    base_area = K.prod(bbox_base[:, 2:] - bbox_base[:, :2], axis=1)
    target_area = K.prod(bbox_target[:, 2:] - bbox_target[:, :2], axis=1)
    union = base_area[:, None] + target_area - intersection
    return intersection / union
293 | 


--------------------------------------------------------------------------------
/xrcnn/util/coco_dataset.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import logging
  3 | import random
  4 | import cv2
  5 | import numpy as np
  6 | from pycocotools.coco import COCO
  7 | 
  8 | import xrcnn.util.image as image
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
 12 | 
 13 | class Generator:
 14 |     def __init__(self, config, data_root_dir, data_type='val2017',
 15 |                  target_category_names=['person']):
 16 |         # TODO ['person']以外が指定された場合にカテゴリIDからone_hot化しているところが破綻する。要改善。
 17 |         """
 18 |             Args:
 19 |                 category_names
 20 |                 ['person']のみの指定、もしくは指定なし（全カテゴリ）のみサポート。
 21 |         カテゴリIDとカテゴリ名の対応は以下の通り
 22 |         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
 23 |         22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
 24 |         43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
 25 |         62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
 26 |         85, 86, 87, 88, 89, 90]
 27 |         ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
 28 |         'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
 29 |         'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
 30 |         'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
 31 |          'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
 32 |          'sports ball', 'kite', 'baseball bat', 'baseball glove',
 33 |          'skateboard', 'surfboard', 'tennis racket',
 34 |             'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
 35 |             'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
 36 |             'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
 37 |             'potted plant', 'bed', 'dining table', 'toilet', 'tv',
 38 |             'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
 39 |             'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
 40 |             'book', 'clock', 'vase', 'scissors', 'teddy bear',
 41 |             'hair drier', 'toothbrush']
 42 |         """
 43 |         self.config = config
 44 |         self.data_root_dir = data_root_dir
 45 |         self.annotation_path = '{}/annotations/instances_{}.json'.format(
 46 |             data_root_dir, data_type)
 47 |         self.image_dir_path = '{}/{}/'.format(data_root_dir, data_type)
 48 |         self.coco = COCO(self.annotation_path)
 49 |         self._target_category_names = target_category_names
 50 |         # 背景＝0を追加
 51 |         self._category_ids = self.coco.getCatIds(
 52 |             catNms=self._target_category_names)
 53 | 
 54 |     def get_labels(self):
 55 |         cats = self.coco.loadCats(self._category_ids)
 56 |         # 背景を追加
 57 |         return ['__background__'] + [cat['name'] for cat in cats]
 58 | 
 59 |     def get_label_ids(self):
 60 |         return [0] + self._category_ids
 61 | 
 62 |     def _get_all_image_ids(self):
 63 |         # annotationが存在するイメージのみに限定。
 64 |         image_ids = set()
 65 |         for id in self.get_label_ids():
 66 |             image_ids |= set(self.coco.getImgIds(catIds=id))
 67 |         return image_ids
 68 | 
 69 |     def _get_metas(self, image_ids):
 70 |         """ load COCO dataset.
 71 | 
 72 |             Returns:
 73 |                 annotation
 74 |                   |- images
 75 |                         |- filepath
 76 |                         |- size : (width, height)
 77 |                         |- objects
 78 |                             |- label_id（COCOは1オリジン。0は背景を示す。）
 79 |                             |- iscrowd
 80 |                             |- bbox : (ymin, xmin, ymax, xmax)
 81 |                             |- mask : [width, height, 1] バイナリマスク
 82 |         """
 83 |         img_metas = self.coco.loadImgs(ids=image_ids)
 84 |         metas = []
 85 |         for image_meta in img_metas:
 86 |             # {'license': 1,
 87 |             # 'file_name': '000000002685.jpg',
 88 |             # 'coco_url':
 89 |             #   'http://images.cocodataset.org/val2017/000000002685.jpg',
 90 |             # 'height': 555,
 91 |             # 'width': 640,
 92 |             # 'date_captured': '2013-11-25 19:10:39',
 93 |             # 'flickr_url':
 94 |             #   'http://farm9.staticflickr.com/8535/8710326856_2aac3d36fb_z.jpg',
 95 |             # 'id': 2685}
 96 |             # iscrowd=Falseで群衆を含むデータを除く
 97 |             image_id = image_meta['id']
 98 |             a_ids = self.coco.getAnnIds(imgIds=image_id, iscrowd=False)
 99 |             annotations = self.coco.loadAnns(a_ids)
100 | 
101 |             if len(annotations) > 0:
102 |                 image_path = os.path.join(
103 |                     self.image_dir_path, image_meta['file_name'])
104 |                 width = int(image_meta['width'])
105 |                 height = int(image_meta['height'])
106 |                 meta = {'image_path': image_path,
107 |                         'size': (width, height),
108 |                         'objects': []}
109 | 
110 |                 for annotation in annotations:
111 |                     # {'segmentation': [[574.96, ..., 440.26]],
112 |                     # 'area': 36959.305749999985,
113 |                     # 'iscrowd': 0,
114 |                     # 'image_id': 2685,
115 |                     # 'bbox': [315.54, 56.12, 323.02, 384.14],
116 |                     # 'category_id': 1,
117 |                     # 'id': 1226144}
118 |                     label_id = annotation['category_id']
119 |                     # 対象外のカテゴリIDは除外
120 |                     if label_id not in self.get_label_ids():
121 |                         continue
122 | 
123 |                     # バイナリマスク
124 |                     mask = self.coco.annToMask(annotation)
125 | 
126 |                     # bboxはmask要素から抽出
127 |                     # annotationのbboxがmaskよりも小さい事があるみたい
128 |                     idx = np.where(mask == 1)
129 |                     bbox = np.array([np.min(idx[0]), np.min(idx[1]),
130 |                                      np.max(idx[0]), np.max(idx[1])])
131 | 
132 |                     meta['objects'].append({
133 |                         'label_id': label_id,
134 |                         'bbox': bbox,
135 |                         'mask': mask
136 |                     })
137 | 
138 |                 # 対象とするオブジェクトを含む情報のみ追加する
139 |                 if len(meta['objects']) > 0:
140 |                     metas.append(meta)
141 |                 else:
142 |                     logger.warn("image_id[%s] has no object." % image_id)
143 | 
144 |             else:
145 |                 logger.warn("image_id[%s] has no object." % image_id)
146 | 
147 |         logger.debug("load_image_meta: %s", metas)
148 |         return metas
149 | 
150 |     def generate(self, anchor, n_max=None, image_ids=None,
151 |                  include_mask=True):
152 |         """
153 |         VOCイメージセットをkeras.model.fit_generatorで使用するgeneratorの形式で取得する。
154 |             Args:
155 |             Returns:
156 | 
157 |         """
158 |         if image_ids is None:
159 |             image_ids = list(self._get_all_image_ids())
160 | 
161 |         random.shuffle(image_ids)
162 | 
163 |         # 複数GPU利用の場合は入力データが各GPUに均等に配分される。
164 |         # モデル内でconfig.batch_size＝バッチサイズとして実装しているところがあるので、
165 |         # GPU数毎に入力データがconfig.batch_sizeとなるよう掛けておく。
166 |         batch_size = self.config.batch_size * self.config.gpu_count
167 |         batch_count = 0
168 |         head_trainable = self.config.training_mode in ['head_only', 'all']
169 |         while True:
170 |             for image_id in image_ids:
171 |                 try:
172 |                     if batch_count == 0:
173 |                         images = []
174 |                         rpn_offsets = []
175 |                         rpn_fbs = []
176 |                         bboxes = []
177 |                         labels = []
178 |                         masks = []
179 | 
180 |                     metas = self._get_metas([image_id])
181 |                     if len(metas) == 0:
182 |                         continue
183 |                     meta = metas[0]
184 | 
185 |                     logger.info("loaded image: %s", meta['image_path'])
186 |                     # 画像を規定のサイズにリサイズ。
187 |                     img = image.load_image_as_ndarray(meta['image_path'])
188 |                     # モノクロ（2次元データ）は除く
189 |                     # val2017/000000061418.jpg など
190 |                     if(len(img.shape) < 3):
191 |                         logger.warn("skip 2 dim image: %s", meta['image_path'])
192 |                         continue
193 | 
194 |                     img, window, scale = image.resize_with_padding(
195 |                         img,
196 |                         self.config.image_min_size,
197 |                         self.config.image_max_size)
198 |                     logger.debug("window, scale: %s, %s", window, scale)
199 |                     # ランダムにflip
200 |                     img, flip_x, flip_y = image.random_flip(img)
201 | 
202 |                     # 画像毎のオブジェクト数は固定にする。複数画像を1つのテンソルにおさめるため。
203 |                     bb = np.zeros([self.config.n_max_gt_objects_per_image, 4])
204 |                     bb_raw = []
205 |                     lb = np.zeros([self.config.n_max_gt_objects_per_image])
206 |                     mk = np.zeros([self.config.n_max_gt_objects_per_image,
207 |                                    self.config.image_max_size,
208 |                                    self.config.image_max_size])
209 |                     mk_raw = []
210 |                     # bbox, maskもリサイズ＆flip
211 |                     for i, obj in enumerate(meta['objects']):
212 |                         b = image.flip_bbox(
213 |                             image.resize_bbox(obj['bbox'], window[:2], scale),
214 |                             img.shape[:2], flip_x, flip_y)
215 |                         # boxサイズが小さすぎるオブジェクトは除外
216 |                         h, w = b[2] - b[0], b[3] - b[1]
217 |                         if h <= self.config.ignore_box_size \
218 |                                 or w <= self.config.ignore_box_size:
219 |                             continue
220 |                         bb_raw.append(b)
221 | 
222 |                         m = image.flip_mask(
223 |                             image.resize_mask(obj['mask'],
224 |                                               self.config.image_min_size,
225 |                                               self.config.image_max_size),
226 |                             flip_x, flip_y)
227 |                         mk_raw.append(m)
228 | 
229 |                         lb[i] = obj['label_id']
230 | 
231 |                     # 有効なラベルが1つもないデータは無効なので返却しない
232 |                     if not np.any(lb > 0):
233 |                         continue
234 | 
235 |                     # RPN向けのGTをまとめる
236 |                     of, fb = anchor.generate_gt_offsets(
237 |                         np.array(bb_raw), self.config.image_shape[:2])
238 |                     logger.debug("shapes: offset: %s, fb: %s", of.shape,
239 |                                  fb.shape)
240 | 
241 |                     images.append(img)
242 |                     rpn_offsets.append(of)
243 |                     rpn_fbs.append(fb)
244 | 
245 |                     bb[:len(bb_raw), :] = bb_raw
246 |                     bboxes.append(bb)
247 | 
248 |                     mk[:len(mk_raw), :] = mk_raw
249 |                     masks.append(mk)
250 | 
251 |                     labels.append(lb)
252 | 
253 |                     if np.any(np.argwhere(np.isnan(of))):
254 |                         logger.error("nanを含むオフセットを検出！スキップします。")
255 |                         continue
256 | 
257 |                     batch_count += 1
258 |                     if batch_count >= batch_size:
259 |                         batch_count = 0
260 |                         inputs = [np.array(images), np.array(rpn_offsets),
261 |                                   np.array(rpn_fbs)]
262 |                         if head_trainable:
263 |                             inputs += [np.array(bboxes), np.array(labels)]
264 |                             if include_mask:
265 |                                 inputs += [np.array(masks)]
266 |                         yield inputs, []
267 | 
268 |                 except ValueError as e:
269 |                     logger.error("想定外エラー")
270 |                     logger.error(e)
271 |                     continue
272 | 
273 |     def show_images(self, anchor, n_max=None, image_ids=None):
274 |         iter = self.generate(anchor, n_max, image_ids)
275 |         for data, _ in iter:
276 |             img = data[0][0]
277 |             # rgb -> bgr
278 |             img = np.flip(img, axis=2).astype(np.uint8)
279 |             boxes = data[3][0]
280 |             masks = data[5][0]
281 |             # 0パディングした行は除く
282 |             idx_pos = np.where(np.any(boxes, axis=1))[0]
283 |             boxes = boxes[idx_pos]
284 |             masks = masks[idx_pos]
285 |             c = [i for i in range(255)[::(255 // boxes.shape[0] - 1)]]
286 |             i = 0
287 |             for bbox, mask in zip(boxes, masks):
288 |                 bbox = bbox.astype(np.uint8)
289 |                 mask = mask.astype(np.uint8)
290 |                 color = (c[i], c[::-1][i], 0)
291 |                 # bbox
292 |                 cv2.rectangle(img, (bbox[1], bbox[0]),
293 |                               (bbox[3], bbox[2]), color)
294 |                 # # mask
295 |                 # mask_img = np.zeros(img.shape, img.dtype)
296 |                 # mask_img[:, :] = color
297 |                 mask = np.dstack([mask, mask, mask])
298 |                 mask[:, :, 0][mask[:, :, 0] == 1] = color[0]
299 |                 mask[:, :, 1][mask[:, :, 1] == 1] = color[1]
300 |                 mask[:, :, 2][mask[:, :, 2] == 1] = color[2]
301 |                 # mask_img = cv2.bitwise_and(mask_img, mask_img, mask=mask)
302 |                 cv2.addWeighted(mask, 1, img, 1, 0, img)
303 |                 i += 1
304 |             cv2.imshow('img', img)
305 |             cv2.waitKey(0)
306 | 


--------------------------------------------------------------------------------
/xrcnn/util/anchor.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from xrcnn.util import bbox as B
  3 | 
  4 | 
  5 | class Anchor:
  6 |     def __init__(self, config):
  7 |         # def __init__(self, base_size=16,
  8 |         #              anchor_ratios=[
  9 |         #                  (1. / math.sqrt(2), 2. / math.sqrt(2)),
 10 |         #                  (1., 1.),
 11 |         #                  (2. / math.sqrt(2), 1. / math.sqrt(2))],
 12 |         #              anchor_scales=[128 / 4, 256 / 4, 512 / 4],
 13 |         #              backbone_shape=[64 / 4, 64 / 4]):
 14 |         """RoI予測の基準となるアンカーを生成する。
 15 |         アンカーの基準となる値を指定する。
 16 | 
 17 |         Args:
 18 |             base_size (number): アンカーを適用する特徴マップ1ピクセルが、入力画像において何ピクセルに値するか。
 19 |             anchor_ratios (list of float): アンカーのアスペクト比。
 20 |                 :math:`[(h, w), ...]`
 21 |             anchor_scales (list of numbers): アンカーのサイズ（入力画像におけるサイズ）。
 22 |                 このサイズの正方形をアンカーの領域とする。
 23 |             anchor_ratios (list of numbers): アンカーのアスペクト比
 24 |         """
 25 |         self.base_size = config.stride_per_base_nn_feature
 26 |         self.backbone_shape = config.backbone_shape
 27 |         self.anchor_ratios = config.anchor_box_aspect_ratios
 28 |         self.anchor_scales = config.anchor_box_scales
 29 |         self.bbox_refinement_std = config.bbox_refinement_std
 30 |         self.anchor_base = self._anchor_base(
 31 |             self.base_size, self.anchor_ratios, self.anchor_scales)
 32 |         self.anchors = self._generate_anchors(self.backbone_shape)
 33 | 
 34 |     def generate_gt_offsets(self, bbox_gt, img_size,
 35 |                             pos_iou_thresh=0.5,
 36 |                             neg_iou_thresh=0.3,
 37 |                             n_max_sample=256,
 38 |                             pos_ratio=0.5):
 39 |         """anchorにGroud truthなBBoxを適用し、anchor毎に最もIoUが大きいBBoxを特定し、そのBBoxとのオフセットを得る。
 40 |         IoU値により、各アンカーを以下に分類する。
 41 |             0.7以上：オブジェクト
 42 |                 →0.5にする。
 43 |                     0.7だとVOCdevkit/VOC2007/Annotations/007325.xmlにあるようなサイズのBboxが
 44 |                     GTとして得られなかったため。
 45 |             0.3未満：非オブジェクト
 46 |             それ以外：評価対象外。つまり、トレーニングには使わないアンカー。
 47 | 
 48 |         Args:
 49 |             bbox_gt (array): Ground truthなBBox
 50 |                 Its shape is :math:`(R, 4)`.
 51 |             img_size (h，w): 入力画像の高さと幅のタプル.
 52 |             pos_iou_thresh: この値以上のIoUをclass=1とする。
 53 |             pos_iou_thresh: この値未満のIoUをclass=0とする。
 54 |             n_max_sample: 評価対象とする（classが1or0である）オフセットの上限
 55 |             pos_ratio: 評価対象サンプル中のPositiveの割合
 56 |                 n_max_sample, pos_ratioは論文中の以下への対応。
 57 |                 考慮無しではNegativeサンプルが支配的になる。学習効率も考慮し、このような処理を行うものと思われる。
 58 |                 Each mini-batch arises from a single image that contains many
 59 |                 positive and negative example anchors. It is possible to
 60 |                 optimize for the loss functions of all anchors,
 61 |                 but this will bias towards negative samples as they are
 62 |                 dominate. Instead, we randomly sample 256 anchors in an image
 63 |                 to compute the loss function of a mini-batch, where the sampled
 64 |                  positive and negative anchors have a ratio of up to 1:1.
 65 |                  If there are fewer than 128 positive samples in an image,
 66 |                  we pad the mini-batch with negative ones.
 67 | 
 68 |         Returns:
 69 |             (offsets, obj_flags):
 70 | 
 71 |             offsets (array) : 各アンカーとGround TruthなBBoxとのオフセット。
 72 |                 Its shape is :math:`(S, 4)`.
 73 |                 2軸目の内容は以下の通り。
 74 |                 (x, y ,h, w)
 75 |             objects (array): 各アンカーがオブジェクトか否か。
 76 |                 Its shape is :math:`(S, 1)`.
 77 |                 2軸目の内容は以下の通り。
 78 |                     1：オブジェクト
 79 |                     0：非オブジェクト
 80 |                     −1：評価対象外
 81 |         """
 82 | 
 83 |         h, w = img_size
 84 |         anchor = self.anchors
 85 |         n_anchor_initial = len(anchor)
 86 | 
 87 |         # 入力領域をはみ出すアンカーを除外
 88 |         index_inside = np.where(
 89 |             (anchor[:, 0] >= 0) &
 90 |             (anchor[:, 1] >= 0) &
 91 |             (anchor[:, 2] <= h) &
 92 |             (anchor[:, 3] <= w)
 93 |         )[0]
 94 |         anchor = anchor[index_inside]
 95 | 
 96 |         # 各アンカー毎にGTとのIoUを算出し、最大か0.7以上のIoUを残す。
 97 |         # IoU >= 0.7はオブジェクト候補とする（class = 1）
 98 |         # IoU < 0.3は非オブジェクト候補とする（class = 0）
 99 |         # それ以外のIoUは評価対象外とする（class = -1）
100 |         argmax_ious, objects = self._create_label(anchor, bbox_gt,
101 |                                                   pos_iou_thresh,
102 |                                                   neg_iou_thresh,
103 |                                                   n_max_sample,
104 |                                                   pos_ratio)
105 |         # アンカーとGroud truthのオフセットを得る。
106 |         offsets = B.get_offset(anchor, bbox_gt[argmax_ious])
107 |         # 既存実装に合わせた精度向上
108 |         offsets /= np.array(self.bbox_refinement_std)
109 | 
110 |         # 元の形状に戻す。
111 |         # index_insideに削減した1次元目の次元数をn_anchor_initialに戻す。
112 |         # 復元した座標は評価対象外なので、ラベルは−1、オフセットは0を設定して無効な状態に。
113 |         objects = self._unmap(objects, n_anchor_initial, index_inside, fill=-1)
114 |         offsets = self._unmap(offsets, n_anchor_initial, index_inside, fill=0)
115 | 
116 |         return offsets, objects
117 | 
118 |     def _create_label(self, anchor, bbox, pos_iou_thresh, neg_iou_thresh,
119 |                       n_max_sample, pos_ratio):
120 |         """
121 |         anchorとbboxのIoUを算出し、それぞれオブジェクト候補か否かを得る。
122 |         IoU >= 0.7はオブジェクト候補とする（class = 1）
123 |         IoU < 0.3は非オブジェクト候補とする（class = 0）
124 |         それ以外のIoUは評価対象外とする（class = -1）
125 | 
126 |         anchor毎に全bboxについてのIoUを算出する。
127 |         つまり、(len(anchor), len(bbox))のマトリクスになる。
128 |         このマトリクスから、anchor毎に最大のIoUを含むbboxのindexを得る。
129 | 
130 |         Args:
131 |             anchor (tensor): アンカー
132 |                 Its shape is :math:`(R, 4)`.
133 |             bbox (tensor): Ground truthなBBox
134 |                 Its shape is :math:`(S, 4)`.
135 |             pos_iou_thresh: この値以上のIoUをclass=1とする。
136 |             pos_iou_thresh: この値未満のIoUをclass=0とする。
137 |             n_max_sample: 評価対象とする（classが1or0である）オフセットの上限
138 |             pos_ratio: 評価対象サンプル中のPositiveの割合
139 | 
140 |         Returns:
141 |             (index_max_iou_per_anchor, label)
142 |             index_max_iou_per_anchor: anchor毎のIoUが最大となるbboxのIndex。
143 |                 Its shape is :math:`(R, 1)`.
144 |             label:anchor毎のオブジェクト／非オブジェクト
145 |                 Its shape is :math:`(R, 1)`.
146 | 
147 |         """
148 |         # 評価対象外の−1で初期化
149 |         label = np.full((len(anchor)), -1)
150 | 
151 |         # アンカー毎にIoUが最大となるbboxの列Indexとその値、最大のIoUを含むアンカーのIndexを得る。
152 |         index_max_iou_per_anchor, max_ious, gt_argmax_ious = self._calc_ious(
153 |             anchor, bbox)
154 | 
155 |         # 最大のIoUを含むアンカーはPositive
156 |         label[gt_argmax_ious] = 1
157 | 
158 |         # 閾値以上のIoUはPositive
159 |         label[max_ious >= pos_iou_thresh] = 1
160 | 
161 |         # 閾値未満のIoUはNegative
162 |         label[max_ious < neg_iou_thresh] = 0
163 | 
164 |         # Positiveのサンプル数を上限以内に抑える
165 |         n_pos_max = int(pos_ratio * n_max_sample)
166 |         pos_index = np.where(label == 1)[0]
167 |         if len(pos_index) > n_pos_max:
168 |             # n_pos_maxを超える場合は、Positiveをランダムに評価対象外にする
169 |             disable_index = np.random.choice(
170 |                 pos_index, size=(len(pos_index) - n_pos_max), replace=False)
171 |             label[disable_index] = -1
172 | 
173 |         # Negativeサンプルも同様に上限以内に抑える
174 |         n_neg = n_max_sample - np.sum(label == 1)
175 |         neg_index = np.where(label == 0)[0]
176 |         if len(neg_index) > n_neg:
177 |             disable_index = np.random.choice(
178 |                 neg_index, size=(len(neg_index) - n_neg), replace=False)
179 |             label[disable_index] = -1
180 | 
181 |         return index_max_iou_per_anchor, label
182 | 
183 |     def _calc_ious(self, anchor, bbox):
184 |         # anchor毎に全bboxとのIoUを得る。
185 |         ious = B.get_iou(anchor, bbox)
186 |         # anchor毎に最大のIoUが格納されている列Indexを得る。
187 |         argmax_ious = ious.argmax(axis=1)
188 |         # argmax_iousが示すIndexの実数、つまりアンカー毎の最大のIoUを得る。
189 |         max_ious = ious[np.arange(ious.shape[0]), argmax_ious]
190 | 
191 |         # IoUが最大となるアンカーのIndexを特定する
192 |         # 以下はchainercvに於ける実装だが、これだと全てのBBoxとのIoUが0の
193 |         # アンカーについてもgt_argmax_iousに含まれそう。。。つまり全てPositive扱いになる。
194 |         # 論文に従い、最大IoUのアンカーのみを特定する。
195 |         # gt_argmax_ious = ious.argmax(axis=0)
196 |         # gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
197 |         # gt_argmax_ious = np.where(ious == gt_max_ious)[0]
198 |         gt_argmax_ious = np.where(ious == ious.max())[0]
199 | 
200 |         return argmax_ious, max_ious, gt_argmax_ious
201 | 
202 |     def _unmap(self, data, count, index, fill=0):
203 |         # 元の形状に戻す。
204 | 
205 |         if len(data.shape) == 1:
206 |             ret = np.empty((count,), dtype=data.dtype)
207 |             ret.fill(fill)
208 |             ret[index] = data
209 |         else:
210 |             ret = np.empty((count,) + data.shape[1:], dtype=data.dtype)
211 |             ret.fill(fill)
212 |             ret[index, :] = data
213 |         return ret
214 | 
215 |     def _generate_anchors(self, feature_shape):
216 |         """特徴マップの各ピクセル毎のアンカーを生成する。
217 | 
218 |         Args:
219 |             feature_shape: 特徴マップの高さと幅のタプル
220 |                 (h, w)
221 | 
222 |         Returns:
223 |             ndarray
224 |             形状は以下の通り。
225 |             (len(feature_height) * len(feature_width)
226 |                 * len(self.anchor_ratios) * len(self.anchor_scales), 4)
227 |             1軸目は「特徴マップの行」→「特徴マップの列」→「アスペクト比の順」→「アンカーサイズ」で並ぶ。
228 |             例：
229 |             2軸目に格納される座標の形状は以下の通り。
230 |             : math: `(y_{min}, x_{min}, y_{max}, x_{max})`
231 | 
232 |         """
233 | 
234 |         feature_height, feature_width = feature_shape
235 |         # フィーチャマップの全ピクセルを示す交点座標
236 |         shift_y = np.arange(0, feature_height * self.base_size, self.base_size)
237 |         shift_x = np.arange(0, feature_width * self.base_size, self.base_size)
238 |         shift_x, shift_y = np.meshgrid(shift_x, shift_y)
239 |         # 交点毎にself._anchor_baseを加算することで交点毎のアンカーを算出したい。
240 |         # 各交点のアンカーのベースとなる座標を求める
241 |         shift = np.stack((shift_y.flatten(), shift_x.flatten(),
242 |                           shift_y.flatten(), shift_x.flatten()), axis=1)
243 |         # np.arange(0, 5, 1)で以下のようになる。
244 |         # >>> shift_y
245 |         # array([[0, 0, 0, 0, 0],
246 |         #        [1, 1, 1, 1, 1],
247 |         #        [2, 2, 2, 2, 2],
248 |         #        [3, 3, 3, 3, 3],
249 |         #        [4, 4, 4, 4, 4]])
250 |         # >>> shift_x
251 |         # array([[0, 1, 2, 3, 4],
252 |         #        [0, 1, 2, 3, 4],
253 |         #        [0, 1, 2, 3, 4],
254 |         #        [0, 1, 2, 3, 4],
255 |         #        [0, 1, 2, 3, 4]])
256 |         # >>> shift
257 |         # array([[0, 0, 0, 0],
258 |         #        [0, 1, 0, 1],
259 |         #        [0, 2, 0, 2],
260 |         #        [0, 3, 0, 3],
261 |         #        [0, 4, 0, 4],
262 |         #        [1, 0, 1, 0],
263 |         #        [1, 1, 1, 1],
264 |         #        [1, 2, 1, 2],
265 |         #        [1, 3, 1, 3],
266 |         #        [1, 4, 1, 4],
267 |         #        [2, 0, 2, 0],
268 |         #        [2, 1, 2, 1],
269 |         #        [2, 2, 2, 2],
270 |         #        [2, 3, 2, 3],
271 |         #        [2, 4, 2, 4],
272 |         #        [3, 0, 3, 0],
273 |         #        [3, 1, 3, 1],
274 |         #        [3, 2, 3, 2],
275 |         #        [3, 3, 3, 3],
276 |         #        [3, 4, 3, 4],
277 |         #        [4, 0, 4, 0],
278 |         #        [4, 1, 4, 1],
279 |         #        [4, 2, 4, 2],
280 |         #        [4, 3, 4, 3],
281 |         #        [4, 4, 4, 4]])
282 | 
283 |         n_a = self.anchor_base.shape[0]
284 |         n_s = shift.shape[0]
285 |         # 各交点毎にアンカーの座標を求める。
286 |         # まずはそのために次元を調整。
287 |         # (len(feature_height) * len(feature_width), 1, 4)にする。
288 |         # 上記5*5の例であれば、(25,1,4)
289 |         shift = np.transpose(np.reshape(shift, (1, n_s, 4)), (1, 0, 2))
290 |         # (1, len(self.anchor_ratios) * len(self.anchor_scales), 4)にする。
291 |         # 上記5*5の例であれば、(1,9,4)
292 |         anchor = np.reshape(self.anchor_base, (1, n_a, 4))
293 |         # shift + anchorにより、shift[n, :, :]とanchor[:, k, :]の組合せが得られる。
294 |         # つまり、各交点毎にanchor_baseを加算した結果が得られる。
295 |         # 結果として得られるテンソルの形状は以下の通り。
296 |         # (len(feature_height) * len(feature_width),
297 |         #   len(self.anchor_ratios) * len(self.anchor_scales), 4)
298 |         # 上記5*5の例であれば、(25,9,4)
299 |         anchor = shift.astype(float) + anchor
300 | 
301 |         # 上記を以下の形状に変換する。
302 |         # (len(feature_height) * len(feature_width)
303 |         #   * len(self.anchor_ratios) * len(self.anchor_scales), 4)
304 |         anchor = np.reshape(anchor, (n_s * n_a, 4))
305 |         return anchor.astype('float32')
306 | 
307 |     def _anchor_base(self, base_size, anchor_ratios, anchor_scales):
308 |         """基準となるアンカーを生成する。
309 |         ratiosとanchor_scales毎にアンカーを示す座標（矩形の左上と右下の座標）を返す。
310 |         矩形の中心は(base_size / 2, base_size / 2)とする。（論文に合わせ、受容野の中心とする）
311 | 
312 |         Args:
313 |             base_size(number): アンカーを適用する特徴マップ1ピクセルが、入力画像において何ピクセルに値するか。
314 |             anchor_ratios(list of float): アンカーのアスペクト比。
315 |                 : math: `[(h, w), ...]`
316 |             anchor_scales(list of numbers): アンカーのサイズ（入力画像におけるサイズ）。
317 |                 このサイズの正方形をアンカーの領域とする。
318 | 
319 |         Returns:
320 |             numpy配列
321 |             形状は以下の通り。
322 |             (len(anchor_ratios) * len(anchor_scales), 4)
323 |             2軸目に格納される座標の形状は以下の通り。
324 |             : math: `(y_{min}, x_{min}, y_{max}, x_{max})`
325 | 
326 |         """
327 |         # 受容野の中心を指定
328 |         py = base_size / 2.
329 |         px = base_size / 2.
330 | 
331 |         anchor_base = np.zeros((len(anchor_ratios) * len(anchor_scales), 4),
332 |                                dtype=np.float32)
333 |         for i in range(len(anchor_ratios)):
334 |             for j in range(len(anchor_scales)):
335 |                 h = anchor_scales[j] * anchor_ratios[i][0]
336 |                 w = anchor_scales[j] * anchor_ratios[i][1]
337 | 
338 |                 index = i * len(anchor_scales) + j
339 |                 # 矩形右上の座標
340 |                 anchor_base[index, 0] = py - h / 2.
341 |                 anchor_base[index, 1] = px - w / 2.
342 |                 # 矩形左上の座標
343 |                 anchor_base[index, 2] = py + h / 2.
344 |                 anchor_base[index, 3] = px + w / 2.
345 |         return anchor_base.astype('float32')
346 | 


--------------------------------------------------------------------------------
/xrcnn/frcnn.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | from keras.applications.vgg16 import VGG16
  3 | from keras.applications.resnet50 import ResNet50
  4 | from keras.layers import Input, TimeDistributed, Lambda, Activation, Dense, \
  5 |     Flatten, Reshape, Layer
  6 | from keras.layers.convolutional import Conv2D
  7 | from keras.models import Model
  8 | from keras.optimizers import Adam
  9 | from keras import backend as K
 10 | from keras.utils import multi_gpu_model
 11 | 
 12 | import tensorflow as tf
 13 | from xrcnn.batchnorm import BatchNorm
 14 | import xrcnn.loss as loss
 15 | from xrcnn.util import bbox
 16 | from xrcnn.util import log
 17 | from xrcnn.region_proposal_layer import RegionProposalLayer
 18 | from xrcnn.roi_align_layer import RoiAlignLayer
 19 | 
# Module-level logger, named after this module.
logger = getLogger(__name__)
 21 | 
 22 | 
 23 | class Frcnn:
 24 |     def __init__(self, anchors, config):
 25 |         self.anchors = anchors
 26 |         self.config = config
 27 | 
 28 |     def _model_backbone_plane(self):
 29 |         if self.config.backbone_nn_type == 'vgg':
 30 |             model = VGG16(weights='imagenet')
 31 |         else:
 32 |             model = ResNet50(weights='imagenet')
 33 |         return model
 34 | 
 35 |     def _model_backbone_headless(self):
 36 |         if self.config.backbone_nn_type == 'vgg':
 37 |             model = VGG16(weights='imagenet', include_top=False)
 38 |             # 畳み込み層の後のプーリング層を除く
 39 |             # https://github.com/keras-team/keras/issues/2371
 40 |             # https://github.com/keras-team/keras/issues/6229
 41 |             # http://forums.fast.ai/t/how-to-finetune-with-new-keras-api/2328/9
 42 |             model.layers.pop()
 43 |         else:
 44 |             model = ResNet50(weights='imagenet', include_top=False)
 45 |         # VGGの重みは学習対象外
 46 |         for layer in model.layers:
 47 |             layer.trainable = False
 48 |         output = model.layers[-1].output
 49 |         _input = model.input
 50 |         return _input, output
 51 | 
 52 |     def _nn_rpn(self, backbone, trainable):
 53 |         """Region Proporsal Network
 54 |         領域提案とオブジェクト推測が得られるNN。
 55 | 
 56 |         Args:
 57 |             backbone : 起点となるNNのレイヤ
 58 |             config: Config
 59 | 
 60 |         Returns:
 61 |             [rois, offsets, objects]
 62 |             rois: 領域提案
 63 |                 形状は以下の通り。
 64 |                     (N, n_anchor, 4)
 65 |                     3軸目は領域の左上と右下の座標。
 66 |                         (y1, x1, y2, x2)
 67 |             offsets: 領域提案とアンカーのオフセット
 68 |                 形状は以下の通り。
 69 |                     (N, n_anchor, 4)
 70 |                     3軸目は領域提案とアンカーのオフセット（中心、幅、高さ）。
 71 |                         (tx, ty, th, tw)
 72 |                         つまりアンカーがn個とすると、(tx0,ty0,th0,tw0,tx1,ty1, ... ,thn, twn)
 73 |                         それぞれの値は論文に記載の通り以下とする。
 74 |                             tx =(x−xa)/wa, ty =(y−ya)/ha,
 75 |                             tw = log(w/wa), th = log(h/ha)
 76 |                         ※それぞれ、アンカーからのオフセット
 77 |                         ※「x」は予測された領域の中心x、「xa」はアンカーの中心x。
 78 |             objects: オブエジェクト、非オブジェクトである確率
 79 |                 形状は以下の通り。
 80 |                     (N, n_anchor, 2)
 81 |                     [:, :, 0]をオブジェクトではない確率、[:, :, 1]をオブジェクトである確率とみなす。
 82 | 
 83 |         """
 84 |         # 中間層（ZFの場合は256-d、VGGの場合は512-d）
 85 |         shared = Conv2D(512, 3, padding='same', activation='relu',
 86 |                         kernel_initializer='he_uniform',
 87 |                         name='rpn_conv1', trainable=trainable)(backbone)
 88 |         # 領域座標提案
 89 |         offsets = Conv2D(self.config.n_anchor * 4, 1, padding='valid',
 90 |                          activation='linear',
 91 |                          kernel_initializer='he_uniform',
 92 |                          name='rpn_offsets', trainable=trainable)(shared)
 93 |         # (N, n_anchor, 4)の形状に変換
 94 |         offsets = Reshape([-1, 4], name='rpn_offsets_reshape',
 95 |                           trainable=trainable)(offsets)
 96 | 
 97 |         # オブジェクト判別
 98 |         obj = Conv2D(self.config.n_anchor * 2, 1, padding='valid',
 99 |                      activation='linear',
100 |                      kernel_initializer='glorot_uniform',
101 |                      name='rpn_objects_val', trainable=trainable)(shared)
102 |         # (N, n_anchor, 2)の形状に変換
103 |         obj_logit = Reshape([-1, 2], name='rpn_objects_reshape',
104 |                             trainable=trainable)(obj)
105 |         # オブジェクト／非オブジェクトを示す数値を確率に変換する
106 |         obj_prob = Activation('softmax', name='rpn_objects_prob',
107 |                               trainable=trainable)(obj_logit)
108 | 
109 |         # 領域提案
110 |         # 座標が0~1に正規化されている
111 |         normalized_rois = RegionProposalLayer(self.anchors, self.config,
112 |                                               name='region_proporsal_layer',
113 |                                               trainable=trainable)(
114 |             [backbone, offsets, obj_prob])
115 | 
116 |         return normalized_rois, offsets, obj_prob, obj_logit
117 | 
118 |     def _nn_head(self, backbone, region_proposal):
119 |         """Head Network
120 |         region_proposal*クラスラベル毎のオフセット予測、
121 |         オブジェクト毎の存在確率が得られるNN。
122 | 
123 |         Args:
124 |             backbone : 起点となるNNのレイヤ
125 |             region_proposal: RegionProposalLayerで得られた領域提案
126 |                 形状は以下の通り。
127 |                     (N, n_rois, 4)
128 |                     3軸目は領域の左上と右下の座標が0〜1に正規化されている。
129 |                         (y1, x1, y2, x2)
130 |         Returns:
131 |             [offsets, labels]
132 |             offsets: 領域提案からのオフセット
133 |                 形状は以下の通り。
134 |                     (N, n_rois, n_label, 4)
135 |                     3軸目は領域の中心、幅、高さが0〜1に正規化された値。
136 |                         (tx, ty, th, tw)
137 |             labels: クラスラベル毎の存在確率
138 |                 形状は以下の通り。
139 |                     (N, n_rois, n_label)
140 | 
141 |         """
142 |         #
143 |         n_label = self.config.n_dataset_labels
144 |         # RoI Align
145 |         # 論文ではRoiPoolingだが、より精度の高いRoiAlignにする。
146 |         out = RoiAlignLayer(self.config.roi_align_pool_shape, self.config,
147 |                             name='head_roi_align')(
148 |             [backbone, region_proposal])
149 | 
150 |         # FasterRCNN論文では4096だが、MaskRCNN論文では1024に削減している。
151 |         # 4096だとGPU(tesra K80)でOutOfMemoryになるので2048に減らしてみた。
152 |         # Resnetだど2048でもOutOfMemoryになるので1024にする。
153 |         if self.config.backbone_nn_type == 'vgg':
154 |             unit_size = 2048
155 |         else:
156 |             unit_size = 1024
157 | 
158 |         out = TimeDistributed(Flatten(name='head_flatten'))(out)
159 |         out = TimeDistributed(Dense(unit_size, kernel_initializer='he_uniform',
160 |                                     name='head_fc1'))(out)
161 |         out = TimeDistributed(BatchNorm(axis=1), name='head_fc1_bn')(out)
162 |         out = Activation('relu')(out)
163 | 
164 |         out = TimeDistributed(Dense(unit_size, kernel_initializer='he_uniform',
165 |                                     name='head_fc2'))(out)
166 |         out = TimeDistributed(BatchNorm(axis=1), name='head_fc2_bn')(out)
167 |         out = Activation('relu')(out)
168 | 
169 |         # 畳込みに置き換え
170 |         # out = TimeDistributed(Conv2D(1024, (self.config.roi_align_out_size,
171 |         #                                     self.config.roi_align_out_size),
172 |         #                              kernel_initializer='he_uniform',
173 |         #                              padding="valid"),
174 |         #                       name="head_conv1")(out)
175 |         # out = TimeDistributed(BatchNorm(axis=3), name='head_conv1_bn')(out)
176 |         # out = Activation('relu')(out)
177 |         # out = TimeDistributed(Conv2D(1024, (1, 1)),
178 |         #                       kernel_initializer='he_uniform',
179 |         #                       name="head_conv2")(out)
180 |         # out = TimeDistributed(BatchNorm(axis=3),
181 |         #                       name='head_conv2_bn')(out)
182 |         # out = Activation('relu')(out)
183 |         # out = Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2),
184 |         #              name="head_")(out)
185 | 
186 |         # region_proposal毎にそれぞれのクラスラベルの存在確率を得る。
187 |         # 形状は[N, n_region, n_label]
188 |         labels_logit = TimeDistributed(
189 |             Dense(n_label,
190 |                   kernel_initializer='glorot_uniform'),
191 |             name='head_label_val')(out)
192 |         labels_prob = Activation('softmax', name='head_label')(labels_logit)
193 | 
194 |         # region_proposal毎、クラスラベル毎にregion_proposalのBBoxからのオフセットを得る。
195 |         offsets = TimeDistributed(Dense(4 * n_label, activation='linear',
196 |                                         kernel_initializer='zero'),
197 |                                   name='head_offsets')(out)
198 |         # [N, n_rois, n_label, 4]に変形
199 |         offsets = Reshape([-1, n_label, 4],
200 |                           name='head_offsets_reshape')(offsets)
201 | 
202 |         return offsets, labels_prob, labels_logit
203 | 
204 |     def _build_model(self):
205 |         rpn_trainable = self.config.training_mode in ['rpn_only', 'all']
206 |         head_trainable = self.config.training_mode in ['head_only', 'all']
207 | 
208 |         # backbone network
209 |         backbone_in, backbone_out = self._model_backbone_headless()
210 | 
211 |         # rpn
212 |         normalized_rois, rpn_offsets, objects, objects_logit \
213 |             = self._nn_rpn(backbone_out, rpn_trainable)
214 | 
215 |         # 学習時のみ損失を計算
216 |         if self.config.training:
217 |             # 学習時
218 |             # 入力
219 |             input_gt_rois = Input(
220 |                 shape=[None, 4], name="input_gt_rois", dtype='float32')
221 |             input_gt_objects = Input(
222 |                 shape=[None], name="input_gt_objects", dtype='int32')
223 |             inputs = [backbone_in, input_gt_rois, input_gt_objects]
224 | 
225 |             losses = []
226 |             if rpn_trainable:
227 |                 # 損失計算
228 |                 # RPNの損失
229 |                 rpn_offsets_loss = Lambda(lambda x: loss.rpn_offsets_loss(*x),
230 |                                           name="rpn_offsets_loss")(
231 |                     [input_gt_rois, input_gt_objects, rpn_offsets])
232 |                 rpn_objects_loss = Lambda(lambda x: loss.rpn_objects_loss(*x),
233 |                                           name="rpn_objects_loss")(
234 |                     [input_gt_objects, objects])
235 | 
236 |                 losses += [rpn_offsets_loss, rpn_objects_loss]
237 | 
238 |             if head_trainable:
239 |                 input_gt_boxes = Input(
240 |                     shape=[None, 4], name="input_gt_boxes", dtype='float32')
241 |                 input_gt_label_ids = Input(
242 |                     shape=[None], name="input_gt_label_ids", dtype='int32')
243 |                 inputs += [input_gt_boxes, input_gt_label_ids]
244 | 
245 |                 # 正解データとRoIから評価対象のRoIを絞り込み、それに対応する正解データを得る。
246 |                 normalized_sample_rois, normalized_sample_gt_offsets, \
247 |                     sample_gt_labels \
248 |                     = SubsamplingRoiLayer(self.config,
249 |                                           name='subsampling_roi_and_gt')(
250 |                         [normalized_rois, input_gt_boxes, input_gt_label_ids])
251 |                 # 以下のようにoutput_shapeを直接指定するとIndexErrorが発生したので、
252 |                 # ↑のようにカスタムレイヤー化する
253 |                 # batch_size = K.shape(normalized_rois)[0]
254 |                 # sample_rois, sample_gt_offsets, sample_labels = \
255 |                 #     Lambda(lambda x: self._subsampling_roi_and_gt(*x),
256 |                 #            output_shape=[(batch_size, None, 4),
257 |                 #                          (batch_size, None, 4),
258 |                 #                          (batch_size, None)],
259 |                 #            name="subsampling_roi_and_gt")(
260 |                 #         [normalized_rois, input_gt_boxes,
261 |                 #         input_gt_label_ids])
262 | 
263 |                 # head
264 |                 head_offsets, labels, labels_logit\
265 |                     = self._nn_head(backbone_out, normalized_sample_rois)
266 | 
267 |                 # 損失計算
268 |                 # ヘッドの損失はModel#compileで損失関数を指定する方法では対応出来ないため、
269 |                 # Layerとして定義してModel#add_lossで加算する。
270 |                 head_offsets_loss = Lambda(lambda x:
271 |                                            loss.head_offsets_loss(*x),
272 |                                            name="head_offsets_loss")(
273 |                     [normalized_sample_gt_offsets, sample_gt_labels,
274 |                         head_offsets])
275 |                 head_labels_loss = Lambda(lambda x:
276 |                                           loss.head_labels_loss(*x),
277 |                                           name="head_labels_loss")(
278 |                     [sample_gt_labels, labels])
279 | 
280 |                 # 損失
281 |                 losses += [head_offsets_loss, head_labels_loss]
282 | 
283 |             # 出力＝損失
284 |             outputs = losses
285 | 
286 |         else:
287 |             # 予測時
288 |             # head
289 |             # head_offsetsは0〜1で正規化された値
290 |             head_offsets, labels, _ = self._nn_head(
291 |                 backbone_out, normalized_rois)
292 | 
293 |             # 予測時は損失不要
294 |             # ダミーの損失関数
295 |             dummy_loss = Lambda(lambda x: K.constant(0), name="dummy_loss")(
296 |                 [backbone_in])
297 |             losses = [dummy_loss, dummy_loss, dummy_loss]
298 |             inputs = [backbone_in]
299 |             # normalized_roisの正規化を戻した座標にhead_offsetを適用することでBBoxを得る。
300 |             outputs = [normalized_rois, head_offsets, labels,
301 |                        rpn_offsets, objects]
302 | 
303 |         model = Model(inputs=inputs, outputs=outputs, name='faser_r_cnn')
304 |         # Kerasは複数指定した損失の合計をモデル全体の損失として評価してくれる。
305 |         # 損失を追加
306 |         for output in losses:
307 |             model.add_loss(tf.reduce_mean(output, keep_dims=True))
308 |         return model, len(outputs)
309 | 
310 |     def compiled_model(self):
311 |         if self.config.gpu_count > 1:
312 |             # 複数GPUで並列処理
313 |             with tf.device('/cpu:0'):
314 |                 model, n_outputs = self._build_model()
315 |             model = multi_gpu_model(model, self.config.gpu_count)
316 |         else:
317 |             model, n_outputs = self._build_model()
318 | 
319 |         # compile()ではlossを指定しないが、空ではエラーになるためNoneのリストを指定する。
320 |         model.compile(optimizer=Adam(lr=self.config.learning_rate),
321 |                       loss=[None] * n_outputs)
322 |         return model
323 | 
324 | 
325 | class SubsamplingRoiLayer(Layer):
326 |     def __init__(self, config, **kwargs):
327 |         super(SubsamplingRoiLayer, self).__init__(**kwargs)
328 |         self.config = config
329 |         self.n_samples_per_batch = 64
330 | 
331 |     def call(self, inputs):
332 |         return self._subsampling(*inputs)
333 | 
334 |     def compute_output_shape(self, input_shape):
335 |         return [(self.config.batch_size, self.n_samples_per_batch, 4),
336 |                 (self.config.batch_size, self.n_samples_per_batch, 4),
337 |                 (self.config.batch_size, self.n_samples_per_batch)]
338 | 
    def _subsampling(self, normalized_rois, gt_bboxes, gt_labels,
                     pos_iou_thresh=0.5,
                     exclusive_iou_tresh=0.1,
                     pos_ratio=0.25):
        """Sample RoIs according to their IoU with the ground truth.

        RoIs whose best IoU is at least pos_iou_thresh are treated as
        objects (positives); at most n_samples_per_batch * pos_ratio of
        them are kept.  RoIs whose best IoU is below pos_iou_thresh but at
        least exclusive_iou_tresh are treated as non-objects (negatives).
        RoIs below exclusive_iou_tresh are ignored (the original author
        notes this corresponds to the "heuristic for hard example mining"
        mentioned in the paper).
        At most n_samples_per_batch RoIs are returned per image; when
        fewer are available the outputs are zero-padded up to
        n_samples_per_batch rows.

        The ground-truth label and the offset to the ground-truth box
        matching every sampled RoI are returned as well.

        Args:
            normalized_rois (tensor): RoIs from RegionProposalLayer,
                (N, n_rois, 4).  The last axis holds the top-left and
                bottom-right corners normalized to 0-1 by dividing by the
                input image height / width: (y1, x1, y2, x2).
            gt_bboxes: ground-truth boxes,
                (N, config.n_max_gt_objects_per_image, 4).  Coordinates are
                not normalized; they are normalized here with
                config.image_shape.
            gt_labels: ground-truth labels,
                (N, config.n_max_gt_objects_per_image).
                == 0: background
                >= 1: object
            pos_iou_thresh: minimum best IoU for a RoI to be positive.
            exclusive_iou_tresh: minimum best IoU for a RoI to be used as
                a negative.
            pos_ratio: fraction of n_samples_per_batch that may be
                positives.
        Returns:
            sample_rois (tensor): sampled RoIs,
                (N, n_samples_per_batch, 4), coordinates normalized to 0-1.
            sample_gt_offset (tensor): offsets between the sampled RoIs and
                their matched ground-truth boxes,
                (N, n_samples_per_batch, 4), computed on normalized
                coordinates and divided by config.bbox_refinement_std.
            sample_gt_labels (tensor): labels of the matched ground-truth
                boxes, (N, n_samples_per_batch); 0 for negatives and
                padding.
        """
        pos_roi_per_batch = round(self.n_samples_per_batch * pos_ratio)

        # Normalize gt_bboxes the same way as normalized_rois so that the
        # IoU between them can be evaluated.
        input_h = self.config.image_shape[0]
        input_w = self.config.image_shape[1]
        normalized_gt_bboxes = bbox.normalize_bbox(gt_bboxes, input_h, input_w)

        # Split the inputs into one tensor per image of the batch.
        normalized_rois = tf.split(normalized_rois, self.config.batch_size)
        normalized_gt_bboxes = tf.split(normalized_gt_bboxes,
                                        self.config.batch_size)
        gt_labels = tf.split(gt_labels, self.config.batch_size)

        sample_rois = []
        sample_gt_offsets = []
        sample_gt_labels = []

        for roi, gt_bbox, gt_label in zip(normalized_rois,
                                          normalized_gt_bboxes, gt_labels):
            # log.tfprint calls are debug hooks; presumably they pass the
            # tensor through unchanged -- confirm in xrcnn.util.log.
            roi = log.tfprint(roi, "roi: ")
            gt_bbox = log.tfprint(gt_bbox, "gt_bbox: ")
            gt_label = log.tfprint(gt_label, "gt_label: ")

            # Axis 0 (the batch axis, size 1 after the split) is not
            # needed, so drop it.
            roi = K.squeeze(roi, 0)
            gt_bbox = K.squeeze(gt_bbox, 0)
            gt_label = K.squeeze(gt_label, 0)

            roi = log.tfprint(roi, "roi_squeezed: ")
            gt_bbox = log.tfprint(gt_bbox, "gt_bbox_squeezed: ")
            gt_label = log.tfprint(gt_label, "gt_label_squeezed: ")

            # Remove zero-padding rows (rows whose values are all zero).
            # K.gather(zero, K.squeeze(tf.where(K.any(zero, axis=1)), -1) )
            idx_roi_row = K.flatten(tf.where(K.any(roi, axis=1)))
            idx_gt_bbox = K.flatten(tf.where(K.any(gt_bbox, axis=1)))
            roi = K.gather(roi, idx_roi_row)
            # gt_bbox and gt_label have the same number of rows in the same
            # order, so the same index can be used for both.
            gt_bbox = K.gather(gt_bbox, idx_gt_bbox)
            gt_label = K.gather(gt_label, idx_gt_bbox)

            gt_bbox = log.tfprint(gt_bbox, "gt_bbox_gathered: ")
            gt_label = log.tfprint(gt_label, "gt_label_gathered: ")

            # IoU between every RoI and every ground-truth box; reduced
            # over axis 1 below, so one row per RoI.
            ious = bbox.get_iou_K(roi, gt_bbox)
            ious = log.tfprint(ious, "ious: ")

            # For every RoI, index of the ground-truth box with the
            # largest IoU.
            idx_max_gt = K.argmax(ious, axis=1)
            idx_max_gt = log.tfprint(idx_max_gt, "idx_max_gt: ")

            max_iou = K.max(ious, axis=1)  # one entry per RoI row
            max_iou = log.tfprint(max_iou, "max_iou: ")
            idx_pos = K.flatten(tf.where(max_iou >= pos_iou_thresh))
            # Limit the number of positive samples to pos_roi_per_batch,
            # picking them at random.
            limit_pos = K.minimum(pos_roi_per_batch, K.shape(idx_pos)[0])
            idx_pos = K.switch(K.shape(idx_pos)[0] > 0,
                               tf.random_shuffle(idx_pos)[:limit_pos],
                               idx_pos)
            limit_pos = log.tfprint(limit_pos, "limit_pos: ")
            idx_pos = log.tfprint(idx_pos,  "idx_pos: ")

            # Negative candidates: best IoU below pos_iou_thresh but at
            # least exclusive_iou_tresh.
            idx_neg = K.flatten(tf.where((max_iou < pos_iou_thresh)
                                         & (max_iou >= exclusive_iou_tresh)))
            # Limit the number of negatives to what is left after the
            # positives: n_samples_per_batch - limit_pos.
            limit_neg = self.n_samples_per_batch - limit_pos
            limit_neg = K.minimum(limit_neg, K.shape(idx_neg)[0])
            idx_neg = K.switch(K.shape(idx_neg)[0] > 0,
                               tf.random_shuffle(idx_neg)[:limit_neg],
                               idx_neg)
            limit_neg = log.tfprint(limit_neg, "limit_neg: ")
            idx_neg = log.tfprint(idx_neg,  "idx_neg: ")

            # Pick the samples to return: positives first, then negatives.
            # Ground-truth offsets and labels are stored at the same
            # positions as their RoIs.
            idx_keep = K.concatenate((idx_pos, idx_neg))
            idx_keep = log.tfprint(idx_keep, "idx_keep: ")

            # Best-matching ground-truth index for each kept sample.
            idx_gt_keep = K.gather(idx_max_gt, idx_keep)
            # Best-matching ground-truth index for the positive samples
            # only (IoU at or above the threshold).
            idx_gt_keep_pos = K.gather(idx_max_gt, idx_pos)
            idx_gt_keep = log.tfprint(idx_gt_keep, "idx_gt_keep: ")

            sample_roi = K.gather(roi, idx_keep)
            sample_gt_offset = bbox.get_offset_K(
                sample_roi, K.gather(gt_bbox, idx_gt_keep))
            # Positives get their ground-truth label; negatives get 0.
            sample_gt_label = K.concatenate((K.cast(K.gather(
                gt_label, idx_gt_keep_pos),
                dtype='int32'),
                K.zeros([limit_neg],  # K.zeros rejects a rank-0 tensor, so wrap it in a list
                        dtype='int32')))

            # Zero-pad when there are fewer than n_samples_per_batch rows.
            remain = tf.maximum(self.n_samples_per_batch
                                - tf.shape(sample_roi)[0], 0)
            sample_roi = tf.pad(sample_roi, [(0, remain), (0, 0)],
                                name='subsample_sample_roi')
            sample_gt_offset = tf.pad(sample_gt_offset, [(0, remain), (0, 0)],
                                      name='subsample_sample_gt_offset')
            # Standardize the offsets.
            sample_gt_offset /= self.config.bbox_refinement_std
            sample_gt_label = tf.pad(sample_gt_label, [(0, remain)],
                                     name='subsample_sample_gt_label')

            sample_roi = log.tfprint(sample_roi, "sample_roi: ")
            sample_gt_offset = log.tfprint(
                sample_gt_offset, "sample_gt_offset: ")
            sample_gt_label = log.tfprint(sample_gt_label, "sample_gt_label: ")

            sample_rois.append(sample_roi)
            sample_gt_offsets.append(sample_gt_offset)
            sample_gt_labels.append(sample_gt_label)

        # Re-assemble the per-image results into batch tensors.
        return [K.stack(sample_rois), K.stack(sample_gt_offsets),
                K.stack(sample_gt_labels)]
499 | 


--------------------------------------------------------------------------------
/xrcnn/mrcnn.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | from keras.layers import Input, TimeDistributed, Lambda, Activation, Reshape
  3 | from keras.layers.convolutional import Conv2D, Conv2DTranspose
  4 | from keras.models import Model
  5 | from keras import backend as K
  6 | 
  7 | import tensorflow as tf
  8 | from xrcnn.batchnorm import BatchNorm
  9 | import xrcnn.loss as loss
 10 | from xrcnn.frcnn import Frcnn
 11 | from xrcnn.frcnn import SubsamplingRoiLayer
 12 | from xrcnn.util import bbox
 13 | from xrcnn.roi_align_layer import RoiAlignLayer
 14 | 
 15 | logger = getLogger(__name__)
 16 | 
 17 | 
 18 | class MaskRCNN(Frcnn):
 19 |     def _nn_squeeze_roi(self, batch_rois, batch_offsets, batch_labels):
 20 |         input_batch_size = batch_rois.shape[0]
 21 |         # batch_rois = log.tfprint(batch_rois, "batch_rois:before_split:debug")
 22 |         batch_rois = tf.split(batch_rois, input_batch_size)
 23 |         batch_offsets = tf.split(batch_offsets, input_batch_size)
 24 |         batch_labels = tf.split(batch_labels, input_batch_size)
 25 | 
 26 |         ret_bboxes, ret_normalized_rois, ret_labels, ret_scores = \
 27 |             [], [], [], []
 28 | 
 29 |         for normalized_rois, head_offsets, labels \
 30 |                 in zip(batch_rois, batch_offsets, batch_labels):
 31 |             # バッチを示す0次元目を削除
 32 |             normalized_rois, head_offsets, labels = \
 33 |                 K.squeeze(normalized_rois, axis=0), \
 34 |                 K.squeeze(head_offsets, axis=0), \
 35 |                 K.squeeze(labels, axis=0)
 36 | 
 37 |             # 各RoI毎に最も確率の高いラベルに対応するOffsets, masksを抽出
 38 |             # [n, n_labels] -> [n, 1]
 39 |             labels_id = K.cast(K.argmax(labels, axis=-1), tf.int32)
 40 |             # labels_id = log.tfprint(labels_id, "labels_id:inloop:debug")
 41 |             idx_labels = K.cast(K.stack([K.arange(K.shape(labels)[0]),
 42 |                                          labels_id],
 43 |                                         axis=1), tf.int32)
 44 |             # idx_labels = log.tfprint(idx_labels, "idx_labels:inloop:debug")
 45 |             # head_offsets = log.tfprint(
 46 |             #     head_offsets, "head_offsets:pre:inloop:debug")
 47 |             # [n, n_labels, 4] -> [n, 1, 4]
 48 |             head_offsets = tf.gather_nd(head_offsets, idx_labels)
 49 |             # head_offsets = log.tfprint(
 50 |             #     head_offsets, "head_offsets:gather:inloop:debug")
 51 |             head_offsets *= K.variable(self.config.bbox_refinement_std)
 52 |             # head_offsets = log.tfprint(
 53 |             #     head_offsets, "head_offsets:std:inloop:debug")
 54 |             # [n, n_labels] -> [n, 1]
 55 |             labels_prob = tf.gather_nd(labels, idx_labels)
 56 | 
 57 |             # オブジェクトである確率が閾値以上の領域のみを残す
 58 |             # 1次元目の列番号のみ抽出
 59 |             idx_labels, _ = tf.unique(tf.where(
 60 |                 labels_prob >= self.config.detect_label_prob)[:, 0])
 61 |             # idx_labels = log.tfprint(idx_labels, "idx_labels:inloop:debug")
 62 |             normalized_rois = tf.gather(normalized_rois, idx_labels)
 63 |             head_offsets = tf.gather(head_offsets, idx_labels)
 64 |             labels_prob = tf.gather(labels_prob, idx_labels)
 65 |             labels_id = tf.gather(labels_id, idx_labels)
 66 |             # labels_prob = log.tfprint(labels_prob,
 67 |             #   "labels_prob:inloop:debug")
 68 | 
 69 |             # 背景は除く
 70 |             idx_labels, _ = tf.unique(tf.where(labels_id > 0)[:, 0])
 71 |             # idx_labels = log.tfprint(idx_labels, "idx_labels:inloop:debug")
 72 |             normalized_rois = tf.gather(normalized_rois, idx_labels)
 73 |             head_offsets = tf.gather(head_offsets, idx_labels)
 74 |             labels_prob = tf.gather(labels_prob, idx_labels)
 75 |             labels_id = tf.gather(labels_id, idx_labels)
 76 |             # labels_prob = log.tfprint(labels_prob,
 77 |             #   "labels_prob:inloop:debug")
 78 |             # labels_id = log.tfprint(labels_id, "labels_id:inloop:debug")
 79 | 
 80 |             # ラベルの確率の高い順に並べる
 81 |             _, idx_labels_order = tf.nn.top_k(
 82 |                 labels_prob, k=K.shape(labels_prob)[0], sorted=True)
 83 |             normalized_rois = tf.gather(normalized_rois, idx_labels_order)
 84 |             head_offsets = tf.gather(head_offsets, idx_labels_order)
 85 |             labels_prob = tf.gather(labels_prob, idx_labels_order)
 86 |             labels_id = tf.gather(labels_id, idx_labels_order)
 87 | 
 88 |             # bbox復元
 89 |             h, w = self.config.image_shape[0], self.config.image_shape[1]
 90 |             bboxes = K.cast(bbox.restore_bbox(normalized_rois, head_offsets,
 91 |                                               h, w),
 92 |                             tf.float32)
 93 | 
 94 |             # バッチ毎にNMSする
 95 |             # TODO オブジェクトの種類が複数であれば、オブジェクトの種類ごとにNMSしたほうがよさそう。
 96 |             idx_keep = tf.image.non_max_suppression(
 97 |                 bboxes, labels_prob,
 98 |                 max_output_size=self.config.detect_max_instances,
 99 |                 iou_threshold=self.config.detect_nms_thresh)
100 |             bboxes = tf.gather(bboxes, idx_keep)
101 |             normalized_rois = tf.gather(normalized_rois, idx_keep)
102 |             labels_prob = tf.gather(labels_prob, idx_keep)
103 |             labels_id = tf.gather(labels_id, idx_keep)
104 | 
105 |             diff = K.cast(K.maximum(
106 |                 self.config.detect_max_instances - K.shape(bboxes)[0], 0),
107 |                 tf.int32)
108 |             bboxes = tf.pad(bboxes, [[0, diff], [0, 0]],
109 |                             mode='CONSTANT', constant_values=0,
110 |                             name="pad_bboxes")
111 |             normalized_rois = tf.pad(normalized_rois, [[0, diff], [0, 0]],
112 |                                      mode='CONSTANT', constant_values=0,
113 |                                      name="pad_normalized_rois")
114 |             labels_prob = tf.pad(labels_prob, [[0, diff]],
115 |                                  mode='CONSTANT', constant_values=0,
116 |                                  name="pad_labels_prob")
117 |             labels_id = tf.pad(labels_id, [[0, diff]],
118 |                                mode='CONSTANT', constant_values=0,
119 |                                name="pad_labels_id")
120 | 
121 |             ret_bboxes.append(K.cast(bboxes, tf.int32))
122 |             ret_normalized_rois.append(normalized_rois)
123 |             ret_labels.append(K.cast(labels_id, tf.int32))
124 |             ret_scores.append(labels_prob)
125 | 
126 |         return [K.stack(ret_bboxes), K.stack(ret_normalized_rois),
127 |                 K.stack(ret_labels), K.stack(ret_scores)]
128 | 
129 |     def _nn_mask(self, backbone_out, normalized_rois):
130 |         """Builds the computation graph of the mask head of Feature Pyramid Network.
131 | 
132 |             Args:
133 |                 backbone_out: backboneネットワークの出力
134 |                 rois: 正規化されたRoI
135 |                     [N, n_rois, (y1, x1, y2, x2)]
136 |             Returns:
137 |                 masks:
138 |                     [N, n_rois, num_classes, pool_size*2, pool_size*2]
139 |         """
140 |         # [N, R, pool_size, pool_size, channels]
141 |         out = RoiAlignLayer(self.config.mask_roi_align_pool_shape, self.config,
142 |                             name='head_mask_roi_align')([backbone_out,
143 |                                                          normalized_rois])
144 | 
145 |         # 畳み込み層は論文通り4層。フィーチャマップの解像度は維持
146 |         out = TimeDistributed(Conv2D(256, (3, 3), padding='same'),
147 |                               name='head_mask_conv1')(out)
148 |         out = TimeDistributed(BatchNorm(axis=3),
149 |                               name='head_mask_conv1_bn')(out)
150 |         out = Activation('relu')(out)
151 | 
152 |         out = TimeDistributed(Conv2D(256, (3, 3), padding='same'),
153 |                               name='head_mask_conv2')(out)
154 |         out = TimeDistributed(BatchNorm(axis=3),
155 |                               name='head_mask_conv2_bn')(out)
156 |         out = Activation('relu')(out)
157 | 
158 |         out = TimeDistributed(Conv2D(256, (3, 3), padding='same'),
159 |                               name='head_mask_conv3')(out)
160 |         out = TimeDistributed(BatchNorm(axis=3),
161 |                               name='head_mask_conv3_bn')(out)
162 |         out = Activation('relu')(out)
163 | 
164 |         out = TimeDistributed(Conv2D(256, (3, 3), padding='same'),
165 |                               name='head_mask_conv4')(out)
166 |         out = TimeDistributed(BatchNorm(axis=3),
167 |                               name='head_mask_conv4_bn')(out)
168 |         out = Activation('relu')(out)
169 | 
170 |         # 解像度を28*28（倍）に上げるための逆畳み込み
171 |         out = TimeDistributed(Conv2DTranspose(256, (2, 2), strides=2),
172 |                               name='head_mask_deconv')(out)
173 |         out = Activation('relu')(out)
174 |         out = TimeDistributed(Conv2D(self.config.n_dataset_labels,
175 |                                      (1, 1), strides=1, activation='sigmoid'),
176 |                               name='head_mask_binary')(out)
177 |         # [N, n_rois, n_label, h ,w]に変形
178 |         out = Reshape([-1, self.config.n_dataset_labels,
179 |                        self.config.mask_out_shape[0],
180 |                        self.config.mask_out_shape[1]],
181 |                       name='head_mask_reshape')(out)
182 |         return out
183 | 
184 |     def _build_model(self):
185 |         rpn_trainable = self.config.training_mode in ['rpn_only', 'all']
186 |         head_trainable = self.config.training_mode in ['head_only', 'all']
187 | 
188 |         # backbone network
189 |         backbone_in, backbone_out = self._model_backbone_headless()
190 | 
191 |         # rpn
192 |         normalized_rois, rpn_offsets, objects, objects_logit \
193 |             = self._nn_rpn(backbone_out, rpn_trainable)
194 | 
195 |         # 学習時のみ損失を計算
196 |         if self.config.training:
197 |             # 学習時
198 |             input_gt_rois = Input(
199 |                 shape=[None, 4], name="input_gt_rois", dtype='float32')
200 |             input_gt_objects = Input(
201 |                 shape=[None], name="input_gt_objects", dtype='int32')
202 |             inputs = [backbone_in, input_gt_rois, input_gt_objects]
203 |             losses = []
204 |             if rpn_trainable:
205 |                 # 損失計算
206 |                 # RPNの損失
207 |                 rpn_offsets_loss = Lambda(lambda x: loss.rpn_offsets_loss(*x),
208 |                                           name="rpn_offsets_loss")(
209 |                     [input_gt_rois, input_gt_objects, rpn_offsets])
210 |                 rpn_objects_loss = Lambda(lambda x: loss.rpn_objects_loss(*x),
211 |                                           name="rpn_objects_loss")(
212 |                     [input_gt_objects, objects])
213 | 
214 |                 losses += [rpn_offsets_loss, rpn_objects_loss]
215 | 
216 |             if head_trainable:
217 |                 input_gt_boxes = Input(
218 |                     shape=[None, 4], name="input_gt_boxes", dtype='float32')
219 |                 input_gt_label_ids = Input(
220 |                     shape=[None], name="input_gt_label_ids", dtype='int32')
221 |                 h, w = self.config.image_shape[0], self.config.image_shape[1]
222 |                 input_gt_masks = Input(
223 |                     shape=[None, h, w], name="input_gt_masks", dtype='float32')
224 |                 inputs += [input_gt_boxes, input_gt_label_ids, input_gt_masks]
225 | 
226 |                 # 正解データとRoIから評価対象のRoIを絞り込み、それに対応する正解データを得る。
227 |                 normalized_sample_rois, normalized_sample_gt_offsets, \
228 |                     sample_gt_labels, sample_gt_masks = \
229 |                     MaskSubsamplingRoiLayer(self.config,
230 |                                             name='mask_subsampling')(
231 |                         [normalized_rois, input_gt_boxes,
232 |                             input_gt_label_ids, input_gt_masks])
233 | 
234 |                 # head
235 |                 head_offsets, labels, labels_logit \
236 |                     = self._nn_head(backbone_out, normalized_sample_rois)
237 |                 # ヘッドの損失はModel#compileで損失関数を指定する方法では対応出来ないため、
238 |                 # Layerとして定義してModel#add_lossで加算する。
239 |                 head_offsets_loss = Lambda(lambda x:
240 |                                            loss.head_offsets_loss(*x),
241 |                                            name="head_offsets_loss")(
242 |                     [normalized_sample_gt_offsets, sample_gt_labels,
243 |                         head_offsets])
244 |                 head_labels_loss = Lambda(lambda x:
245 |                                           loss.head_labels_loss(*x),
246 |                                           name="head_labels_loss")(
247 |                     [sample_gt_labels, labels])
248 | 
249 |                 # mask
250 |                 masks = self._nn_mask(backbone_out, normalized_sample_rois)
251 |                 head_mask_loss = Lambda(lambda x:
252 |                                         loss.head_mask_loss(*x),
253 |                                         name="head_mask_loss")(
254 |                     [sample_gt_masks, sample_gt_labels, masks])
255 | 
256 |                 # 損失
257 |                 losses += [head_offsets_loss, head_labels_loss, head_mask_loss]
258 | 
259 |             # 出力＝損失
260 |             outputs = losses
261 | 
262 |         else:
263 |             # 予測時
264 |             # head
265 |             # head_offsetsは0〜1で正規化された値
266 |             head_offsets, labels, _ = self._nn_head(
267 |                 backbone_out, normalized_rois)
268 | 
269 |             # 候補を絞り込む
270 |             bboxes, rois, labels, scores = Lambda(lambda x:
271 |                                                   self._nn_squeeze_roi(*x),
272 |                                                   name="squeeze_roi")(
273 |                 [normalized_rois, head_offsets, labels])
274 | 
275 |             # mask
276 |             masks = self._nn_mask(backbone_out, rois)
277 | 
278 |             def _squeeze_masks(masks, idx_labels):
279 |                 dim1 = K.flatten(K.repeat(K.expand_dims(
280 |                     K.arange(K.shape(masks)[0])), K.shape(masks)[1]))
281 |                 dim2 = K.tile(K.arange(K.shape(masks)[1]),
282 |                               [K.shape(masks)[0]])
283 |                 idx = K.stack([dim1, dim2,
284 |                                K.cast(K.flatten(labels), tf.int32)], axis=1)
285 |                 # idx = log.tfprint(idx, "idx:inloop:debug")
286 |                 squeezed_masks = tf.gather_nd(masks, idx)
287 |                 squeezed_masks = K.reshape(squeezed_masks,
288 |                                            [K.shape(masks)[0],
289 |                                             K.shape(masks)[1],
290 |                                             K.shape(masks)[3],
291 |                                             K.shape(masks)[4]])
292 |                 return squeezed_masks
293 | 
294 |             # ラベルに対応するマスクを残す
295 |             masks = Lambda(lambda x: _squeeze_masks(x[0],
296 |                                                     K.cast(x[1], tf.int32)),
297 |                            name="squeeze_mask")([masks, labels])
298 | 
299 |             # 予測時は損失不要
300 |             # ダミーの損失関数
301 |             dummy_loss = Lambda(lambda x: K.constant(0), name="dummy_loss")(
302 |                 [backbone_in])
303 |             losses = [dummy_loss, dummy_loss, dummy_loss,
304 |                       dummy_loss, dummy_loss, dummy_loss]
305 |             inputs = [backbone_in]
306 | 
307 |             outputs = [bboxes, labels, scores, masks, rois,
308 |                        rpn_offsets, objects]
309 | 
310 |         model = Model(inputs=inputs, outputs=outputs, name='mask_r_cnn')
311 |         # Kerasは複数指定した損失の合計をモデル全体の損失として評価してくれる。
312 |         # 損失を追加
313 |         for output in losses:
314 |             model.add_loss(tf.reduce_mean(output, keep_dims=True))
315 |         return model, len(outputs)
316 | 
317 | 
class MaskSubsamplingRoiLayer(SubsamplingRoiLayer):
    """RoI subsampling layer that also emits ground-truth masks.

    Four tensors are produced per batch: the sampled RoIs, the standardized
    offsets to their matched ground-truth boxes, the matching labels, and the
    ground-truth masks cropped to each RoI and resized to
    ``config.mask_out_shape``.
    """

    def __init__(self, config, **kwargs):
        """Forward ``config`` and any keyword arguments to the parent."""
        super().__init__(config, **kwargs)

    def compute_output_shape(self, input_shape):
        """Return the static shapes of the four outputs.

        ``input_shape`` is not consulted; every dimension comes from the
        configuration and from ``self.n_samples_per_batch``.
        """
        batch = self.config.batch_size
        n_samples = self.n_samples_per_batch
        mask_h = self.config.mask_out_shape[0]
        mask_w = self.config.mask_out_shape[1]
        rois_shape = (batch, n_samples, 4)
        offsets_shape = (batch, n_samples, 4)
        labels_shape = (batch, n_samples)
        masks_shape = (batch, n_samples, mask_h, mask_w)
        return [rois_shape, offsets_shape, labels_shape, masks_shape]

    def _subsampling(self, normalized_rois, gt_bboxes, gt_labels, gt_masks,
                     pos_iou_thresh=0.5,
                     exclusive_iou_tresh=0.1,
                     pos_ratio=0.25):
        """Sample RoIs and build box, label and mask targets for them.

        Args:
            normalized_rois (tensor): RoIs from the RegionProposalLayer.
                (N, n_rois, 4)
                The last axis holds the top-left and bottom-right corners,
                normalized to 0-1 by dividing by the input image height and
                width.
                    (y1, x1, y2, x2)
            gt_bboxes: Ground-truth boxes.
                (N, config.n_max_gt_objects_per_image, 4)
                Coordinates are not normalized.
            gt_labels: Ground-truth labels.
                (N, config.n_max_gt_objects_per_image)
                ==0: background
                >=1: object
            gt_masks: Ground-truth masks.
                (N, config.n_max_gt_objects_per_image,
                    config.image_shape[0], config.image_shape[1])
                Coordinates are not normalized.
            pos_iou_thresh (float): RoIs whose best IoU is at least this
                value are positive candidates.
            exclusive_iou_tresh (float): RoIs whose best IoU lies in
                [exclusive_iou_tresh, pos_iou_thresh) are negative
                candidates; RoIs below it are never sampled.
            pos_ratio (float): Upper bound on the share of positives, as a
                fraction of n_samples_per_batch.
        Returns:
            list of four tensors:
            sample_rois: The sampled RoIs.
                (N, n_samples_per_batch, 4)
                Coordinates are normalized to 0-1.
            sample_gt_offsets: Offsets from each sampled RoI to its matched
                ground-truth box.
                (N, n_samples_per_batch, 4)
                Computed on normalized coordinates, then divided by
                self.config.bbox_refinement_std.
            sample_gt_labels: Label of the matched ground-truth box; 0 for
                negatives and for padding rows.
                (N, n_samples_per_batch)
            sample_gt_masks: Mask of the matched ground-truth box, resized to
                the size of the feature map produced by _nn_mask().
                (N, n_samples_per_batch, config.mask_out_shape[0],
                    config.mask_out_shape[1])
        """
        # TODO: eliminate the redundant mask-related code.
        n_samples = self.n_samples_per_batch
        batch_size = self.config.batch_size
        max_pos = round(n_samples * pos_ratio)

        # Bring the ground-truth boxes into the same normalized coordinate
        # system as the RoIs so that IoU can be evaluated between them.
        image_h = self.config.image_shape[0]
        image_w = self.config.image_shape[1]
        normalized_gt_bboxes = bbox.normalize_bbox(gt_bboxes, image_h, image_w)

        # One slice per batch item for every input.
        per_image_inputs = zip(
            tf.split(normalized_rois, batch_size),
            tf.split(normalized_gt_bboxes, batch_size),
            tf.split(gt_labels, batch_size),
            tf.split(gt_masks, batch_size))

        out_rois = []
        out_offsets = []
        out_labels = []
        out_masks = []

        for rois_i, gt_boxes_i, gt_labels_i, gt_masks_i in per_image_inputs:
            # Each slice still carries a leading batch axis of size 1.
            rois_i = K.squeeze(rois_i, 0)
            gt_boxes_i = K.squeeze(gt_boxes_i, 0)
            gt_labels_i = K.squeeze(gt_labels_i, 0)
            gt_masks_i = K.squeeze(gt_masks_i, 0)

            # Discard all-zero (padding) rows.
            valid_roi_rows = K.flatten(tf.where(K.any(rois_i, axis=1)))
            valid_gt_rows = K.flatten(tf.where(K.any(gt_boxes_i, axis=1)))
            rois_i = K.gather(rois_i, valid_roi_rows)
            # Boxes, labels and masks share row count and row order, so a
            # single index serves all three.
            gt_boxes_i = K.gather(gt_boxes_i, valid_gt_rows)
            gt_labels_i = K.gather(gt_labels_i, valid_gt_rows)
            gt_masks_i = K.gather(gt_masks_i, valid_gt_rows)

            # IoU of every RoI against every ground-truth box; axis 1 is
            # reduced below to find the best match per RoI.
            iou_matrix = bbox.get_iou_K(rois_i, gt_boxes_i)
            best_gt = K.argmax(iou_matrix, axis=1)
            best_iou = K.max(iou_matrix, axis=1)  # one value per RoI

            # Positives: at most max_pos, picked at random.
            pos_idx = K.flatten(tf.where(best_iou >= pos_iou_thresh))
            n_pos = K.minimum(max_pos, K.shape(pos_idx)[0])
            has_pos = K.shape(pos_idx)[0] > 0
            shuffled_pos = tf.random_shuffle(pos_idx)[:n_pos]
            pos_idx = K.switch(has_pos, shuffled_pos, pos_idx)

            # Negatives: fill what is left of n_samples_per_batch after the
            # positives, i.e. at most n_samples_per_batch - n_pos.
            below_pos = best_iou < pos_iou_thresh
            above_floor = best_iou >= exclusive_iou_tresh
            neg_idx = K.flatten(tf.where(below_pos & above_floor))
            n_neg = K.minimum(n_samples - n_pos, K.shape(neg_idx)[0])
            has_neg = K.shape(neg_idx)[0] > 0
            shuffled_neg = tf.random_shuffle(neg_idx)[:n_neg]
            neg_idx = K.switch(has_neg, shuffled_neg, neg_idx)

            # Rows to return, positives first. Targets are stored at the
            # same position as the RoI they belong to.
            keep_idx = K.cast(K.concatenate((pos_idx, neg_idx)), tf.int32)

            # Best-matching ground-truth row for every kept RoI ...
            kept_gt = K.cast(K.gather(best_gt, keep_idx), tf.int32)
            # ... and for the positive RoIs only.
            kept_gt_pos = K.cast(K.gather(best_gt, pos_idx), tf.int32)

            kept_rois = K.gather(rois_i, keep_idx)
            # Offset from each kept RoI to its matched ground-truth box.
            matched_boxes = K.gather(gt_boxes_i, kept_gt)
            offsets = bbox.get_offset_K(kept_rois, matched_boxes)
            # Positives take the label of their matched box, negatives get 0.
            pos_labels = K.cast(K.gather(gt_labels_i, kept_gt_pos),
                                dtype='int32')
            # K.zeros does not accept a 0-d tensor as shape, hence the list.
            neg_labels = K.zeros([n_neg], dtype='int32')
            labels = K.concatenate((pos_labels, neg_labels))
            masks = K.gather(gt_masks_i, kept_gt)

            # Zero-pad up to n_samples_per_batch rows.
            remain = tf.maximum(n_samples - tf.shape(kept_rois)[0], 0)
            row_pad = (0, remain)
            padded_rois = tf.pad(kept_rois, [row_pad, (0, 0)],
                                 name='subsample_sample_roi')
            offsets = tf.pad(offsets, [row_pad, (0, 0)],
                             name='subsample_sample_gt_offset')
            offsets = offsets / self.config.bbox_refinement_std
            labels = tf.pad(labels, [row_pad],
                            name='subsample_sample_gt_label')

            # Crop each binary mask to its (unpadded) RoI and resize it to
            # the size of the feature map produced by _nn_mask(), because
            # the predicted mask is expressed relative to the RoI.
            # tf.image.crop_and_resize needs a 4-D input, so a trailing
            # channel axis is added and removed again afterwards.
            masks = tf.expand_dims(masks, -1)
            box_idx = K.arange(0, K.shape(kept_rois)[0])
            masks = tf.image.crop_and_resize(
                masks, kept_rois, box_idx, self.config.mask_out_shape)
            masks = tf.squeeze(masks, axis=3)
            # Resizing yields fractional values; round back to 0 or 1.
            masks = tf.round(masks)
            # Pad like the other outputs.
            masks = tf.pad(masks, [row_pad, (0, 0), (0, 0)],
                           name='subsample_sample_gt_mask')

            out_rois.append(padded_rois)
            out_offsets.append(offsets)
            out_labels.append(labels)
            out_masks.append(masks)

        return [K.stack(out_rois), K.stack(out_offsets),
                K.stack(out_labels), K.stack(out_masks)]
491 | 


--------------------------------------------------------------------------------