├── .gitignore ├── README.md ├── __init__.py ├── __pycache__ ├── config.cpython-36.pyc └── nms.cpython-36.pyc ├── db_config.py ├── evaluate.py ├── figures ├── 1039_bboxshow.jpg ├── 1039_binarize_map.jpg ├── 1039_polyshow.jpg ├── 1039_thresh_binary.jpg ├── 1039_threshold_map.jpg ├── bacc.png ├── bloss.png ├── mloss.png ├── net.png ├── org.jpg ├── tbacc.png ├── tbloss.png ├── tloss.png └── ttloss.png ├── inference.py ├── lib ├── __init__.py ├── dataset │ ├── __init__.py │ ├── __pycache__ │ │ ├── data_util.cpython-36.pyc │ │ ├── data_utils.cpython-36.pyc │ │ └── dataload.cpython-36.pyc │ ├── dataloader.py │ ├── generator_enqueuer.py │ ├── img_aug.py │ └── label_maker.py ├── networks │ ├── losses.py │ ├── mobilenet │ │ ├── conv_blocks.py │ │ ├── mobilenet.py │ │ ├── mobilenet_v2.py │ │ └── mobilenet_v3.py │ ├── model.py │ └── resnet │ │ ├── resnet_utils.py │ │ ├── resnet_v1.py │ │ ├── resnet_v1_tiny.py │ │ └── resnet_v2.py ├── postprocess │ └── post_process.py └── utils.py ├── requirements.txt └── train.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # pycharm
2 | .idea
3 | *.xml
4 | .DS_Store
5 |
6 | # model
7 | #checkpoints
8 | #checkpoint
9 | *.pb
10 | *.npy
11 | *ckpt*
12 |
13 | # image
14 | #*.png
15 | *.jpg
16 | *.JPG
17 | *.jpeg
18 |
19 | # txt result
20 | *.txt
21 |
22 | # compiled files
23 | *.pyc
24 |
25 |
26 | # logs output
27 | events*
28 | *.pkl
29 |
30 | # mAP
31 | *.json
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DB: Real-time Scene Text Detection with Differentiable Binarization
2 |
3 |
4 | ## Introduction
5 | This is a TensorFlow implementation of ["Real-time Scene Text Detection with Differentiable Binarization"](https://arxiv.org/abs/1911.08947).
6 |
7 | Part of the code is inherited from [DB](https://github.com/MhLiao/DB).
8 |
9 | ![net](figures/net.png)
10 |
11 |
12 | ## ToDo List
13 |
14 | - [x] Release trained models
15 | - [x] Training code
16 | - [x] Inference code
17 | - [x] Multi-GPU training
18 | - [x] TensorBoard support
19 | - [x] Experiment with additional training losses
20 | - [ ] Eval code
21 | - [x] Data augmentation (crop and random image aug)
22 | - [x] More backbones
23 | - [x] Add dilated conv (ASPP layer)
24 | - [ ] Deformable Convolutional Networks
25 |
26 |
27 | ## Install
28 |
29 |     pip install -r requirements.txt
30 |
31 |
32 | ## Test
33 |
34 | ### 1. Download a trained model.
35 |
36 | | Model | Download link |
37 | |------------ |------- |
38 | | ResNet-50| [BaiduYun](https://pan.baidu.com/s/1Pfwl8M6aBwuUpJbP2jVFuw), [GoogleDrive](https://drive.google.com/drive/folders/1uJL6sf6EP6ekK_4XLNGLt1U9EGRJ0eDO?usp=sharing)|
39 | | ResNet-50-ASPP |[BaiduYun](https://pan.baidu.com/s/1OlMbhLSaQYb4U1VZZGabHgf), [GoogleDrive](https://drive.google.com/open?id=1s91HWS4dtXCFv5x5-YlCaj-KbobnEEUuf)|
40 | ### 2. Configure the network
41 | Edit `db_config.py`:
42 |
43 |     cfg.BACKBONE = 'resnet_v1_50'
44 |     # if the trained model name does not contain 'aspp', this should be False
45 |     cfg.ASPP_LAYER = False
46 |
47 | ### 3. Run inference on an image.
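
Run the CLI below, or call the `DB` wrapper from `inference.py` directly. A minimal sketch (checkpoint and image paths are placeholders):

    from inference import DB

    db = DB(ckpt_path='path/to/model.ckpt', gpuid='0')
    boxes, scores, (net_time, post_time) = db.detect_img('img.jpg', ispoly=True, show_res=True)
    print('%d regions, net %.0f ms, post %.0f ms' % (len(boxes), net_time * 1000, post_time * 1000))

`boxes` holds the detected polygons mapped back to the original image resolution; with `show_res=True` the score maps and a polygon overlay are also written to `./show/`.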
48 |
49 |     python inference.py --gpuid='0' --ckptpath='path' --imgpath='img.jpg'
50 |
51 |
52 | ## Sample results
53 |
54 | | original image | poly show | bbox show |
55 | |------------ |------- |------- |
56 | | ![poly_img](figures/org.jpg) | ![poly_img](figures/1039_polyshow.jpg) | ![bbox_img](figures/1039_bboxshow.jpg) |
57 | | binarize_map | threshold_map | thresh_binary |
58 | | ![bin_map](figures/1039_binarize_map.jpg) | ![thres_map](figures/1039_threshold_map.jpg) | ![bin_thres_map](figures/1039_thresh_binary.jpg) |
59 |
60 |
61 | ## Dataset
62 | This repo is trained on the CTW1500 dataset.
63 | Download it from [BaiduYun](https://pan.baidu.com/s/1yG_191LemrQa7K0h7Wispw) (key:yjiz) or
64 | [OneDrive](https://1drv.ms/u/s!Aplwt7jiPGKilH4XzZPoKrO7Aulk).
65 |
66 |
67 | ## Training the model
68 | #### 1. Set the CTW1500 training image and label paths.
69 |
70 | Edit `db_config.py`:
71 |
72 |     # Train data config
73 |     cfg.TRAIN.IMG_DIR = '/path/ctw1500/train/text_image'
74 |     cfg.TRAIN.LABEL_DIR = '/path/ctw1500/train/text_label_curve'
75 |
76 |     # Val or test data config
77 |     cfg.EVAL.IMG_DIR = '/path/ctw1500/test/text_image'
78 |     cfg.EVAL.LABEL_DIR = '/path/ctw1500/test/text_label_circum'
79 |
80 |
81 | #### 2. Configure the network and multi-GPU training.
82 |
83 | Edit `db_config.py`:
84 |
85 |     # only 'resnet_v1_50' and 'resnet_v1_18' are supported
86 |     cfg.BACKBONE = 'resnet_v1_50'
87 |     # set to True to train the ASPP variant
88 |     cfg.ASPP_LAYER = False
89 |     cfg.TRAIN.VIS_GPU = '5,6' # single GPU -> '0'
90 |
91 |
92 | #### 3. Set paths for training logs and checkpoints.
93 |
94 | Edit `db_config.py`:
95 |
96 |     cfg.TRAIN.TRAIN_LOGS = '/path/tf_logs'
97 |     cfg.TRAIN.CHECKPOINTS_OUTPUT_DIR = '/path/ckpt'
98 |
99 | #### 4. Start from a pretrained model or restore a checkpoint.
100 |
101 | To fine-tune from a pretrained model, edit `db_config.py`:
102 |
103 |     cfg.TRAIN.RESTORE = False
104 |     cfg.TRAIN.PRETRAINED_MODEL_PATH = 'pretrain model path'
105 |
106 | To restore training from a checkpoint, edit `db_config.py`:
107 |
108 |     cfg.TRAIN.RESTORE = True
109 |     cfg.TRAIN.RESTORE_CKPT_PATH = 'checkpoint path'
110 |
111 | #### 5. Start training.
112 |
113 |     python train.py
114 |
115 | #### 6. TensorBoard
116 |
117 |     cd 'tensorboard path'
118 |     tensorboard --logdir=./
119 |
120 | Red curves are training logs; blue curves are validation logs.
121 |
122 | Loss curves
123 |
124 | | binarize loss | threshold loss |threshold binary loss |
125 | |------------ |------- |------- |
126 | | ![binarize_loss](figures/bloss.png) | ![threshold loss](figures/tloss.png) |![thresh_binary_loss](figures/tbloss.png) |
127 | | model_loss | total_loss | |
128 | | ![model_loss](figures/mloss.png) | ![total_loss](figures/ttloss.png) | |
129 |
130 |
131 | Accuracy curves
132 |
133 | | binarize acc | threshold binary acc |
134 | |------------ |------- |
135 | | ![binarize acc](figures/bacc.png) | ![threshold binary acc](figures/tbacc.png) |
136 |
137 |
138 |
139 | ## Experiment
140 |
141 | Tested on an RTX 2080 Ti.
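(The FPS column in the table below is consistent with 1000 / (inference time + post-process time); e.g. 1000 / (13.3 + 2.9) ≈ 61.7 for the first row.)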
142 | 143 | | BackBone | ASPP | Input Size | Infernce Time(ms) | PostProcess Time(ms) | FPS | 144 | |------------ |------ |-------- |------- |------- |------- | 145 | | ResNet-50 | × | 320 | 13.3 | 2.9 | 61.7 | 146 | | ResNet-50 | × | 512 | 19.2 | 4.5 | 42.2 | 147 | | ResNet-50 | × | 640 | 28.9 | 5.2 | 29.3 | 148 | | ResNet-50 | × | 736 | 33.2 | 5.7 | 25.7 | 149 | | ResNet-18 | × | 320 | 12.2 | 2.9 | 66.2 | 150 | | ResNet-18 | × | 512 | 16.9 | 4.5 | 46.7 | 151 | | ResNet-18 | × | 736 | 32.7 | 5.7 | 26 | 152 | | ResNet-50 | √ | 640 | 32.6 | --- | --- | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/__init__.py -------------------------------------------------------------------------------- /__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/nms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/__pycache__/nms.cpython-36.pyc -------------------------------------------------------------------------------- /db_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from easydict import EasyDict as edict 3 | 4 | cfg = edict() 5 | 6 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~inference~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 7 | cfg.MEANS = [123.68, 116.78, 103.94] 8 | cfg.INPUT_MAX_SIZE = 640 9 | cfg.K = 10 10 | cfg.EPSILON_RATIO = 0.001 11 | cfg.SHRINK_RATIO = 0.4 12 | cfg.THRESH_MIN = 0.3 13 | cfg.THRESH_MAX = 0.7 14 | cfg.FILTER_MIN_AREA = 1e-4 15 | 16 | # ['resnet_v1_50', 'resnet_v1_18', 'resnet_v2_50', 'resnet_v2_18', 'mobilenet_v2', 'mobilenet_v3'] 17 | cfg.BACKBONE = 'resnet_v1_50' 18 | cfg.ASPP_LAYER = False 19 | # ~~~~~~~~~~~~~~~~~~z~~~~~~~~~~~~~train config~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | cfg.TRAIN = edict() 22 | cfg.TRAIN.VERSION = 'aspp' 23 | # 多gpu训练 24 | cfg.TRAIN.VIS_GPU = '3,4' 25 | cfg.TRAIN.BATCH_SIZE_PER_GPU = 2 26 | cfg.TRAIN.LOSS_ALPHA = 1.0 27 | cfg.TRAIN.LOSS_BETA = 10.0 28 | 29 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~dataload & aug~~~~~~~~~~~~~~~~~~~~~~~~~~ 30 | cfg.TRAIN.IMG_DIR = '/hostpersistent/zzh/dataset/open_data/ctw1500/train/text_image' 31 | cfg.TRAIN.LABEL_DIR = '/hostpersistent/zzh/dataset/open_data/ctw1500/train/text_label_curve' 32 | cfg.TRAIN.IMG_SIZE = 640 33 | cfg.TRAIN.MIN_TEXT_SIZE = 1 34 | cfg.TRAIN.MIN_AREA = 1 35 | cfg.TRAIN.IMG_SCALE = [0.5, 1, 1, 1, 1.5, 2.0] 36 | cfg.TRAIN.CROP_PROB = 0.9 37 | cfg.TRAIN.MIN_CROP_SIDE_RATIO = 0.001 38 | cfg.TRAIN.NUM_READERS = 20 39 | cfg.TRAIN.DATA_AUG_PROB = 0.0 40 | cfg.TRAIN.AUG_TOOL = ['GaussianBlur', 41 | 'AverageBlur', 42 | 'MedianBlur', 43 | 'BilateralBlur', 44 | 'MotionBlur', 45 | #'ElasticTransformation', 46 | #'PerspectiveTransform', 47 | ] 48 | 49 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~save ckpt and log~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 50 | cfg.TRAIN.MAX_STEPS = 10000000 51 | cfg.TRAIN.SAVE_CHECKPOINT_STEPS = 2000 52 | cfg.TRAIN.SAVE_SUMMARY_STEPS = 100 53 | cfg.TRAIN.SAVE_MAX = 20 54 | cfg.TRAIN.TRAIN_LOGS = 
os.path.join('/hostpersistent/zzh/lab/DB-tf/', 'tf_logs') 55 | cfg.TRAIN.CHECKPOINTS_OUTPUT_DIR = os.path.join('/hostpersistent/zzh/lab/DB-tf/', 'ckpt') 56 | 57 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~restore and pretrain~~~~~~~~~~~~~~~~~~~~~ 58 | cfg.TRAIN.RESTORE = None 59 | cfg.TRAIN.RESTORE_CKPT_PATH = os.path.join('/hostpersistent/zzh/lab/DB-tf/', 'ckpt') 60 | cfg.TRAIN.PRETRAINED_MODEL_PATH = '/hostpersistent/zzh/lab/DB-tf/ckpt/DB_resnet_v1_50_1223_model.ckpt-121201' 61 | 62 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~super em~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 63 | cfg.TRAIN.LEARNING_RATE = 0.0001 64 | cfg.TRAIN.OPT = 'adam'#'momentum'# 65 | cfg.TRAIN.MOVING_AVERAGE_DECAY = 0.997 66 | 67 | 68 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eval ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 69 | cfg.EVAL = edict() 70 | cfg.EVAL.IMG_DIR = '/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_image' 71 | cfg.EVAL.LABEL_DIR = '/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_label_circum' 72 | cfg.EVAL.NUM_READERS = 1 73 | cfg.EVAL.TEST_STEP = 5000 74 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import os 4 | import cv2 5 | import tqdm 6 | import numpy as np 7 | 8 | from inference import DB 9 | from db_config import cfg 10 | from lib.utils import quad_iou, compute_f1_score, load_ctw1500_labels, make_dir 11 | 12 | 13 | def load_pred_labels(path): 14 | pass 15 | 16 | def evaluate(gt_care_list, gt_dontcare_list, pred_list, overlap=0.5): 17 | """ 18 | 19 | :param gt_care_list: [-1, M, 2] 20 | :param gt_dontcare_list: [-1, M, 2] 21 | :param pred_list: [-1, M, 2] 22 | :param overlap: 23 | :return: 24 | """ 25 | 26 | pred_care_list =[] 27 | pred_dontcare_list = [] 28 | 29 | if len(gt_dontcare_list) != 0: 30 | for pred_box in pred_list: 31 | flag = False 32 | for gt_box in gt_dontcare_list: 33 | if quad_iou(gt_box, pred_box) > overlap: 34 | flag = True 35 | break 36 | 37 | if not flag: 38 | pred_care_list.append(pred_box) 39 | else: 40 | pred_dontcare_list.append(pred_box) 41 | else: 42 | pred_care_list = pred_list 43 | 44 | gt_care_flag_list = [False] * len(gt_care_list) 45 | pred_care_flag_list = [False] * len(pred_care_list) 46 | pairs_list = [] 47 | gt_not_pair_list = [] 48 | pred_not_pair_list = [] 49 | 50 | for gt_i, gt_box in enumerate(gt_care_list): 51 | for pred_i, pred_box in enumerate(pred_care_list): 52 | if pred_care_flag_list[pred_i]: 53 | continue 54 | else: 55 | iou = quad_iou(gt_box, pred_box) 56 | if iou > overlap: 57 | pair_dict = {} 58 | pair_dict['gt'] = gt_box 59 | pair_dict['pred'] = pred_box 60 | pair_dict['iou'] = iou 61 | pairs_list.append(pair_dict) 62 | pred_care_flag_list[pred_i] = True 63 | gt_care_flag_list[gt_i] = True 64 | 65 | TP = len(pairs_list) 66 | 67 | if len(gt_care_list) == 0: 68 | recall = 1.0 69 | precision = 1.0 if len(pred_care_list) == 0 else 0.0 70 | elif len(pred_care_list) == 0: 71 | recall = 0.0 72 | precision = 0.0 73 | else: 74 | recall = 1.0 * TP / len(gt_care_list) 75 | precision = 1.0 * TP / len(pred_care_list) 76 | 77 | f1_score = compute_f1_score(precision, recall) 78 | 79 | return precision, recall, f1_score, TP, len(gt_care_list), len(pred_care_list), pairs_list 80 | 81 | 82 | def evaluate_all(gt_file_dir, gt_img_dir, ckpt_path, gpuid='0'): 83 | db = DB(ckpt_path, gpuid) 84 | 85 | img_list = os.listdir(gt_img_dir) 86 | 87 | show = './eva' 88 | make_dir(show) 89 | 90 | total_TP = 0 91 | total_gt_care_num = 0 92 | 
total_pred_care_num = 0 93 | for img_name in tqdm.tqdm(img_list): 94 | img = cv2.imread(os.path.join(gt_img_dir, img_name)) 95 | 96 | pred_box_list, pred_score_list, _ = db.detect_img(os.path.join(gt_img_dir, img_name), 97 | ispoly=True, 98 | show_res=False) 99 | 100 | gt_file_name = os.path.splitext(img_name)[0] + '.txt' 101 | 102 | gt_boxes, tags = load_ctw1500_labels(os.path.join(gt_file_dir, gt_file_name)) 103 | 104 | gt_care_list = [] 105 | gt_dontcare_list = [] 106 | 107 | for i, box in enumerate(gt_boxes): 108 | box = box.reshape((-1, 2)).tolist() 109 | if tags[i] == False: 110 | gt_care_list.append(box) 111 | else: 112 | gt_dontcare_list.append(box) 113 | 114 | precision, recall, f1_score, TP, gt_care_num, pred_care_num, pairs_list = evaluate(gt_care_list, 115 | gt_dontcare_list, 116 | pred_box_list, 117 | overlap=0.5) 118 | 119 | for pair in pairs_list: 120 | cv2.polylines(img, [np.array(pair['gt'], np.int).reshape([-1, 1, 2])], True, (0, 255, 0)) 121 | cv2.polylines(img, [np.array(pair['pred'], np.int).reshape([-1, 1, 2])], True, (255, 0, 0)) 122 | 123 | cv2.imwrite(os.path.join(show, img_name), img) 124 | 125 | total_TP += TP 126 | total_gt_care_num += gt_care_num 127 | total_pred_care_num += pred_care_num 128 | 129 | total_precision = float(total_TP) / total_pred_care_num 130 | total_recall = float(total_TP) / total_gt_care_num 131 | total_f1_score = compute_f1_score(total_precision, total_recall) 132 | 133 | return total_precision, total_recall, total_f1_score 134 | 135 | if __name__ == '__main__': 136 | 137 | ckpt_path = '/hostpersistent/zzh/lab/DB-tf/ckpt/DB_resnet_v1_50_aspp_model.ckpt-303001' 138 | gt_img_dir = cfg.EVAL.IMG_DIR 139 | gt_file_dir = cfg.EVAL.LABEL_DIR 140 | 141 | precision, recall, f1_score = evaluate_all(gt_file_dir, gt_img_dir, ckpt_path) 142 | print(precision, recall, f1_score) 143 | 144 | 145 | 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /figures/1039_bboxshow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_bboxshow.jpg -------------------------------------------------------------------------------- /figures/1039_binarize_map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_binarize_map.jpg -------------------------------------------------------------------------------- /figures/1039_polyshow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_polyshow.jpg -------------------------------------------------------------------------------- /figures/1039_thresh_binary.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_thresh_binary.jpg -------------------------------------------------------------------------------- /figures/1039_threshold_map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_threshold_map.jpg -------------------------------------------------------------------------------- /figures/bacc.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/bacc.png -------------------------------------------------------------------------------- /figures/bloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/bloss.png -------------------------------------------------------------------------------- /figures/mloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/mloss.png -------------------------------------------------------------------------------- /figures/net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/net.png -------------------------------------------------------------------------------- /figures/org.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/org.jpg -------------------------------------------------------------------------------- /figures/tbacc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/tbacc.png -------------------------------------------------------------------------------- /figures/tbloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/tbloss.png -------------------------------------------------------------------------------- /figures/tloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/tloss.png -------------------------------------------------------------------------------- /figures/ttloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/ttloss.png -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import time 4 | import tqdm 5 | import argparse 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from db_config import cfg 10 | from shapely.geometry import Polygon 11 | from lib.postprocess.post_process import SegDetectorRepresenter 12 | import lib.networks.model as model 13 | 14 | 15 | def get_args(): 16 | parser = argparse.ArgumentParser(description='DB-tf') 17 | parser.add_argument('--ckptpath', default='/hostpersistent/zzh/lab/DB-tf/ckpt/DB_resnet_v1_50_1223_model.ckpt-121201', 18 | type=str, 19 | help='load model') 20 | parser.add_argument('--imgpath', default='/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_image/1012.jpg', 21 | type=str) 22 | parser.add_argument('--gpuid', default='0', 23 | type=str) 24 | parser.add_argument('--ispoly', default=True, 25 | type=bool) 26 | 
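    # note: argparse's type=bool does not parse 'False' from the command line as
    # expected; any non-empty string (including 'False') evaluates to True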
parser.add_argument('--show_res', default=True, 27 | type=bool) 28 | 29 | args = parser.parse_args() 30 | 31 | return args 32 | 33 | def make_dir(dir): 34 | if not os.path.exists(dir): 35 | os.makedirs(dir) 36 | 37 | class DB(): 38 | 39 | def __init__(self, ckpt_path, gpuid='0'): 40 | os.environ['CUDA_VISIBLE_DEVICES'] = gpuid 41 | tf.reset_default_graph() 42 | self._input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images') 43 | global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) 44 | 45 | self._binarize_map, self._threshold_map, self._thresh_binary = model.model(self._input_images, is_training=False) 46 | 47 | variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step) 48 | saver = tf.train.Saver(variable_averages.variables_to_restore()) 49 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) 50 | gpu_config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options, allow_soft_placement=True) 51 | self.sess = tf.Session(config=gpu_config) 52 | saver.restore(self.sess, ckpt_path) 53 | self.decoder = SegDetectorRepresenter() 54 | print('restore model from:', ckpt_path) 55 | 56 | def __del__(self): 57 | self.sess.close() 58 | 59 | def detect_img(self, img_path, ispoly=True, show_res=True): 60 | img = cv2.imread(img_path) 61 | h, w, _ = img.shape 62 | resized_img, ratio, size = self._resize_img(img) 63 | 64 | s = time.time() 65 | binarize_map, threshold_map, thresh_binary = self.sess.run([self._binarize_map, self._threshold_map, self._thresh_binary], 66 | feed_dict={self._input_images: [resized_img]}) 67 | net_time = time.time()-s 68 | 69 | s = time.time() 70 | boxes, scores = self.decoder([resized_img], binarize_map, ispoly) 71 | boxes = boxes[0] 72 | area = h * w 73 | res_boxes = [] 74 | res_scores = [] 75 | for i, box in enumerate(boxes): 76 | box[:, 0] *= ratio[1] 77 | box[:, 1] *= ratio[0] 78 | if Polygon(box).convex_hull.area > cfg.FILTER_MIN_AREA*area: 79 | res_boxes.append(box) 80 | res_scores.append(scores[0][i]) 81 | post_time = time.time()-s 82 | 83 | if show_res: 84 | img_name = os.path.splitext(os.path.split(img_path)[-1])[0] 85 | make_dir('./show') 86 | cv2.imwrite('show/' + img_name + '_binarize_map.jpg', binarize_map[0][0:size[0], 0:size[1], :]*255) 87 | cv2.imwrite('show/' + img_name + '_threshold_map.jpg', threshold_map[0][0:size[0], 0:size[1], :]*255) 88 | cv2.imwrite('show/' + img_name + '_thresh_binary.jpg', thresh_binary[0][0:size[0], 0:size[1], :]*255) 89 | for box in res_boxes: 90 | cv2.polylines(img, [box.astype(np.int).reshape([-1, 1, 2])], True, (0, 255, 0)) 91 | # print(Polygon(box).convex_hull.area, Polygon(box).convex_hull.area/area) 92 | cv2.imwrite('show/' + img_name + '_show.jpg', img) 93 | 94 | return res_boxes, res_scores, (net_time, post_time) 95 | 96 | 97 | def detect_batch(self, batch): 98 | pass 99 | 100 | def _resize_img(self, img, max_size=640): 101 | h, w, _ = img.shape 102 | 103 | ratio = float(max(h, w)) / max_size 104 | 105 | new_h = int((h / ratio // 32) * 32) 106 | new_w = int((w / ratio // 32) * 32) 107 | 108 | resized_img = cv2.resize(img, dsize=(new_w, new_h)) 109 | 110 | input_img = np.zeros([max_size, max_size, 3]) 111 | input_img[0:new_h, 0:new_w, :] = resized_img 112 | 113 | ratio_w = w / new_w 114 | ratio_h = h / new_h 115 | 116 | return input_img, (ratio_h, ratio_w), (new_h, new_w) 117 | 118 | 119 | if __name__ == "__main__": 120 | args = get_args() 121 | 122 | db = DB(args.ckptpath, args.gpuid) 123 | 124 | 
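    # detect_img returns (boxes, scores, (net_time, post_time)): the polygon boxes
    # are already rescaled to the original image size and the times are in seconds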
db.detect_img(args.imgpath, args.ispoly, args.show_res) 125 | 126 | img_list = os.listdir('/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_image/') 127 | 128 | net_all = 0 129 | post_all = 0 130 | pipe_all = 0 131 | 132 | for i in tqdm.tqdm(img_list): 133 | _, _, (net_time, post_time) = db.detect_img(os.path.join('/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_image/',i), args.ispoly, show_res=True) 134 | net_all += net_time 135 | post_all += post_time 136 | pipe_all += (net_time + post_time) 137 | 138 | print('net:', net_all/len(img_list)) 139 | print('post:', post_all/len(img_list)) 140 | print('pipe:', pipe_all/len(img_list)) -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/__init__.py -------------------------------------------------------------------------------- /lib/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/dataset/__init__.py -------------------------------------------------------------------------------- /lib/dataset/__pycache__/data_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/dataset/__pycache__/data_util.cpython-36.pyc -------------------------------------------------------------------------------- /lib/dataset/__pycache__/data_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/dataset/__pycache__/data_utils.cpython-36.pyc -------------------------------------------------------------------------------- /lib/dataset/__pycache__/dataload.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/dataset/__pycache__/dataload.cpython-36.pyc -------------------------------------------------------------------------------- /lib/dataset/dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import tqdm 4 | import time 5 | import random 6 | import numpy as np 7 | 8 | from db_config import cfg 9 | from lib.dataset.label_maker import make_border_map, make_score_map 10 | from lib.dataset.generator_enqueuer import GeneratorEnqueuer 11 | from lib.dataset.img_aug import crop_area, det_aug 12 | from lib.utils import resize_img, load_ctw1500_labels, load_icdar_labels 13 | 14 | def load_labels(gt_path, data_name='ctw1500'): 15 | if data_name == 'ctw1500': 16 | return load_ctw1500_labels(gt_path) 17 | elif data_name == 'icdar': 18 | return load_icdar_labels 19 | 20 | def make_train_labels(polys, tags, h, w): 21 | """ 22 | 23 | :param polys: numpy [N, 2] 24 | :param tags: 25 | :param h: 26 | :param w: 27 | :return: 28 | """ 29 | 30 | threshold_map, thresh_mask = make_border_map(polys, tags, h, w) 31 | score_map, score_mask = make_score_map(polys, tags, h, w) 32 | 33 | return score_map, score_mask, threshold_map, thresh_mask 34 | 35 | def generator(batchsize, img_dir, label_dir, 
random_scale=np.array(cfg.TRAIN.IMG_SCALE), is_eval=False): 36 | 37 | img_list = os.listdir(img_dir) 38 | 39 | epoch = 0 40 | while True: 41 | train_imgs = [] 42 | train_score_maps = [] 43 | train_socre_masks = [] 44 | train_thresh_maps = [] 45 | train_thresh_masks = [] 46 | 47 | np.random.shuffle(img_list) 48 | 49 | for img_name in img_list: 50 | try: 51 | img_path = os.path.join(img_dir, img_name) 52 | label_path = os.path.join(label_dir, os.path.splitext(img_name)[0] + '.txt') 53 | 54 | img_input = np.zeros([cfg.TRAIN.IMG_SIZE, cfg.TRAIN.IMG_SIZE, 3], dtype=np.float32) 55 | 56 | img = cv2.imread(img_path)[:,:, ::-1] 57 | img, (ratio_h, ratio_w) = resize_img(img, cfg.TRAIN.IMG_SIZE) 58 | 59 | if random.random() < cfg.TRAIN.DATA_AUG_PROB and not is_eval: 60 | img = det_aug(img) 61 | 62 | polys, tags = load_labels(label_path) 63 | polys[:, :, 0] *= ratio_w 64 | polys[:, :, 1] *= ratio_h 65 | 66 | if (random.random() < cfg.TRAIN.CROP_PROB) and (not is_eval): 67 | img, polys, tags = crop_area(img, polys, tags) 68 | img, (ratio_h, ratio_w) = resize_img(img, cfg.TRAIN.IMG_SIZE) 69 | polys[:, :, 0] *= ratio_w 70 | polys[:, :, 1] *= ratio_h 71 | 72 | h, w, _ = img.shape 73 | img_input[:h, :w, :] = img 74 | h, w, _ = img_input.shape 75 | 76 | score_map, score_mask, threshold_map, thresh_mask = make_train_labels(polys, tags, h, w) 77 | 78 | train_imgs.append(img_input) 79 | train_score_maps.append(score_map[:, :, np.newaxis]) 80 | train_socre_masks.append(score_mask[:, :, np.newaxis]) 81 | train_thresh_maps.append(threshold_map[:, :, np.newaxis]) 82 | train_thresh_masks.append(thresh_mask[:, :, np.newaxis]) 83 | 84 | if len(train_imgs) == batchsize: 85 | if is_eval: 86 | yield train_imgs, train_score_maps, train_socre_masks, train_thresh_maps, train_thresh_masks, epoch 87 | else: 88 | yield train_imgs, train_score_maps, train_socre_masks, train_thresh_maps, train_thresh_masks 89 | train_imgs = [] 90 | train_score_maps = [] 91 | train_socre_masks = [] 92 | train_thresh_maps = [] 93 | train_thresh_masks = [] 94 | 95 | except Exception as e: 96 | import traceback 97 | traceback.print_exc() 98 | print(img_path) 99 | # print(polys[0]) 100 | # img_input = img_input.astype(np.int) 101 | # for poly in polys: 102 | # poly = np.array(poly, dtype=np.int) 103 | # cv2.polylines(img_input, [poly.reshape((-1, 1, 2))], True, (0, 255, 0)) 104 | # cv2.imwrite(img_name, img_input) 105 | continue 106 | epoch += 1 107 | 108 | 109 | def get_batch(num_workers, **kwargs): 110 | try: 111 | enqueuer = GeneratorEnqueuer(generator(**kwargs), use_multiprocessing=True) 112 | print('Generator use 10 batches for buffering, this may take a while, you can tune this yourself.') 113 | enqueuer.start(max_queue_size=10, workers=num_workers) 114 | generator_output = None 115 | while True: 116 | while enqueuer.is_running(): 117 | if not enqueuer.queue.empty(): 118 | generator_output = enqueuer.queue.get() 119 | break 120 | else: 121 | time.sleep(0.01) 122 | yield generator_output 123 | generator_output = None 124 | finally: 125 | if enqueuer is not None: 126 | enqueuer.stop() 127 | 128 | 129 | if __name__ =='__main__': 130 | img_dir = '/Users/zhangzihao/AI/research/datasets/ctw1500/train/text_image' 131 | label_dir = '/Users/zhangzihao/AI/research/datasets/ctw1500/train/text_label_curve' 132 | 133 | 134 | img_list = os.listdir(img_dir) 135 | label_list = os.listdir(label_dir) 136 | # np.random.shuffle(img_list) 137 | print(img_list[0]) 138 | img = cv2.imread(os.path.join(img_dir, img_list[0])) 139 | h, w, _ = img.shape 140 | polys, tags 
= load_labels(os.path.join(label_dir, os.path.splitext(img_list[0])[0] + '.txt')) 141 | threshold_map, thresh_mask = make_border_map(polys, tags, h, w) 142 | score_map, score_mask = make_score_map(polys, tags, h, w) 143 | 144 | # 145 | # for poly in polys: 146 | # poly = np.array(poly, dtype=np.int) 147 | # cv2.polylines(img, [poly.reshape((-1, 1, 2))], True, (0, 255, 0)) 148 | # 149 | # 150 | # threshold_map, thresh_mask = make_border_map(polys, tags, h, w) 151 | # 152 | # s = time.time() 153 | # score_map, score_mask = make_score_map(polys, tags, h, w) 154 | # print(time.time()-s) 155 | # 156 | # cv2.imwrite('s.jpg', score_map*255) 157 | # cv2.imwrite('t.jpg', threshold_map*255) 158 | # cv2.imwrite('sm.jpg', score_mask*255) 159 | # 160 | # cv2.imwrite('o.jpg', img) 161 | -------------------------------------------------------------------------------- /lib/dataset/generator_enqueuer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | this file is modified from keras implemention of data process multi-threading, 3 | see https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py 4 | ''' 5 | import time 6 | import numpy as np 7 | import threading 8 | import multiprocessing 9 | try: 10 | import queue 11 | except ImportError: 12 | import Queue as queue 13 | 14 | 15 | class GeneratorEnqueuer(): 16 | """Builds a queue out of a data generator. 17 | 18 | Used in `fit_generator`, `evaluate_generator`, `predict_generator`. 19 | 20 | # Arguments 21 | generator: a generator function which endlessly yields data 22 | use_multiprocessing: use multiprocessing if True, otherwise threading 23 | wait_time: time to sleep in-between calls to `put()` 24 | random_seed: Initial seed for workers, 25 | will be incremented by one for each workers. 26 | """ 27 | 28 | def __init__(self, generator, 29 | use_multiprocessing=False, 30 | wait_time=0.05, 31 | random_seed=None): 32 | self.wait_time = wait_time 33 | self._generator = generator 34 | self._use_multiprocessing = use_multiprocessing 35 | self._threads = [] 36 | self._stop_event = None 37 | self.queue = None 38 | self.random_seed = random_seed 39 | 40 | def start(self, workers=1, max_queue_size=10): 41 | """Kicks off threads which add data from the generator into the queue. 
42 | 43 | # Arguments 44 | workers: number of worker threads 45 | max_queue_size: queue size 46 | (when full, threads could block on `put()`) 47 | """ 48 | 49 | def data_generator_task(): 50 | while not self._stop_event.is_set(): 51 | try: 52 | if self._use_multiprocessing or self.queue.qsize() < max_queue_size: 53 | generator_output = next(self._generator) 54 | self.queue.put(generator_output) 55 | else: 56 | time.sleep(self.wait_time) 57 | except Exception: 58 | self._stop_event.set() 59 | raise 60 | 61 | try: 62 | if self._use_multiprocessing: 63 | self.queue = multiprocessing.Queue(maxsize=max_queue_size) 64 | self._stop_event = multiprocessing.Event() 65 | else: 66 | self.queue = queue.Queue() 67 | self._stop_event = threading.Event() 68 | 69 | for _ in range(workers): 70 | if self._use_multiprocessing: 71 | # Reset random seed else all children processes 72 | # share the same seed 73 | np.random.seed(self.random_seed) 74 | thread = multiprocessing.Process(target=data_generator_task) 75 | thread.daemon = True 76 | if self.random_seed is not None: 77 | self.random_seed += 1 78 | else: 79 | thread = threading.Thread(target=data_generator_task) 80 | self._threads.append(thread) 81 | thread.start() 82 | except: 83 | self.stop() 84 | raise 85 | 86 | def is_running(self): 87 | return self._stop_event is not None and not self._stop_event.is_set() 88 | 89 | def stop(self, timeout=None): 90 | """Stops running threads and wait for them to exit, if necessary. 91 | 92 | Should be called by the same thread which called `start()`. 93 | 94 | # Arguments 95 | timeout: maximum time to wait on `thread.join()`. 96 | """ 97 | if self.is_running(): 98 | self._stop_event.set() 99 | 100 | for thread in self._threads: 101 | if thread.is_alive(): 102 | if self._use_multiprocessing: 103 | thread.terminate() 104 | else: 105 | thread.join(timeout) 106 | 107 | if self._use_multiprocessing: 108 | if self.queue is not None: 109 | self.queue.close() 110 | 111 | self._threads = [] 112 | self._stop_event = None 113 | self.queue = None 114 | 115 | def get(self): 116 | """Creates a generator to extract data from the queue. 117 | 118 | Skip the data if it is `None`. 
119 | 120 | # Returns 121 | A generator 122 | """ 123 | while self.is_running(): 124 | if not self.queue.empty(): 125 | inputs = self.queue.get() 126 | if inputs is not None: 127 | yield inputs 128 | else: 129 | time.sleep(self.wait_time) -------------------------------------------------------------------------------- /lib/dataset/img_aug.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import json 4 | import cv2 5 | import random 6 | import numpy as np 7 | import imageio 8 | import imgaug as ia 9 | import imgaug.augmenters as iaa 10 | from imgaug.augmentables.polys import Polygon 11 | from db_config import cfg 12 | 13 | 14 | def crop_area(im, polys, tags, crop_background=False, max_tries=50): 15 | ''' 16 | make random crop from the input image 17 | :param im: 18 | :param polys: 19 | :param tags: 20 | :param crop_background: 21 | :param max_tries: 22 | :return: 23 | ''' 24 | h, w, _ = im.shape 25 | pad_h = h // 10 26 | pad_w = w // 10 27 | h_array = np.zeros((h + pad_h * 2), dtype=np.int32) 28 | w_array = np.zeros((w + pad_w * 2), dtype=np.int32) 29 | for poly in polys: 30 | poly = np.round(poly, decimals=0).astype(np.int32) 31 | minx = np.min(poly[:, 0]) 32 | maxx = np.max(poly[:, 0]) 33 | w_array[minx + pad_w:maxx + pad_w] = 1 34 | miny = np.min(poly[:, 1]) 35 | maxy = np.max(poly[:, 1]) 36 | h_array[miny + pad_h:maxy + pad_h] = 1 37 | # ensure the cropped area not across a text 38 | h_axis = np.where(h_array == 0)[0] 39 | w_axis = np.where(w_array == 0)[0] 40 | if len(h_axis) == 0 or len(w_axis) == 0: 41 | return im, polys, tags 42 | for i in range(max_tries): 43 | xx = np.random.choice(w_axis, size=2) 44 | xmin = np.min(xx) - pad_w 45 | xmax = np.max(xx) - pad_w 46 | xmin = np.clip(xmin, 0, w - 1) 47 | xmax = np.clip(xmax, 0, w - 1) 48 | yy = np.random.choice(h_axis, size=2) 49 | ymin = np.min(yy) - pad_h 50 | ymax = np.max(yy) - pad_h 51 | ymin = np.clip(ymin, 0, h - 1) 52 | ymax = np.clip(ymax, 0, h - 1) 53 | 54 | if xmax - xmin < cfg.TRAIN.MIN_CROP_SIDE_RATIO * w or \ 55 | ymax - ymin < cfg.TRAIN.MIN_CROP_SIDE_RATIO * h: 56 | continue 57 | 58 | if polys.shape[0] != 0: 59 | poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \ 60 | & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax) 61 | selected_polys = np.where(np.sum(poly_axis_in_area, axis=1) == 4)[0] 62 | else: 63 | selected_polys = [] 64 | if len(selected_polys) == 0: 65 | # no text in this area 66 | if crop_background: 67 | return im[ymin:ymax + 1, xmin:xmax + 1, :], polys[selected_polys], tags[selected_polys] 68 | else: 69 | continue 70 | im = im[ymin:ymax + 1, xmin:xmax + 1, :] 71 | polys = polys[selected_polys] 72 | tags = tags[selected_polys] 73 | polys[:, :, 0] -= xmin 74 | polys[:, :, 1] -= ymin 75 | return im, polys, tags 76 | 77 | return im, polys, tags 78 | 79 | 80 | 81 | def det_aug(image, polys_np=None): 82 | """ 83 | 随机对图像做以下的增强操作 84 | :param image: cv2 read 85 | :param polys_np:[N, 4, 2] 86 | :return: 87 | """ 88 | aug_sample = random.sample(cfg.TRAIN.AUG_TOOL, 1)[0] #从数组中随机取出一个增强的功能 89 | 90 | ###################################################################################################### 91 | # blur-模糊 92 | aug = None 93 | # 高斯滤波 sigma 为1-10的保留小数点后一位的float的随机值,可根据情况调整 94 | if aug_sample == 'GaussianBlur': 95 | sigma = random.uniform(1, 2) 96 | sigma = round(8, 10) 97 | aug = iaa.GaussianBlur(sigma) 98 | 99 | # 平均模糊 k 为1-10的随机 奇 数,范围根据情况调整 100 | if aug_sample == 'AverageBlur': 101 | k = random.randint(8, 
10) * 2 + 1 102 | aug = iaa.AverageBlur(k) 103 | 104 | # 中值滤波 k 为1-10的随机 奇 数,范围根据情况调整 105 | if aug_sample == 'MedianBlur': 106 | k = random.randint(8, 10) * 2 + 1 107 | aug = iaa.MedianBlur(k) 108 | 109 | # 双边滤波 d=1 为 奇 数, sigma_color=(10, 250), sigma_space=(10, 250) 110 | if aug_sample == 'BilateralBlur': 111 | d = random.randint(0, 2) * 2 + 1 112 | sigma_color = random.randint(10, 250) 113 | sigma_space = random.randint(10, 250) 114 | aug = iaa.BilateralBlur(d, sigma_color, sigma_space) 115 | 116 | # 运动模糊 k=5 一定大于3 的 奇 数, angle=(0, 360), direction=(-1.0, 1.0) 117 | if aug_sample == 'MotionBlur': 118 | k = random.randint(15, 20) * 2 + 1 119 | angle = random.randint(0, 360) 120 | direction = random.uniform(-1, 1) 121 | direction = round(direction, 1) 122 | aug = iaa.MotionBlur(k, angle, direction) 123 | 124 | ###################################################################################################### 125 | # geometric 几何学 126 | 127 | # 弹性变换 128 | if aug_sample == 'ElasticTransformation': 129 | alpha = random.uniform(10, 20) 130 | alpha = round(alpha, 1) 131 | sigma = random.uniform(5, 10) 132 | sigma = round(sigma, 1) 133 | # print(alpha, sigma) 134 | aug = iaa.ElasticTransformation(alpha, sigma) 135 | 136 | # 透视 137 | if aug_sample == 'PerspectiveTransform': 138 | scale = random.uniform(0, 0.2) 139 | scale = round(scale, 3) 140 | aug = iaa.PerspectiveTransform(scale) 141 | 142 | # 旋转角度 143 | # if aug_sample == 'Affine_rot': 144 | # rotate = random.randint(-20, 20) 145 | # aug = iaa.Affine(rotate=rotate) 146 | 147 | # 缩放 148 | # if aug_sample == 'Affine_scale': 149 | # scale = random.uniform(0, 2) 150 | # scale = round(scale, 1) 151 | # aug = iaa.Affine(scale=scale) 152 | ###################################################################################################### 153 | # flip 镜像 154 | 155 | # 水平镜像 156 | # if aug_sample == 'Fliplr': 157 | # aug = iaa.Fliplr(1) 158 | # 159 | # 垂直镜像 160 | # if aug_sample == 'Flipud': 161 | # aug = iaa.Flipud(1) 162 | 163 | ###################################################################################################### 164 | # size 尺寸 165 | 166 | # if aug_sample == 'CropAndPad': 167 | # top = random.randint(0, 10) 168 | # right = random.randint(0, 10) 169 | # bottom = random.randint(0, 10) 170 | # left = random.randint(0, 10) 171 | # aug = iaa.CropAndPad(px=(top, right, bottom, left)) # 上 右 下 左 各crop多少像素,然后进行padding 172 | 173 | if aug_sample == 'Crop': 174 | top = random.randint(0, 10) 175 | right = random.randint(0, 10) 176 | bottom = random.randint(0, 10) 177 | left = random.randint(0, 10) 178 | aug = iaa.Crop(px=(top, right, bottom, left)) # 上 右 下 左 179 | 180 | if aug_sample == 'Pad': 181 | top = random.randint(0, 10) 182 | right = random.randint(0, 10) 183 | bottom = random.randint(0, 10) 184 | left = random.randint(0, 10) 185 | aug = iaa.Pad(px=(top, right, bottom, left)) # 上 右 下 左 186 | 187 | # if aug_sample == 'PadToFixedSize': 188 | # height = image.shape[0] + 32 189 | # width = image.shape[1] + 100 190 | # aug = iaa.PadToFixedSize(width=width, height=height)z 191 | 192 | # if aug_sample == 'CropToFixedSize': 193 | # height = image.shape[0] - 32 194 | # width = image.shape[1] - 100 195 | # aug = iaa.CropToFixedSize(width=width, height=height) 196 | 197 | if polys_np is not None: 198 | if aug is not None: 199 | # print(aug_sample) 200 | h, w, _ = image.shape 201 | boxes_info_list = [] 202 | for box in polys_np: 203 | boxes_info_list.append(Polygon(box)) 204 | 205 | psoi = ia.PolygonsOnImage(boxes_info_list, shape=image.shape) # 
生成单个图像上所有多边形的对象 206 | image, psoi_aug = aug(image=image, polygons=psoi) 207 | 208 | pts_list = [] 209 | for each_poly in psoi_aug.polygons: 210 | pts_list.append(np.array(each_poly.exterior).reshape((4, 2))) 211 | return image, np.array(pts_list, np.float32).reshape((-1, 4, 2)) 212 | else: 213 | 214 | return image, polys_np 215 | else: 216 | image = aug(image=image) 217 | return image 218 | -------------------------------------------------------------------------------- /lib/dataset/label_maker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from shapely.geometry import Polygon 5 | import pyclipper 6 | from db_config import cfg 7 | 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | def _distance(xs, ys, point_1, point_2): 12 | ''' 13 | compute the distance from point to a line 14 | ys: coordinates in the first axis 15 | xs: coordinates in the second axis 16 | point_1, point_2: (x, y), the end of the line 17 | ''' 18 | square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1]) 19 | square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1]) 20 | square_distance = np.square(point_1[0] - point_2[0]) + np.square(point_1[1] - point_2[1]) 21 | 22 | cosin = (square_distance - square_distance_1 - square_distance_2) / (2 * np.sqrt(square_distance_1 * square_distance_2)) 23 | square_sin = 1 - np.square(cosin) 24 | square_sin = np.nan_to_num(square_sin) 25 | 26 | result = np.sqrt(square_distance_1 * square_distance_2 * square_sin / square_distance) 27 | result[cosin < 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin < 0] 28 | return result 29 | 30 | def _extend_line(point1, point2, result): 31 | ex_point_1 = (int(round(point1[0] + (point1[0] - point2[0]) * (1 + cfg.SHRINK_RATIO))), 32 | int(round(point1[1] + (point1[1] - point2[1]) * (1 + cfg.SHRINK_RATIO)))) 33 | cv2.line(result, tuple(ex_point_1), tuple(point1), 4096.0, 1, lineType=cv2.LINE_AA, shift=0) 34 | ex_point_2 = (int(round(point2[0] + (point2[0] - point1[0]) * (1 + cfg.SHRINK_RATIO))), 35 | int(round(point2[1] + (point2[1] - point1[1]) * (1 + cfg.SHRINK_RATIO)))) 36 | cv2.line(result, tuple(ex_point_2), tuple(point2), 4096.0, 1, lineType=cv2.LINE_AA, shift=0) 37 | return ex_point_1, ex_point_2 38 | 39 | def _validate_polygons(polys, tags, h, w): 40 | 41 | if len(polys) == 0: 42 | return polys, tags 43 | for poly in polys: 44 | poly[:, 0] = np.clip(poly[:, 0], 0, w - 1) 45 | poly[:, 1] = np.clip(poly[:, 1], 0, h - 1) 46 | 47 | for i in range(len(polys)): 48 | area = Polygon(polys[i]).convex_hull.area 49 | # area = _polygon_area(polys[i]) 50 | # if abs(area) < 1: 51 | # tags[i] = True 52 | # if area > 0: 53 | # polys[i] = polys[i][::-1, :] 54 | if area <= cfg.TRAIN.MIN_AREA: 55 | tags[i] = True 56 | return polys, tags 57 | 58 | def _polygon_area(poly): 59 | edge = 0 60 | for i in range(poly.shape[0]): 61 | next_index = (i + 1) % poly.shape[0] 62 | edge += (poly[next_index, 0] - poly[i, 0]) * (poly[next_index, 1] - poly[i, 1]) 63 | return edge / 2. 
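
# Both make_score_map (shrink) and _draw_border_map (dilate) below offset each polygon
# by the distance D = A * (1 - r^2) / L, with A the polygon area, L its perimeter and
# r = cfg.SHRINK_RATIO, as in the DB paper. A toy sketch of that offset step
# (illustration only, the box and ratio are made-up values):
#
#   import pyclipper
#   from shapely.geometry import Polygon
#   box = [(0, 0), (100, 0), (100, 40), (0, 40)]
#   r = 0.4
#   d = Polygon(box).area * (1 - r ** 2) / Polygon(box).length
#   pco = pyclipper.PyclipperOffset()
#   pco.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
#   shrunk = pco.Execute(-d)    # the score map shrinks (-d); the border map dilates (+d)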
64 | 65 | 66 | def make_score_map(text_polys, tags, h, w): 67 | min_text_size = cfg.TRAIN.MIN_TEXT_SIZE 68 | shrink_ratio = cfg.SHRINK_RATIO 69 | 70 | text_polys, ignore_tags = _validate_polygons(text_polys, tags, h, w) 71 | score_map = np.zeros((h, w), dtype=np.float32) 72 | mask = np.ones((h, w), dtype=np.float32) 73 | 74 | for i in range(len(text_polys)): 75 | polygon = text_polys[i] 76 | height = max(polygon[:, 1]) - min(polygon[:, 1]) 77 | width = max(polygon[:, 0]) - min(polygon[:, 0]) 78 | if ignore_tags[i] or min(height, width) < min_text_size: 79 | cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0) 80 | ignore_tags[i] = True 81 | else: 82 | polygon_shape = Polygon(polygon) 83 | distance = polygon_shape.area * (1 - np.power(shrink_ratio, 2)) / polygon_shape.length 84 | subject = [tuple(l) for l in text_polys[i]] 85 | padding = pyclipper.PyclipperOffset() 86 | padding.AddPath(subject, pyclipper.JT_ROUND, 87 | pyclipper.ET_CLOSEDPOLYGON) 88 | shrinked = padding.Execute(-distance) 89 | if shrinked == []: 90 | cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0) 91 | ignore_tags[i] = True 92 | continue 93 | shrinked = np.array(shrinked[0]).reshape(-1, 2) 94 | cv2.fillPoly(score_map, [shrinked.astype(np.int32)], 1) 95 | 96 | return score_map, mask 97 | 98 | def make_border_map(text_polys, tags, h, w): 99 | 100 | canvas = np.zeros([h, w], dtype=np.float32) 101 | mask = np.zeros([h, w], dtype=np.float32) 102 | 103 | for i in range(len(text_polys)): 104 | if tags[i]: 105 | continue 106 | canvas, mask = _draw_border_map(text_polys[i], canvas, mask) 107 | threshold_map = canvas * (cfg.THRESH_MAX - cfg.THRESH_MIN) + cfg.THRESH_MIN 108 | 109 | return threshold_map, mask 110 | 111 | def _draw_border_map(poly, canvas, mask): 112 | poly = np.array(poly).copy() 113 | assert poly.ndim == 2 114 | assert poly.shape[1] == 2 115 | 116 | poly_shape = Polygon(poly) 117 | if poly_shape.area <= 0: 118 | return 119 | distance = poly_shape.area * (1 - np.power(cfg.SHRINK_RATIO, 2)) / poly_shape.length 120 | subject = [tuple(l) for l in poly] 121 | padding = pyclipper.PyclipperOffset() 122 | padding.AddPath(subject, pyclipper.JT_ROUND, 123 | pyclipper.ET_CLOSEDPOLYGON) 124 | 125 | padded_polygon = np.array(padding.Execute(distance)[0]) 126 | cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0) 127 | 128 | xmin = padded_polygon[:, 0].min() 129 | xmax = padded_polygon[:, 0].max() 130 | ymin = padded_polygon[:, 1].min() 131 | ymax = padded_polygon[:, 1].max() 132 | width = xmax - xmin + 1 133 | height = ymax - ymin + 1 134 | 135 | poly[:, 0] = poly[:, 0] - xmin 136 | poly[:, 1] = poly[:, 1] - ymin 137 | 138 | xs = np.broadcast_to( 139 | np.linspace(0, width - 1, num=width).reshape(1, width), (height, width)) 140 | ys = np.broadcast_to( 141 | np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width)) 142 | 143 | distance_map = np.zeros( 144 | (poly.shape[0], height, width), dtype=np.float32) 145 | for i in range(poly.shape[0]): 146 | j = (i + 1) % poly.shape[0] 147 | absolute_distance = _distance(xs, ys, poly[i], poly[j]) 148 | distance_map[i] = np.clip(absolute_distance / distance, 0, 1) 149 | distance_map = distance_map.min(axis=0) 150 | 151 | xmin_valid = min(max(0, xmin), canvas.shape[1] - 1) 152 | xmax_valid = min(max(0, xmax), canvas.shape[1] - 1) 153 | ymin_valid = min(max(0, ymin), canvas.shape[0] - 1) 154 | ymax_valid = min(max(0, ymax), canvas.shape[0] - 1) 155 | # print(xmin_valid, xmax_valid, ymin_valid, ymax_valid) 156 | # print(xmin, xmax, ymin, 
ymax) 157 | # print(distance_map.shape) 158 | # print(distance_map[ 159 | # ymin_valid - ymin:ymax_valid - ymax + height, 160 | # xmin_valid - xmin:xmax_valid - xmax + width].shape) 161 | canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax( 162 | 1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height, xmin_valid - xmin:xmax_valid - xmax + width], 163 | canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]) 164 | 165 | return canvas, mask -------------------------------------------------------------------------------- /lib/networks/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from db_config import cfg 3 | 4 | 5 | def dice_coefficient_loss(y_true_cls, y_pred_cls, 6 | training_mask): 7 | ''' 8 | dice loss 9 | :param y_true_cls: 10 | :param y_pred_cls: 11 | :param training_mask: 12 | :return: 13 | ''' 14 | eps = 1e-6 15 | intersection = tf.reduce_sum(y_true_cls * y_pred_cls * training_mask) 16 | union = tf.reduce_sum(y_true_cls * training_mask) + tf.reduce_sum(y_pred_cls * training_mask) + eps 17 | loss = 1. - (2 * intersection / union) 18 | return loss 19 | 20 | 21 | def balance_cross_entropy_loss(gt, pred, mask, 22 | negative_ratio=3.0, eps=1e-6): 23 | positive = gt * mask 24 | negative = (1 - gt) * mask 25 | positive_count = tf.reduce_sum(positive) 26 | negative_count = tf.minimum(tf.reduce_sum(negative), positive_count * negative_ratio) 27 | negative_count = tf.cast(negative_count, tf.int32) 28 | gt = tf.reshape(gt, [-1, 1]) 29 | pred = tf.reshape(pred, [-1, 1]) 30 | cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=gt, logits=pred) 31 | positive_loss = cross_entropy * positive 32 | negative_loss = cross_entropy * negative 33 | negative_loss, _ = tf.nn.top_k(tf.reshape(negative_loss, [-1]), negative_count) 34 | 35 | negative_count = tf.cast(negative_count, tf.float32) 36 | balance_loss = (tf.reduce_sum(positive_loss) + tf.reduce_sum(negative_loss)) / (positive_count + negative_count + eps) 37 | 38 | return balance_loss 39 | 40 | def softmax_cross_entropy_loss(y_true_cls, y_pred_cls, training_mask): 41 | ''' 42 | softmax_cross_entropy(SCE) loss 43 | :param y_true_cls:[bs,w,h,N] 44 | :param y_pred_cls:[bs,w,h,N] 45 | :param training_mask: 46 | :return: 47 | ''' 48 | re_mask = 1 - training_mask 49 | zero_mask = tf.zeros(tf.shape(re_mask)) 50 | add_mask = tf.concat((re_mask, zero_mask, zero_mask), axis=3) 51 | 52 | y_true_cls = y_true_cls * training_mask + add_mask 53 | y_pred_cls = y_pred_cls * training_mask + add_mask 54 | 55 | y_true_cls = tf.reshape(y_true_cls, [-1, tf.shape(y_true_cls)[-1]]) 56 | y_pred_cls = tf.reshape(y_pred_cls, [-1, tf.shape(y_true_cls)[-1]]) 57 | 58 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_true_cls, logits=y_pred_cls) 59 | cls_loss = tf.reduce_mean(cross_entropy) 60 | 61 | return cls_loss 62 | 63 | def l1_loss(pred, gt, mask): 64 | 65 | loss = tf.reduce_mean(tf.abs(pred - gt) * mask) + 1e-6 66 | 67 | return loss 68 | 69 | 70 | def smooth_l1_loss(pred, gt, mask, sigma=1.0): 71 | ''' 72 | 73 | :param pred: 74 | :param gt: shape is same as pred 75 | :param sigma: 76 | :return: 77 | ''' 78 | sigma2 = sigma**2 79 | 80 | diff = pred * mask - gt 81 | 82 | with tf.name_scope('smooth_l1_loss'): 83 | deltas_abs = tf.abs(diff) 84 | smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0 / sigma2), tf.float32) 85 | return tf.reduce_mean(tf.square(diff) * 0.5 * sigma2 * smoothL1_sign + \ 86 | (deltas_abs - 0.5 / sigma2) * 
tf.abs(smoothL1_sign - 1)) 87 | 88 | def compute_cls_acc(pred, gt, mask): 89 | 90 | zero = tf.zeros_like(pred, tf.float32) 91 | one = tf.ones_like(pred, tf.float32) 92 | 93 | pred = tf.where(pred < 0.3, x=zero, y=one) 94 | acc = tf.reduce_mean(tf.cast(tf.equal(pred * mask, gt * mask), tf.float32)) 95 | 96 | return acc 97 | 98 | 99 | def compute_loss(binarize_map, threshold_map, thresh_binary, 100 | gt_score_maps, gt_threshold_map, gt_score_mask, gt_thresh_mask): 101 | 102 | binarize_loss = dice_coefficient_loss(gt_score_maps, binarize_map, gt_score_mask) 103 | threshold_loss = l1_loss(threshold_map, gt_threshold_map, gt_thresh_mask) 104 | thresh_binary_loss = dice_coefficient_loss(gt_score_maps, thresh_binary, gt_score_mask) 105 | 106 | model_loss = cfg.TRAIN.LOSS_ALPHA * binarize_loss + cfg.TRAIN.LOSS_BETA * threshold_loss + thresh_binary_loss 107 | 108 | tf.summary.scalar('losses/binarize_loss', binarize_loss) 109 | tf.summary.scalar('losses/threshold_loss', threshold_loss) 110 | tf.summary.scalar('losses/thresh_binary_loss', thresh_binary_loss) 111 | return model_loss 112 | 113 | def compute_acc(binarize_map, threshold_map, thresh_binary, 114 | gt_score_maps, gt_threshold_map, gt_score_mask, gt_thresh_mask): 115 | binarize_acc = compute_cls_acc(binarize_map, gt_score_maps, gt_score_mask) 116 | thresh_binary_acc = compute_cls_acc(thresh_binary, gt_score_maps, gt_score_mask) 117 | 118 | tf.summary.scalar('acc/binarize_acc', binarize_acc) 119 | tf.summary.scalar('acc/thresh_binary_acc', thresh_binary_acc) 120 | 121 | return binarize_acc, thresh_binary_acc 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /lib/networks/mobilenet/conv_blocks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Convolution blocks for mobilenet.""" 16 | import contextlib 17 | import functools 18 | 19 | import tensorflow as tf 20 | from tensorflow.contrib import slim as contrib_slim 21 | 22 | slim = contrib_slim 23 | 24 | 25 | def _fixed_padding(inputs, kernel_size, rate=1): 26 | """Pads the input along the spatial dimensions independently of input size. 27 | Pads the input such that if it was used in a convolution with 'VALID' padding, 28 | the output would have the same dimensions as if the unpadded input was used 29 | in a convolution with 'SAME' padding. 30 | Args: 31 | inputs: A tensor of size [batch, height_in, width_in, channels]. 32 | kernel_size: The kernel to be used in the conv2d or max_pool2d operation. 33 | rate: An integer, rate for atrous convolution. 34 | Returns: 35 | output: A tensor of size [batch, height_out, width_out, channels] with the 36 | input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). 
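  For example, kernel_size=[3, 3] with rate=1 pads the input by one pixel on each
  side, and rate=2 pads it by two pixels on each side.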
37 | """ 38 | kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), 39 | kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] 40 | pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] 41 | pad_beg = [pad_total[0] // 2, pad_total[1] // 2] 42 | pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] 43 | padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], 44 | [pad_beg[1], pad_end[1]], [0, 0]]) 45 | return padded_inputs 46 | 47 | 48 | def _make_divisible(v, divisor, min_value=None): 49 | if min_value is None: 50 | min_value = divisor 51 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 52 | # Make sure that round down does not go down by more than 10%. 53 | if new_v < 0.9 * v: 54 | new_v += divisor 55 | return new_v 56 | 57 | 58 | def _split_divisible(num, num_ways, divisible_by=8): 59 | """Evenly splits num, num_ways so each piece is a multiple of divisible_by.""" 60 | assert num % divisible_by == 0 61 | assert num / num_ways >= divisible_by 62 | # Note: want to round down, we adjust each split to match the total. 63 | base = num // num_ways // divisible_by * divisible_by 64 | result = [] 65 | accumulated = 0 66 | for i in range(num_ways): 67 | r = base 68 | while accumulated + r < num * (i + 1) / num_ways: 69 | r += divisible_by 70 | result.append(r) 71 | accumulated += r 72 | assert accumulated == num 73 | return result 74 | 75 | 76 | @contextlib.contextmanager 77 | def _v1_compatible_scope_naming(scope): 78 | if scope is None: # Create uniqified separable blocks. 79 | with tf.variable_scope(None, default_name='separable') as s, \ 80 | tf.name_scope(s.original_name_scope): 81 | yield '' 82 | else: 83 | # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts. 84 | # which provide numbered scopes. 85 | scope += '_' 86 | yield scope 87 | 88 | 89 | @slim.add_arg_scope 90 | def split_separable_conv2d(input_tensor, 91 | num_outputs, 92 | scope=None, 93 | normalizer_fn=None, 94 | stride=1, 95 | rate=1, 96 | endpoints=None, 97 | use_explicit_padding=False): 98 | """Separable mobilenet V1 style convolution. 99 | Depthwise convolution, with default non-linearity, 100 | followed by 1x1 depthwise convolution. This is similar to 101 | slim.separable_conv2d, but differs in tha it applies batch 102 | normalization and non-linearity to depthwise. This matches 103 | the basic building of Mobilenet Paper 104 | (https://arxiv.org/abs/1704.04861) 105 | Args: 106 | input_tensor: input 107 | num_outputs: number of outputs 108 | scope: optional name of the scope. Note if provided it will use 109 | scope_depthwise for deptwhise, and scope_pointwise for pointwise. 110 | normalizer_fn: which normalizer function to use for depthwise/pointwise 111 | stride: stride 112 | rate: output rate (also known as dilation rate) 113 | endpoints: optional, if provided, will export additional tensors to it. 114 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 115 | inputs so that the output dimensions are the same as if 'SAME' padding 116 | were used. 
117 | Returns: 118 | output tesnor 119 | """ 120 | 121 | with _v1_compatible_scope_naming(scope) as scope: 122 | dw_scope = scope + 'depthwise' 123 | endpoints = endpoints if endpoints is not None else {} 124 | kernel_size = [3, 3] 125 | padding = 'SAME' 126 | if use_explicit_padding: 127 | padding = 'VALID' 128 | input_tensor = _fixed_padding(input_tensor, kernel_size, rate) 129 | net = slim.separable_conv2d( 130 | input_tensor, 131 | None, 132 | kernel_size, 133 | depth_multiplier=1, 134 | stride=stride, 135 | rate=rate, 136 | normalizer_fn=normalizer_fn, 137 | padding=padding, 138 | scope=dw_scope) 139 | 140 | endpoints[dw_scope] = net 141 | 142 | pw_scope = scope + 'pointwise' 143 | net = slim.conv2d( 144 | net, 145 | num_outputs, [1, 1], 146 | stride=1, 147 | normalizer_fn=normalizer_fn, 148 | scope=pw_scope) 149 | endpoints[pw_scope] = net 150 | return net 151 | 152 | 153 | def expand_input_by_factor(n, divisible_by=8): 154 | return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by) 155 | 156 | 157 | def split_conv(input_tensor, 158 | num_outputs, 159 | num_ways, 160 | scope, 161 | divisible_by=8, 162 | **kwargs): 163 | """Creates a split convolution. 164 | Split convolution splits the input and output into 165 | 'num_blocks' blocks of approximately the same size each, 166 | and only connects $i$-th input to $i$ output. 167 | Args: 168 | input_tensor: input tensor 169 | num_outputs: number of output filters 170 | num_ways: num blocks to split by. 171 | scope: scope for all the operators. 172 | divisible_by: make sure that every part is divisiable by this. 173 | **kwargs: will be passed directly into conv2d operator 174 | Returns: 175 | tensor 176 | """ 177 | b = input_tensor.get_shape().as_list()[3] 178 | 179 | if num_ways == 1 or min(b // num_ways, 180 | num_outputs // num_ways) < divisible_by: 181 | # Don't do any splitting if we end up with less than 8 filters 182 | # on either side. 183 | return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs) 184 | 185 | outs = [] 186 | input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by) 187 | output_splits = _split_divisible( 188 | num_outputs, num_ways, divisible_by=divisible_by) 189 | inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope) 190 | base = scope 191 | for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)): 192 | scope = base + '_part_%d' % (i,) 193 | n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs) 194 | n = tf.identity(n, scope + '_output') 195 | outs.append(n) 196 | return tf.concat(outs, 3, name=scope + '_concat') 197 | 198 | 199 | @slim.add_arg_scope 200 | def expanded_conv(input_tensor, 201 | num_outputs, 202 | expansion_size=expand_input_by_factor(6), 203 | stride=1, 204 | rate=1, 205 | kernel_size=(3, 3), 206 | residual=True, 207 | normalizer_fn=None, 208 | split_projection=1, 209 | split_expansion=1, 210 | split_divisible_by=8, 211 | expansion_transform=None, 212 | depthwise_location='expansion', 213 | depthwise_channel_multiplier=1, 214 | endpoints=None, 215 | use_explicit_padding=False, 216 | padding='SAME', 217 | inner_activation_fn=None, 218 | depthwise_activation_fn=None, 219 | project_activation_fn=tf.identity, 220 | depthwise_fn=slim.separable_conv2d, 221 | expansion_fn=split_conv, 222 | projection_fn=split_conv, 223 | scope=None): 224 | """Depthwise Convolution Block with expansion. 
225 | Builds a composite convolution that has the following structure 226 | expansion (1x1) -> depthwise (kernel_size) -> projection (1x1) 227 | Args: 228 | input_tensor: input 229 | num_outputs: number of outputs in the final layer. 230 | expansion_size: the size of expansion, could be a constant or a callable. 231 | If latter it will be provided 'num_inputs' as an input. For forward 232 | compatibility it should accept arbitrary keyword arguments. 233 | Default will expand the input by factor of 6. 234 | stride: depthwise stride 235 | rate: depthwise rate 236 | kernel_size: depthwise kernel 237 | residual: whether to include residual connection between input 238 | and output. 239 | normalizer_fn: batchnorm or otherwise 240 | split_projection: how many ways to split projection operator 241 | (that is conv expansion->bottleneck) 242 | split_expansion: how many ways to split expansion op 243 | (that is conv bottleneck->expansion) ops will keep depth divisible 244 | by this value. 245 | split_divisible_by: make sure every split group is divisible by this number. 246 | expansion_transform: Optional function that takes expansion 247 | as a single input and returns output. 248 | depthwise_location: where to put depthwise covnvolutions supported 249 | values None, 'input', 'output', 'expansion' 250 | depthwise_channel_multiplier: depthwise channel multiplier: 251 | each input will replicated (with different filters) 252 | that many times. So if input had c channels, 253 | output will have c x depthwise_channel_multpilier. 254 | endpoints: An optional dictionary into which intermediate endpoints are 255 | placed. The keys "expansion_output", "depthwise_output", 256 | "projection_output" and "expansion_transform" are always populated, even 257 | if the corresponding functions are not invoked. 258 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 259 | inputs so that the output dimensions are the same as if 'SAME' padding 260 | were used. 261 | padding: Padding type to use if `use_explicit_padding` is not set. 262 | inner_activation_fn: activation function to use in all inner convolutions. 263 | If none, will rely on slim default scopes. 264 | depthwise_activation_fn: activation function to use for deptwhise only. 265 | If not provided will rely on slim default scopes. If both 266 | inner_activation_fn and depthwise_activation_fn are provided, 267 | depthwise_activation_fn takes precedence over inner_activation_fn. 268 | project_activation_fn: activation function for the project layer. 269 | (note this layer is not affected by inner_activation_fn) 270 | depthwise_fn: Depthwise convolution function. 271 | expansion_fn: Expansion convolution function. If use custom function then 272 | "split_expansion" and "split_divisible_by" will be ignored. 273 | projection_fn: Projection convolution function. If use custom function then 274 | "split_projection" and "split_divisible_by" will be ignored. 275 | scope: optional scope. 
276 | Returns: 277 | Tensor of depth num_outputs 278 | Raises: 279 | TypeError: on inval 280 | """ 281 | conv_defaults = {} 282 | dw_defaults = {} 283 | if inner_activation_fn is not None: 284 | conv_defaults['activation_fn'] = inner_activation_fn 285 | dw_defaults['activation_fn'] = inner_activation_fn 286 | if depthwise_activation_fn is not None: 287 | dw_defaults['activation_fn'] = depthwise_activation_fn 288 | # pylint: disable=g-backslash-continuation 289 | with tf.variable_scope(scope, default_name='expanded_conv') as s, \ 290 | tf.name_scope(s.original_name_scope), \ 291 | slim.arg_scope((slim.conv2d,), **conv_defaults), \ 292 | slim.arg_scope((slim.separable_conv2d,), **dw_defaults): 293 | prev_depth = input_tensor.get_shape().as_list()[3] 294 | if depthwise_location not in [None, 'input', 'output', 'expansion']: 295 | raise TypeError('%r is unknown value for depthwise_location' % 296 | depthwise_location) 297 | if use_explicit_padding: 298 | if padding != 'SAME': 299 | raise TypeError('`use_explicit_padding` should only be used with ' 300 | '"SAME" padding.') 301 | padding = 'VALID' 302 | depthwise_func = functools.partial( 303 | depthwise_fn, 304 | num_outputs=None, 305 | kernel_size=kernel_size, 306 | depth_multiplier=depthwise_channel_multiplier, 307 | stride=stride, 308 | rate=rate, 309 | normalizer_fn=normalizer_fn, 310 | padding=padding, 311 | scope='depthwise') 312 | # b1 -> b2 * r -> b2 313 | # i -> (o * r) (bottleneck) -> o 314 | input_tensor = tf.identity(input_tensor, 'input') 315 | net = input_tensor 316 | 317 | if depthwise_location == 'input': 318 | if use_explicit_padding: 319 | net = _fixed_padding(net, kernel_size, rate) 320 | net = depthwise_func(net, activation_fn=None) 321 | net = tf.identity(net, name='depthwise_output') 322 | if endpoints is not None: 323 | endpoints['depthwise_output'] = net 324 | 325 | if callable(expansion_size): 326 | inner_size = expansion_size(num_inputs=prev_depth) 327 | else: 328 | inner_size = expansion_size 329 | 330 | if inner_size > net.shape[3]: 331 | if expansion_fn == split_conv: 332 | expansion_fn = functools.partial( 333 | expansion_fn, 334 | num_ways=split_expansion, 335 | divisible_by=split_divisible_by, 336 | stride=1) 337 | net = expansion_fn( 338 | net, 339 | inner_size, 340 | scope='expand', 341 | normalizer_fn=normalizer_fn) 342 | net = tf.identity(net, 'expansion_output') 343 | if endpoints is not None: 344 | endpoints['expansion_output'] = net 345 | 346 | if depthwise_location == 'expansion': 347 | if use_explicit_padding: 348 | net = _fixed_padding(net, kernel_size, rate) 349 | net = depthwise_func(net) 350 | net = tf.identity(net, name='depthwise_output') 351 | if endpoints is not None: 352 | endpoints['depthwise_output'] = net 353 | 354 | if expansion_transform: 355 | net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor) 356 | # Note in contrast with expansion, we always have 357 | # projection to produce the desired output size. 
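# The 1x1 projection below is the last stage of this block's
# expansion -> depthwise -> projection structure. It is linear by default
# (project_activation_fn=tf.identity), i.e. the "linear bottleneck" of
# MobileNet V2: with 32 input channels and the default expansion factor of 6
# the block computes 32 -> 192 (1x1 expand) -> 192 (3x3 depthwise) -> 32
# (1x1 project), and the residual connection at the end of the function is
# only added when stride == 1 and the input/output depths match.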
358 | if projection_fn == split_conv: 359 | projection_fn = functools.partial( 360 | projection_fn, 361 | num_ways=split_projection, 362 | divisible_by=split_divisible_by, 363 | stride=1) 364 | net = projection_fn( 365 | net, 366 | num_outputs, 367 | scope='project', 368 | normalizer_fn=normalizer_fn, 369 | activation_fn=project_activation_fn) 370 | if endpoints is not None: 371 | endpoints['projection_output'] = net 372 | if depthwise_location == 'output': 373 | if use_explicit_padding: 374 | net = _fixed_padding(net, kernel_size, rate) 375 | net = depthwise_func(net, activation_fn=None) 376 | net = tf.identity(net, name='depthwise_output') 377 | if endpoints is not None: 378 | endpoints['depthwise_output'] = net 379 | 380 | if callable(residual): # custom residual 381 | net = residual(input_tensor=input_tensor, output_tensor=net) 382 | elif (residual and 383 | # stride check enforces that we don't add residuals when spatial 384 | # dimensions are None 385 | stride == 1 and 386 | # Depth matches 387 | net.get_shape().as_list()[3] == 388 | input_tensor.get_shape().as_list()[3]): 389 | net += input_tensor 390 | return tf.identity(net, name='output') 391 | 392 | 393 | @slim.add_arg_scope 394 | def squeeze_excite(input_tensor, 395 | divisible_by=8, 396 | squeeze_factor=3, 397 | inner_activation_fn=tf.nn.relu, 398 | gating_fn=tf.sigmoid, 399 | squeeze_input_tensor=None, 400 | pool=None): 401 | """Squeeze excite block for Mobilenet V3. 402 | Args: 403 | input_tensor: input tensor to apply SE block to. 404 | divisible_by: ensures all inner dimensions are divisible by this number. 405 | squeeze_factor: the factor of squeezing in the inner fully connected layer 406 | inner_activation_fn: non-linearity to be used in inner layer. 407 | gating_fn: non-linearity to be used for final gating function 408 | squeeze_input_tensor: custom tensor to use for computing gating activation. 409 | If provided the result will be input_tensor * SE(squeeze_input_tensor) 410 | instead of input_tensor * SE(input_tensor). 411 | pool: if number is provided will average pool with that kernel size 412 | to compute inner tensor, followed by bilinear upsampling. 413 | Returns: 414 | Gated input_tensor. (e.g. X * SE(X)) 415 | """ 416 | with tf.variable_scope('squeeze_excite'): 417 | if squeeze_input_tensor is None: 418 | squeeze_input_tensor = input_tensor 419 | input_size = input_tensor.shape.as_list()[1:3] 420 | pool_height, pool_width = squeeze_input_tensor.shape.as_list()[1:3] 421 | stride = 1 422 | if pool is not None and pool_height >= pool: 423 | pool_height, pool_width, stride = pool, pool, pool 424 | input_channels = squeeze_input_tensor.shape.as_list()[3] 425 | output_channels = input_tensor.shape.as_list()[3] 426 | squeeze_channels = _make_divisible( 427 | input_channels / squeeze_factor, divisor=divisible_by) 428 | 429 | pooled = tf.nn.avg_pool(squeeze_input_tensor, 430 | (1, pool_height, pool_width, 1), 431 | strides=(1, stride, stride, 1), 432 | padding='VALID') 433 | squeeze = slim.conv2d( 434 | pooled, 435 | kernel_size=(1, 1), 436 | num_outputs=squeeze_channels, 437 | normalizer_fn=None, 438 | activation_fn=inner_activation_fn) 439 | excite_outputs = output_channels 440 | excite = slim.conv2d(squeeze, num_outputs=excite_outputs, 441 | kernel_size=[1, 1], 442 | normalizer_fn=None, 443 | activation_fn=gating_fn) 444 | if pool is not None: 445 | # Note: As of 03/20/2019 only BILINEAR (the default) with 446 | # align_corners=True has gradients implemented in TPU. 
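# When a pool size was given, the gate above was computed on an average-pooled
# grid, so it is bilinearly resized back to the input resolution here before
# the elementwise multiply; without pooling, excite is already 1x1xC and
# gates input_tensor by broadcasting.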
447 | excite = tf.image.resize_images( 448 | excite, input_size, 449 | align_corners=True) 450 | result = input_tensor * excite 451 | return result -------------------------------------------------------------------------------- /lib/networks/mobilenet/mobilenet.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Mobilenet Base Class.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | import collections 21 | import contextlib 22 | import copy 23 | import os 24 | 25 | import tensorflow as tf 26 | from tensorflow.contrib import slim as contrib_slim 27 | 28 | slim = contrib_slim 29 | 30 | 31 | @slim.add_arg_scope 32 | def apply_activation(x, name=None, activation_fn=None): 33 | return activation_fn(x, name=name) if activation_fn else x 34 | 35 | 36 | def _fixed_padding(inputs, kernel_size, rate=1): 37 | """Pads the input along the spatial dimensions independently of input size. 38 | 39 | Pads the input such that if it was used in a convolution with 'VALID' padding, 40 | the output would have the same dimensions as if the unpadded input was used 41 | in a convolution with 'SAME' padding. 42 | 43 | Args: 44 | inputs: A tensor of size [batch, height_in, width_in, channels]. 45 | kernel_size: The kernel to be used in the conv2d or max_pool2d operation. 46 | rate: An integer, rate for atrous convolution. 47 | 48 | Returns: 49 | output: A tensor of size [batch, height_out, width_out, channels] with the 50 | input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). 51 | """ 52 | kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), 53 | kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] 54 | pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] 55 | pad_beg = [pad_total[0] // 2, pad_total[1] // 2] 56 | pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] 57 | padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], 58 | [pad_beg[1], pad_end[1]], [0, 0]]) 59 | return padded_inputs 60 | 61 | 62 | def _make_divisible(v, divisor, min_value=None): 63 | if min_value is None: 64 | min_value = divisor 65 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 66 | # Make sure that round down does not go down by more than 10%. 67 | if new_v < 0.9 * v: 68 | new_v += divisor 69 | return int(new_v) 70 | 71 | 72 | @contextlib.contextmanager 73 | def _set_arg_scope_defaults(defaults): 74 | """Sets arg scope defaults for all items present in defaults. 75 | 76 | Args: 77 | defaults: dictionary/list of pairs, containing a mapping from 78 | function to a dictionary of default args. 79 | 80 | Yields: 81 | context manager where all defaults are set. 
82 | """ 83 | if hasattr(defaults, 'items'): 84 | items = list(defaults.items()) 85 | else: 86 | items = defaults 87 | if not items: 88 | yield 89 | else: 90 | func, default_arg = items[0] 91 | with slim.arg_scope(func, **default_arg): 92 | with _set_arg_scope_defaults(items[1:]): 93 | yield 94 | 95 | 96 | @slim.add_arg_scope 97 | def depth_multiplier(output_params, 98 | multiplier, 99 | divisible_by=8, 100 | min_depth=8, 101 | **unused_kwargs): 102 | if 'num_outputs' not in output_params: 103 | return 104 | d = output_params['num_outputs'] 105 | output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by, 106 | min_depth) 107 | 108 | 109 | _Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func']) 110 | 111 | 112 | def op(opfunc, multiplier_func=depth_multiplier, **params): 113 | multiplier = params.pop('multiplier_transform', multiplier_func) 114 | return _Op(opfunc, params=params, multiplier_func=multiplier) 115 | 116 | 117 | class NoOpScope(object): 118 | """No-op context manager.""" 119 | 120 | def __enter__(self): 121 | return None 122 | 123 | def __exit__(self, exc_type, exc_value, traceback): 124 | return False 125 | 126 | 127 | def safe_arg_scope(funcs, **kwargs): 128 | """Returns `slim.arg_scope` with all None arguments removed. 129 | 130 | Arguments: 131 | funcs: Functions to pass to `arg_scope`. 132 | **kwargs: Arguments to pass to `arg_scope`. 133 | 134 | Returns: 135 | arg_scope or No-op context manager. 136 | 137 | Note: can be useful if None value should be interpreted as "do not overwrite 138 | this parameter value". 139 | """ 140 | filtered_args = {name: value for name, value in kwargs.items() 141 | if value is not None} 142 | if filtered_args: 143 | return slim.arg_scope(funcs, **filtered_args) 144 | else: 145 | return NoOpScope() 146 | 147 | 148 | @slim.add_arg_scope 149 | def mobilenet_base( # pylint: disable=invalid-name 150 | inputs, 151 | conv_defs, 152 | multiplier=1.0, 153 | final_endpoint=None, 154 | output_stride=None, 155 | use_explicit_padding=False, 156 | scope=None, 157 | is_training=False): 158 | """Mobilenet base network. 159 | 160 | Constructs a network from inputs to the given final endpoint. By default 161 | the network is constructed in inference mode. To create network 162 | in training mode use: 163 | 164 | with slim.arg_scope(mobilenet.training_scope()): 165 | logits, endpoints = mobilenet_base(...) 166 | 167 | Args: 168 | inputs: a tensor of shape [batch_size, height, width, channels]. 169 | conv_defs: A list of op(...) layers specifying the net architecture. 170 | multiplier: Float multiplier for the depth (number of channels) 171 | for all convolution ops. The value must be greater than zero. Typical 172 | usage will be to set this value in (0, 1) to reduce the number of 173 | parameters or computation cost of the model. 174 | final_endpoint: The name of last layer, for early termination for 175 | for V1-based networks: last layer is "layer_14", for V2: "layer_20" 176 | output_stride: An integer that specifies the requested ratio of input to 177 | output spatial resolution. If not None, then we invoke atrous convolution 178 | if necessary to prevent the network from reducing the spatial resolution 179 | of the activation maps. Allowed values are 1 or any even number, excluding 180 | zero. Typical values are 8 (accurate fully convolutional mode), 16 181 | (fast fully convolutional mode), and 32 (classification mode). 
182 | 183 | NOTE- output_stride relies on all consequent operators to support dilated 184 | operators via "rate" parameter. This might require wrapping non-conv 185 | operators to operate properly. 186 | 187 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 188 | inputs so that the output dimensions are the same as if 'SAME' padding 189 | were used. 190 | scope: optional variable scope. 191 | is_training: How to setup batch_norm and other ops. Note: most of the time 192 | this does not need be set directly. Use mobilenet.training_scope() to set 193 | up training instead. This parameter is here for backward compatibility 194 | only. It is safe to set it to the value matching 195 | training_scope(is_training=...). It is also safe to explicitly set 196 | it to False, even if there is outer training_scope set to to training. 197 | (The network will be built in inference mode). If this is set to None, 198 | no arg_scope is added for slim.batch_norm's is_training parameter. 199 | 200 | Returns: 201 | tensor_out: output tensor. 202 | end_points: a set of activations for external use, for example summaries or 203 | losses. 204 | 205 | Raises: 206 | ValueError: depth_multiplier <= 0, or the target output_stride is not 207 | allowed. 208 | """ 209 | if multiplier <= 0: 210 | raise ValueError('multiplier is not greater than zero.') 211 | 212 | # Set conv defs defaults and overrides. 213 | conv_defs_defaults = conv_defs.get('defaults', {}) 214 | conv_defs_overrides = conv_defs.get('overrides', {}) 215 | if use_explicit_padding: 216 | conv_defs_overrides = copy.deepcopy(conv_defs_overrides) 217 | conv_defs_overrides[ 218 | (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'} 219 | 220 | if output_stride is not None: 221 | if output_stride == 0 or (output_stride > 1 and output_stride % 2): 222 | raise ValueError('Output stride must be None, 1 or a multiple of 2.') 223 | 224 | # a) Set the tensorflow scope 225 | # b) set padding to default: note we might consider removing this 226 | # since it is also set by mobilenet_scope 227 | # c) set all defaults 228 | # d) set all extra overrides. 229 | # pylint: disable=g-backslash-continuation 230 | with _scope_all(scope, default_scope='Mobilenet'), \ 231 | safe_arg_scope([slim.batch_norm], is_training=is_training), \ 232 | _set_arg_scope_defaults(conv_defs_defaults), \ 233 | _set_arg_scope_defaults(conv_defs_overrides): 234 | # The current_stride variable keeps track of the output stride of the 235 | # activations, i.e., the running product of convolution strides up to the 236 | # current network layer. This allows us to invoke atrous convolution 237 | # whenever applying the next convolution would result in the activations 238 | # having output stride larger than the target output_stride. 239 | current_stride = 1 240 | 241 | # The atrous convolution rate parameter. 242 | rate = 1 243 | 244 | net = inputs 245 | # Insert default parameters before the base scope which includes 246 | # any custom overrides set in mobilenet. 247 | end_points = {} 248 | scopes = {} 249 | for i, opdef in enumerate(conv_defs['spec']): 250 | params = dict(opdef.params) 251 | opdef.multiplier_func(params, multiplier) 252 | stride = params.get('stride', 1) 253 | if output_stride is not None and current_stride == output_stride: 254 | # If we have reached the target output_stride, then we need to employ 255 | # atrous convolution with stride=1 and multiply the atrous rate by the 256 | # current unit's stride for use in subsequent layers. 
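# Example of this bookkeeping: with output_stride=16, once the running
# product of strides reaches 16 every remaining stride-2 op is applied with
# stride 1 and its stride is folded into the atrous rate instead
# (rate: 1 -> 2 -> 4 ...), so the feature map stops shrinking while the
# receptive field keeps growing.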
257 | layer_stride = 1 258 | layer_rate = rate 259 | rate *= stride 260 | else: 261 | layer_stride = stride 262 | layer_rate = 1 263 | current_stride *= stride 264 | # Update params. 265 | params['stride'] = layer_stride 266 | # Only insert rate to params if rate > 1 and kernel size is not [1, 1]. 267 | if layer_rate > 1: 268 | if tuple(params.get('kernel_size', [])) != (1, 1): 269 | # We will apply atrous rate in the following cases: 270 | # 1) When kernel_size is not in params, the operation then uses 271 | # default kernel size 3x3. 272 | # 2) When kernel_size is in params, and if the kernel_size is not 273 | # equal to (1, 1) (there is no need to apply atrous convolution to 274 | # any 1x1 convolution). 275 | params['rate'] = layer_rate 276 | # Set padding 277 | if use_explicit_padding: 278 | if 'kernel_size' in params: 279 | net = _fixed_padding(net, params['kernel_size'], layer_rate) 280 | else: 281 | params['use_explicit_padding'] = True 282 | 283 | end_point = 'layer_%d' % (i + 1) 284 | try: 285 | net = opdef.op(net, **params) 286 | except Exception: 287 | print('Failed to create op %i: %r params: %r' % (i, opdef, params)) 288 | raise 289 | end_points[end_point] = net 290 | scope = os.path.dirname(net.name) 291 | scopes[scope] = end_point 292 | if final_endpoint is not None and end_point == final_endpoint: 293 | break 294 | 295 | # Add all tensors that end with 'output' to 296 | # endpoints 297 | for t in net.graph.get_operations(): 298 | scope = os.path.dirname(t.name) 299 | bn = os.path.basename(t.name) 300 | if scope in scopes and t.name.endswith('output'): 301 | end_points[scopes[scope] + '/' + bn] = t.outputs[0] 302 | return net, end_points 303 | 304 | 305 | @contextlib.contextmanager 306 | def _scope_all(scope, default_scope=None): 307 | with tf.variable_scope(scope, default_name=default_scope) as s,\ 308 | tf.name_scope(s.original_name_scope): 309 | yield s 310 | 311 | 312 | @slim.add_arg_scope 313 | def mobilenet(inputs, 314 | num_classes=1001, 315 | prediction_fn=slim.softmax, 316 | reuse=None, 317 | scope='Mobilenet', 318 | base_only=False, 319 | **mobilenet_args): 320 | """Mobilenet model for classification, supports both V1 and V2. 321 | 322 | Note: default mode is inference, use mobilenet.training_scope to create 323 | training network. 324 | 325 | 326 | Args: 327 | inputs: a tensor of shape [batch_size, height, width, channels]. 328 | num_classes: number of predicted classes. If 0 or None, the logits layer 329 | is omitted and the input features to the logits layer (before dropout) 330 | are returned instead. 331 | prediction_fn: a function to get predictions out of logits 332 | (default softmax). 333 | reuse: whether or not the network and its variables should be reused. To be 334 | able to reuse 'scope' must be given. 335 | scope: Optional variable_scope. 336 | base_only: if True will only create the base of the network (no pooling 337 | and no logits). 338 | **mobilenet_args: passed to mobilenet_base verbatim. 339 | - conv_defs: list of conv defs 340 | - multiplier: Float multiplier for the depth (number of channels) 341 | for all convolution ops. The value must be greater than zero. Typical 342 | usage will be to set this value in (0, 1) to reduce the number of 343 | parameters or computation cost of the model. 344 | - output_stride: will ensure that the last layer has at most total stride. 345 | If the architecture calls for more stride than that provided 346 | (e.g. 
output_stride=16, but the architecture has 5 stride=2 operators), 347 | it will replace output_stride with fractional convolutions using Atrous 348 | Convolutions. 349 | 350 | Returns: 351 | logits: the pre-softmax activations, a tensor of size 352 | [batch_size, num_classes] 353 | end_points: a dictionary from components of the network to the corresponding 354 | activation tensor. 355 | 356 | Raises: 357 | ValueError: Input rank is invalid. 358 | """ 359 | is_training = mobilenet_args.get('is_training', False) 360 | input_shape = inputs.get_shape().as_list() 361 | if len(input_shape) != 4: 362 | raise ValueError('Expected rank 4 input, was: %d' % len(input_shape)) 363 | 364 | with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as scope: 365 | inputs = tf.identity(inputs, 'input') 366 | net, end_points = mobilenet_base(inputs, scope=scope, **mobilenet_args) 367 | if base_only: 368 | return net, end_points 369 | 370 | net = tf.identity(net, name='embedding') 371 | 372 | with tf.variable_scope('Logits'): 373 | net = global_pool(net) 374 | end_points['global_pool'] = net 375 | if not num_classes: 376 | return net, end_points 377 | net = slim.dropout(net, scope='Dropout', is_training=is_training) 378 | # 1 x 1 x num_classes 379 | # Note: legacy scope name. 380 | logits = slim.conv2d( 381 | net, 382 | num_classes, [1, 1], 383 | activation_fn=None, 384 | normalizer_fn=None, 385 | biases_initializer=tf.zeros_initializer(), 386 | scope='Conv2d_1c_1x1') 387 | 388 | logits = tf.squeeze(logits, [1, 2]) 389 | 390 | logits = tf.identity(logits, name='output') 391 | end_points['Logits'] = logits 392 | if prediction_fn: 393 | end_points['Predictions'] = prediction_fn(logits, 'Predictions') 394 | return logits, end_points 395 | 396 | 397 | def global_pool(input_tensor, pool_op=tf.nn.avg_pool): 398 | """Applies avg pool to produce 1x1 output. 399 | 400 | NOTE: This function is funcitonally equivalenet to reduce_mean, but it has 401 | baked in average pool which has better support across hardware. 402 | 403 | Args: 404 | input_tensor: input tensor 405 | pool_op: pooling op (avg pool is default) 406 | Returns: 407 | a tensor batch_size x 1 x 1 x depth. 408 | """ 409 | shape = input_tensor.get_shape().as_list() 410 | if shape[1] is None or shape[2] is None: 411 | kernel_size = tf.convert_to_tensor( 412 | [1, tf.shape(input_tensor)[1], 413 | tf.shape(input_tensor)[2], 1]) 414 | else: 415 | kernel_size = [1, shape[1], shape[2], 1] 416 | output = pool_op( 417 | input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID') 418 | # Recover output shape, for unknown shape. 419 | output.set_shape([None, 1, 1, None]) 420 | return output 421 | 422 | 423 | def training_scope(is_training=True, 424 | weight_decay=0.00004, 425 | stddev=0.09, 426 | dropout_keep_prob=0.8, 427 | bn_decay=0.997): 428 | """Defines Mobilenet training scope. 429 | 430 | Usage: 431 | with tf.contrib.slim.arg_scope(mobilenet.training_scope()): 432 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 433 | 434 | # the network created will be trainble with dropout/batch norm 435 | # initialized appropriately. 436 | Args: 437 | is_training: if set to False this will ensure that all customizations are 438 | set to non-training mode. This might be helpful for code that is reused 439 | across both training/evaluation, but most of the time training_scope with 440 | value False is not needed. If this is set to None, the parameters is not 441 | added to the batch_norm arg_scope. 
442 | 443 | weight_decay: The weight decay to use for regularizing the model. 444 | stddev: Standard deviation for initialization, if negative uses xavier. 445 | dropout_keep_prob: dropout keep probability (not set if equals to None). 446 | bn_decay: decay for the batch norm moving averages (not set if equals to 447 | None). 448 | 449 | Returns: 450 | An argument scope to use via arg_scope. 451 | """ 452 | # Note: do not introduce parameters that would change the inference 453 | # model here (for example whether to use bias), modify conv_def instead. 454 | batch_norm_params = { 455 | 'decay': bn_decay, 456 | 'is_training': is_training 457 | } 458 | if stddev < 0: 459 | weight_intitializer = slim.initializers.xavier_initializer() 460 | else: 461 | weight_intitializer = tf.truncated_normal_initializer(stddev=stddev) 462 | 463 | # Set weight_decay for weights in Conv and FC layers. 464 | with slim.arg_scope( 465 | [slim.conv2d, slim.fully_connected, slim.separable_conv2d], 466 | weights_initializer=weight_intitializer, 467 | normalizer_fn=slim.batch_norm), \ 468 | slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training),\ 469 | safe_arg_scope([slim.batch_norm], **batch_norm_params), \ 470 | safe_arg_scope([slim.dropout], is_training=is_training, 471 | keep_prob=dropout_keep_prob), \ 472 | slim.arg_scope([slim.conv2d], \ 473 | weights_regularizer=slim.l2_regularizer(weight_decay)), \ 474 | slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s: 475 | return s -------------------------------------------------------------------------------- /lib/networks/mobilenet/mobilenet_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implementation of Mobilenet V2. 16 | 17 | Architecture: https://arxiv.org/abs/1801.04381 18 | 19 | The base model gives 72.2% accuracy on ImageNet, with 300MMadds, 20 | 3.4 M parameters. 21 | """ 22 | 23 | from __future__ import absolute_import 24 | from __future__ import division 25 | from __future__ import print_function 26 | 27 | import copy 28 | import functools 29 | 30 | import tensorflow as tf 31 | from tensorflow.contrib import layers as contrib_layers 32 | from tensorflow.contrib import slim as contrib_slim 33 | 34 | from lib.networks.mobilenet import conv_blocks as ops 35 | from lib.networks.mobilenet import mobilenet as lib 36 | 37 | slim = contrib_slim 38 | op = lib.op 39 | 40 | expand_input = ops.expand_input_by_factor 41 | 42 | # pyformat: disable 43 | # Architecture: https://arxiv.org/abs/1801.04381 44 | V2_DEF = dict( 45 | defaults={ 46 | # Note: these parameters of batch norm affect the architecture 47 | # that's why they are here and not in training_scope. 
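# V2_DEF has two parts: this 'defaults' dict maps op types to the default
# arguments applied via arg_scope inside mobilenet_base, and 'spec' below is
# the ordered op list that makes up the body: a stride-2 stem conv, 17
# inverted-residual expanded_conv blocks and a final 1x1 conv to 1280
# channels.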
48 | (slim.batch_norm,): {'center': True, 'scale': True}, 49 | (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { 50 | 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 51 | }, 52 | (ops.expanded_conv,): { 53 | 'expansion_size': expand_input(6), 54 | 'split_expansion': 1, 55 | 'normalizer_fn': slim.batch_norm, 56 | 'residual': True 57 | }, 58 | (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} 59 | }, 60 | spec=[ 61 | op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]), 62 | op(ops.expanded_conv, 63 | expansion_size=expand_input(1, divisible_by=1), 64 | num_outputs=16), 65 | op(ops.expanded_conv, stride=2, num_outputs=24), 66 | op(ops.expanded_conv, stride=1, num_outputs=24), 67 | op(ops.expanded_conv, stride=2, num_outputs=32), 68 | op(ops.expanded_conv, stride=1, num_outputs=32), 69 | op(ops.expanded_conv, stride=1, num_outputs=32), 70 | op(ops.expanded_conv, stride=2, num_outputs=64), 71 | op(ops.expanded_conv, stride=1, num_outputs=64), 72 | op(ops.expanded_conv, stride=1, num_outputs=64), 73 | op(ops.expanded_conv, stride=1, num_outputs=64), 74 | op(ops.expanded_conv, stride=1, num_outputs=96), 75 | op(ops.expanded_conv, stride=1, num_outputs=96), 76 | op(ops.expanded_conv, stride=1, num_outputs=96), 77 | op(ops.expanded_conv, stride=2, num_outputs=160), 78 | op(ops.expanded_conv, stride=1, num_outputs=160), 79 | op(ops.expanded_conv, stride=1, num_outputs=160), 80 | op(ops.expanded_conv, stride=1, num_outputs=320), 81 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) 82 | ], 83 | ) 84 | # pyformat: enable 85 | 86 | # Mobilenet v2 Definition with group normalization. 87 | V2_DEF_GROUP_NORM = copy.deepcopy(V2_DEF) 88 | V2_DEF_GROUP_NORM['defaults'] = { 89 | (contrib_slim.conv2d, contrib_slim.fully_connected, 90 | contrib_slim.separable_conv2d): { 91 | 'normalizer_fn': contrib_layers.group_norm, # pylint: disable=C0330 92 | 'activation_fn': tf.nn.relu6, # pylint: disable=C0330 93 | }, # pylint: disable=C0330 94 | (ops.expanded_conv,): { 95 | 'expansion_size': ops.expand_input_by_factor(6), 96 | 'split_expansion': 1, 97 | 'normalizer_fn': contrib_layers.group_norm, 98 | 'residual': True 99 | }, 100 | (contrib_slim.conv2d, contrib_slim.separable_conv2d): { 101 | 'padding': 'SAME' 102 | } 103 | } 104 | 105 | 106 | @slim.add_arg_scope 107 | def mobilenet(input_tensor, 108 | num_classes=1001, 109 | depth_multiplier=1.0, 110 | scope='MobilenetV2', 111 | conv_defs=None, 112 | finegrain_classification_mode=False, 113 | min_depth=None, 114 | divisible_by=None, 115 | activation_fn=None, 116 | **kwargs): 117 | """Creates mobilenet V2 network. 118 | 119 | Inference mode is created by default. To create training use training_scope 120 | below. 121 | 122 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): 123 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 124 | 125 | Args: 126 | input_tensor: The input tensor 127 | num_classes: number of classes 128 | depth_multiplier: The multiplier applied to scale number of 129 | channels in each layer. 130 | scope: Scope of the operator 131 | conv_defs: Allows to override default conv def. 132 | finegrain_classification_mode: When set to True, the model 133 | will keep the last layer large even for small multipliers. Following 134 | https://arxiv.org/abs/1801.04381 135 | suggests that it improves performance for ImageNet-type of problems. 136 | *Note* ignored if final_endpoint makes the builder exit earlier. 
137 | min_depth: If provided, will ensure that all layers will have that 138 | many channels after application of depth multiplier. 139 | divisible_by: If provided will ensure that all layers # channels 140 | will be divisible by this number. 141 | activation_fn: Activation function to use, defaults to tf.nn.relu6 if not 142 | specified. 143 | **kwargs: passed directly to mobilenet.mobilenet: 144 | prediction_fn- what prediction function to use. 145 | reuse-: whether to reuse variables (if reuse set to true, scope 146 | must be given). 147 | Returns: 148 | logits/endpoints pair 149 | 150 | Raises: 151 | ValueError: On invalid arguments 152 | """ 153 | if conv_defs is None: 154 | conv_defs = V2_DEF 155 | if 'multiplier' in kwargs: 156 | raise ValueError('mobilenetv2 doesn\'t support generic ' 157 | 'multiplier parameter use "depth_multiplier" instead.') 158 | if finegrain_classification_mode: 159 | conv_defs = copy.deepcopy(conv_defs) 160 | if depth_multiplier < 1: 161 | conv_defs['spec'][-1].params['num_outputs'] /= depth_multiplier 162 | if activation_fn: 163 | conv_defs = copy.deepcopy(conv_defs) 164 | defaults = conv_defs['defaults'] 165 | conv_defaults = ( 166 | defaults[(slim.conv2d, slim.fully_connected, slim.separable_conv2d)]) 167 | conv_defaults['activation_fn'] = activation_fn 168 | 169 | depth_args = {} 170 | # NB: do not set depth_args unless they are provided to avoid overriding 171 | # whatever default depth_multiplier might have thanks to arg_scope. 172 | if min_depth is not None: 173 | depth_args['min_depth'] = min_depth 174 | if divisible_by is not None: 175 | depth_args['divisible_by'] = divisible_by 176 | 177 | with slim.arg_scope((lib.depth_multiplier,), **depth_args): 178 | return lib.mobilenet( 179 | input_tensor, 180 | num_classes=num_classes, 181 | conv_defs=conv_defs, 182 | scope=scope, 183 | multiplier=depth_multiplier, 184 | **kwargs) 185 | 186 | mobilenet.default_image_size = 224 187 | 188 | 189 | def wrapped_partial(func, *args, **kwargs): 190 | partial_func = functools.partial(func, *args, **kwargs) 191 | functools.update_wrapper(partial_func, func) 192 | return partial_func 193 | 194 | 195 | # Wrappers for mobilenet v2 with depth-multipliers. Be noticed that 196 | # 'finegrain_classification_mode' is set to True, which means the embedding 197 | # layer will not be shrinked when given a depth-multiplier < 1.0. 
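# A minimal usage sketch (shapes and names here are illustrative only):
#
#   images = tf.placeholder(tf.float32, [None, 224, 224, 3])
#   with slim.arg_scope(training_scope(is_training=True)):
#       logits, endpoints = mobilenet_v2_050(images, num_classes=1001)
#
# The wrappers below simply pre-bind depth_multiplier; only the sub-1.0
# variants enable finegrain_classification_mode, so their final 1280-wide
# embedding layer is not shrunk along with the rest of the network.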
198 | mobilenet_v2_140 = wrapped_partial(mobilenet, depth_multiplier=1.4) 199 | mobilenet_v2_050 = wrapped_partial(mobilenet, depth_multiplier=0.50, 200 | finegrain_classification_mode=True) 201 | mobilenet_v2_035 = wrapped_partial(mobilenet, depth_multiplier=0.35, 202 | finegrain_classification_mode=True) 203 | 204 | 205 | @slim.add_arg_scope 206 | def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): 207 | """Creates base of the mobilenet (no pooling and no logits) .""" 208 | return mobilenet(input_tensor, 209 | depth_multiplier=depth_multiplier, 210 | base_only=True, **kwargs) 211 | 212 | 213 | @slim.add_arg_scope 214 | def mobilenet_base_group_norm(input_tensor, depth_multiplier=1.0, **kwargs): 215 | """Creates base of the mobilenet (no pooling and no logits) .""" 216 | kwargs['conv_defs'] = V2_DEF_GROUP_NORM 217 | kwargs['conv_defs']['defaults'].update({ 218 | (contrib_layers.group_norm,): { 219 | 'groups': kwargs.pop('groups', 8) 220 | } 221 | }) 222 | return mobilenet( 223 | input_tensor, depth_multiplier=depth_multiplier, base_only=True, **kwargs) 224 | 225 | 226 | def training_scope(**kwargs): 227 | """Defines MobilenetV2 training scope. 228 | 229 | Usage: 230 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): 231 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 232 | 233 | with slim. 234 | 235 | Args: 236 | **kwargs: Passed to mobilenet.training_scope. The following parameters 237 | are supported: 238 | weight_decay- The weight decay to use for regularizing the model. 239 | stddev- Standard deviation for initialization, if negative uses xavier. 240 | dropout_keep_prob- dropout keep probability 241 | bn_decay- decay for the batch norm moving averages. 242 | 243 | Returns: 244 | An `arg_scope` to use for the mobilenet v2 model. 245 | """ 246 | return lib.training_scope(**kwargs) 247 | 248 | 249 | __all__ = ['training_scope', 'mobilenet_base', 'mobilenet', 'V2_DEF'] -------------------------------------------------------------------------------- /lib/networks/mobilenet/mobilenet_v3.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Mobilenet V3 conv defs and helper functions.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import copy 22 | import functools 23 | import numpy as np 24 | 25 | import tensorflow as tf 26 | from tensorflow.contrib import slim as contrib_slim 27 | 28 | from lib.networks.mobilenet import conv_blocks as ops 29 | from lib.networks.mobilenet import mobilenet as lib 30 | 31 | slim = contrib_slim 32 | op = lib.op 33 | expand_input = ops.expand_input_by_factor 34 | 35 | # Squeeze Excite with all parameters filled-in, we use hard-sigmoid 36 | # for gating function and relu for inner activation function. 37 | squeeze_excite = functools.partial( 38 | ops.squeeze_excite, squeeze_factor=4, 39 | inner_activation_fn=tf.nn.relu, 40 | gating_fn=lambda x: tf.nn.relu6(x+3)*0.16667) 41 | 42 | # Wrap squeeze excite op as expansion_transform that takes 43 | # both expansion and input tensor. 44 | _se4 = lambda expansion_tensor, input_tensor: squeeze_excite(expansion_tensor) 45 | 46 | 47 | def hard_swish(x): 48 | with tf.name_scope('hard_swish'): 49 | return x * tf.nn.relu6(x + np.float32(3)) * np.float32(1. / 6.) 50 | 51 | 52 | def reduce_to_1x1(input_tensor, default_size=7, **kwargs): 53 | h, w = input_tensor.shape.as_list()[1:3] 54 | if h is not None and w == h: 55 | k = [h, h] 56 | else: 57 | k = [default_size, default_size] 58 | return slim.avg_pool2d(input_tensor, kernel_size=k, **kwargs) 59 | 60 | 61 | def mbv3_op(ef, n, k, s=1, act=tf.nn.relu, se=None, **kwargs): 62 | """Defines a single Mobilenet V3 convolution block. 63 | 64 | Args: 65 | ef: expansion factor 66 | n: number of output channels 67 | k: stride of depthwise 68 | s: stride 69 | act: activation function in inner layers 70 | se: squeeze excite function. 71 | **kwargs: passed to expanded_conv 72 | 73 | Returns: 74 | An object (lib._Op) for inserting in conv_def, representing this operation. 75 | """ 76 | return op( 77 | ops.expanded_conv, 78 | expansion_size=expand_input(ef), 79 | kernel_size=(k, k), 80 | stride=s, 81 | num_outputs=n, 82 | inner_activation_fn=act, 83 | expansion_transform=se, 84 | **kwargs) 85 | 86 | 87 | def mbv3_fused(ef, n, k, s=1, **kwargs): 88 | """Defines a single Mobilenet V3 convolution block. 89 | 90 | Args: 91 | ef: expansion factor 92 | n: number of output channels 93 | k: stride of depthwise 94 | s: stride 95 | **kwargs: will be passed to mbv3_op 96 | 97 | Returns: 98 | An object (lib._Op) for inserting in conv_def, representing this operation. 
99 | """ 100 | expansion_fn = functools.partial(slim.conv2d, kernel_size=k, stride=s) 101 | return mbv3_op( 102 | ef, 103 | n, 104 | k=1, 105 | s=s, 106 | depthwise_location=None, 107 | expansion_fn=expansion_fn, 108 | **kwargs) 109 | 110 | 111 | mbv3_op_se = functools.partial(mbv3_op, se=_se4) 112 | 113 | 114 | DEFAULTS = { 115 | (ops.expanded_conv,): 116 | dict( 117 | normalizer_fn=slim.batch_norm, 118 | residual=True), 119 | (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { 120 | 'normalizer_fn': slim.batch_norm, 121 | 'activation_fn': tf.nn.relu, 122 | }, 123 | (slim.batch_norm,): { 124 | 'center': True, 125 | 'scale': True 126 | }, 127 | } 128 | 129 | # Compatible checkpoint: http://mldash/5511169891790690458#scalars 130 | V3_LARGE = dict( 131 | defaults=dict(DEFAULTS), 132 | spec=([ 133 | # stage 1 134 | op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3), 135 | activation_fn=hard_swish), 136 | mbv3_op(ef=1, n=16, k=3), 137 | mbv3_op(ef=4, n=24, k=3, s=2), 138 | mbv3_op(ef=3, n=24, k=3, s=1), 139 | mbv3_op_se(ef=3, n=40, k=5, s=2), 140 | mbv3_op_se(ef=3, n=40, k=5, s=1), 141 | mbv3_op_se(ef=3, n=40, k=5, s=1), 142 | mbv3_op(ef=6, n=80, k=3, s=2, act=hard_swish), 143 | mbv3_op(ef=2.5, n=80, k=3, s=1, act=hard_swish), 144 | mbv3_op(ef=184/80., n=80, k=3, s=1, act=hard_swish), 145 | mbv3_op(ef=184/80., n=80, k=3, s=1, act=hard_swish), 146 | mbv3_op_se(ef=6, n=112, k=3, s=1, act=hard_swish), 147 | mbv3_op_se(ef=6, n=112, k=3, s=1, act=hard_swish), 148 | mbv3_op_se(ef=6, n=160, k=5, s=2, act=hard_swish), 149 | mbv3_op_se(ef=6, n=160, k=5, s=1, act=hard_swish), 150 | mbv3_op_se(ef=6, n=160, k=5, s=1, act=hard_swish), 151 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=960, 152 | activation_fn=hard_swish), 153 | op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'), 154 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280, 155 | normalizer_fn=None, activation_fn=hard_swish) 156 | ])) 157 | 158 | # 72.2% accuracy. 
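# The "minimalistic" conv defs below mirror V3_LARGE/V3_SMALL but drop
# squeeze-excite, hard-swish and 5x5 depthwise kernels (plain ReLU and 3x3
# depthwise only), trading some ImageNet accuracy for a much simpler op set
# that maps well to a wide range of hardware.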
159 | V3_LARGE_MINIMALISTIC = dict( 160 | defaults=dict(DEFAULTS), 161 | spec=([ 162 | # stage 1 163 | op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3)), 164 | mbv3_op(ef=1, n=16, k=3), 165 | mbv3_op(ef=4, n=24, k=3, s=2), 166 | mbv3_op(ef=3, n=24, k=3, s=1), 167 | mbv3_op(ef=3, n=40, k=3, s=2), 168 | mbv3_op(ef=3, n=40, k=3, s=1), 169 | mbv3_op(ef=3, n=40, k=3, s=1), 170 | mbv3_op(ef=6, n=80, k=3, s=2), 171 | mbv3_op(ef=2.5, n=80, k=3, s=1), 172 | mbv3_op(ef=184 / 80., n=80, k=3, s=1), 173 | mbv3_op(ef=184 / 80., n=80, k=3, s=1), 174 | mbv3_op(ef=6, n=112, k=3, s=1), 175 | mbv3_op(ef=6, n=112, k=3, s=1), 176 | mbv3_op(ef=6, n=160, k=3, s=2), 177 | mbv3_op(ef=6, n=160, k=3, s=1), 178 | mbv3_op(ef=6, n=160, k=3, s=1), 179 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=960), 180 | op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'), 181 | op(slim.conv2d, 182 | stride=1, 183 | kernel_size=[1, 1], 184 | num_outputs=1280, 185 | normalizer_fn=None) 186 | ])) 187 | 188 | # Compatible run: http://mldash/2023283040014348118#scalars 189 | V3_SMALL = dict( 190 | defaults=dict(DEFAULTS), 191 | spec=([ 192 | # stage 1 193 | op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3), 194 | activation_fn=hard_swish), 195 | mbv3_op_se(ef=1, n=16, k=3, s=2), 196 | mbv3_op(ef=72./16, n=24, k=3, s=2), 197 | mbv3_op(ef=(88./24), n=24, k=3, s=1), 198 | mbv3_op_se(ef=4, n=40, k=5, s=2, act=hard_swish), 199 | mbv3_op_se(ef=6, n=40, k=5, s=1, act=hard_swish), 200 | mbv3_op_se(ef=6, n=40, k=5, s=1, act=hard_swish), 201 | mbv3_op_se(ef=3, n=48, k=5, s=1, act=hard_swish), 202 | mbv3_op_se(ef=3, n=48, k=5, s=1, act=hard_swish), 203 | mbv3_op_se(ef=6, n=96, k=5, s=2, act=hard_swish), 204 | mbv3_op_se(ef=6, n=96, k=5, s=1, act=hard_swish), 205 | mbv3_op_se(ef=6, n=96, k=5, s=1, act=hard_swish), 206 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=576, 207 | activation_fn=hard_swish), 208 | op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'), 209 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1024, 210 | normalizer_fn=None, activation_fn=hard_swish) 211 | ])) 212 | 213 | # 62% accuracy. 214 | V3_SMALL_MINIMALISTIC = dict( 215 | defaults=dict(DEFAULTS), 216 | spec=([ 217 | # stage 1 218 | op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3)), 219 | mbv3_op(ef=1, n=16, k=3, s=2), 220 | mbv3_op(ef=72. / 16, n=24, k=3, s=2), 221 | mbv3_op(ef=(88. / 24), n=24, k=3, s=1), 222 | mbv3_op(ef=4, n=40, k=3, s=2), 223 | mbv3_op(ef=6, n=40, k=3, s=1), 224 | mbv3_op(ef=6, n=40, k=3, s=1), 225 | mbv3_op(ef=3, n=48, k=3, s=1), 226 | mbv3_op(ef=3, n=48, k=3, s=1), 227 | mbv3_op(ef=6, n=96, k=3, s=2), 228 | mbv3_op(ef=6, n=96, k=3, s=1), 229 | mbv3_op(ef=6, n=96, k=3, s=1), 230 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=576), 231 | op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'), 232 | op(slim.conv2d, 233 | stride=1, 234 | kernel_size=[1, 1], 235 | num_outputs=1024, 236 | normalizer_fn=None) 237 | ])) 238 | 239 | 240 | # EdgeTPU friendly variant of MobilenetV3 that uses fused convolutions 241 | # instead of depthwise in the early layers. 
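# "Fused" here means the mbv3_fused blocks used below replace the usual
# 1x1 expansion + kxk depthwise pair with a single full kxk expansion
# convolution (depthwise_location=None and a kxk slim.conv2d as expansion_fn),
# which is generally a better fit for EdgeTPU-class accelerators than
# depthwise ops.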
242 | V3_EDGETPU = dict( 243 | defaults=dict(DEFAULTS), 244 | spec=[ 245 | op(slim.conv2d, stride=2, num_outputs=32, kernel_size=(3, 3)), 246 | mbv3_fused(k=3, s=1, ef=1, n=16), 247 | mbv3_fused(k=3, s=2, ef=8, n=32), 248 | mbv3_fused(k=3, s=1, ef=4, n=32), 249 | mbv3_fused(k=3, s=1, ef=4, n=32), 250 | mbv3_fused(k=3, s=1, ef=4, n=32), 251 | mbv3_fused(k=3, s=2, ef=8, n=48), 252 | mbv3_fused(k=3, s=1, ef=4, n=48), 253 | mbv3_fused(k=3, s=1, ef=4, n=48), 254 | mbv3_fused(k=3, s=1, ef=4, n=48), 255 | mbv3_op(k=3, s=2, ef=8, n=96), 256 | mbv3_op(k=3, s=1, ef=4, n=96), 257 | mbv3_op(k=3, s=1, ef=4, n=96), 258 | mbv3_op(k=3, s=1, ef=4, n=96), 259 | mbv3_op(k=3, s=1, ef=8, n=96, residual=False), 260 | mbv3_op(k=3, s=1, ef=4, n=96), 261 | mbv3_op(k=3, s=1, ef=4, n=96), 262 | mbv3_op(k=3, s=1, ef=4, n=96), 263 | mbv3_op(k=5, s=2, ef=8, n=160), 264 | mbv3_op(k=5, s=1, ef=4, n=160), 265 | mbv3_op(k=5, s=1, ef=4, n=160), 266 | mbv3_op(k=5, s=1, ef=4, n=160), 267 | mbv3_op(k=3, s=1, ef=8, n=192), 268 | op(slim.conv2d, stride=1, num_outputs=1280, kernel_size=(1, 1)), 269 | ]) 270 | 271 | 272 | @slim.add_arg_scope 273 | def mobilenet(input_tensor, 274 | num_classes=1001, 275 | depth_multiplier=1.0, 276 | scope='MobilenetV3', 277 | conv_defs=None, 278 | finegrain_classification_mode=False, 279 | **kwargs): 280 | """Creates mobilenet V3 network. 281 | 282 | Inference mode is created by default. To create training use training_scope 283 | below. 284 | 285 | with tf.contrib.slim.arg_scope(mobilenet_v3.training_scope()): 286 | logits, endpoints = mobilenet_v3.mobilenet(input_tensor) 287 | 288 | Args: 289 | input_tensor: The input tensor 290 | num_classes: number of classes 291 | depth_multiplier: The multiplier applied to scale number of 292 | channels in each layer. 293 | scope: Scope of the operator 294 | conv_defs: Which version to create. Could be large/small or 295 | any conv_def (see mobilenet_v3.py for examples). 296 | finegrain_classification_mode: When set to True, the model 297 | will keep the last layer large even for small multipliers. Following 298 | https://arxiv.org/abs/1801.04381 299 | it improves performance for ImageNet-type of problems. 300 | *Note* ignored if final_endpoint makes the builder exit earlier. 301 | **kwargs: passed directly to mobilenet.mobilenet: 302 | prediction_fn- what prediction function to use. 303 | reuse-: whether to reuse variables (if reuse set to true, scope 304 | must be given). 
305 | Returns: 306 | logits/endpoints pair 307 | 308 | Raises: 309 | ValueError: On invalid arguments 310 | """ 311 | if conv_defs is None: 312 | conv_defs = V3_LARGE 313 | if 'multiplier' in kwargs: 314 | raise ValueError('mobilenetv2 doesn\'t support generic ' 315 | 'multiplier parameter use "depth_multiplier" instead.') 316 | if finegrain_classification_mode: 317 | conv_defs = copy.deepcopy(conv_defs) 318 | conv_defs['spec'][-1] = conv_defs['spec'][-1]._replace( 319 | multiplier_func=lambda params, multiplier: params) 320 | depth_args = {} 321 | with slim.arg_scope((lib.depth_multiplier,), **depth_args): 322 | return lib.mobilenet( 323 | input_tensor, 324 | num_classes=num_classes, 325 | conv_defs=conv_defs, 326 | scope=scope, 327 | multiplier=depth_multiplier, 328 | **kwargs) 329 | 330 | mobilenet.default_image_size = 224 331 | training_scope = lib.training_scope 332 | 333 | 334 | @slim.add_arg_scope 335 | def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): 336 | """Creates base of the mobilenet (no pooling and no logits) .""" 337 | return mobilenet( 338 | input_tensor, depth_multiplier=depth_multiplier, base_only=True, **kwargs) 339 | 340 | 341 | def wrapped_partial(func, new_defaults=None, 342 | **kwargs): 343 | """Partial function with new default parameters and updated docstring.""" 344 | if not new_defaults: 345 | new_defaults = {} 346 | def func_wrapper(*f_args, **f_kwargs): 347 | new_kwargs = dict(new_defaults) 348 | new_kwargs.update(f_kwargs) 349 | return func(*f_args, **new_kwargs) 350 | functools.update_wrapper(func_wrapper, func) 351 | partial_func = functools.partial(func_wrapper, **kwargs) 352 | functools.update_wrapper(partial_func, func) 353 | return partial_func 354 | 355 | 356 | large = wrapped_partial(mobilenet, conv_defs=V3_LARGE) 357 | small = wrapped_partial(mobilenet, conv_defs=V3_SMALL) 358 | edge_tpu = wrapped_partial(mobilenet, 359 | new_defaults={'scope': 'MobilenetEdgeTPU'}, 360 | conv_defs=V3_EDGETPU) 361 | 362 | # Minimalistic model that does not have Squeeze Excite blocks, 363 | # Hardswish, or 5x5 depthwise convolution. 364 | # This makes the model very friendly for a wide range of hardware 365 | large_minimalistic = wrapped_partial(mobilenet, conv_defs=V3_LARGE_MINIMALISTIC) 366 | small_minimalistic = wrapped_partial(mobilenet, conv_defs=V3_SMALL_MINIMALISTIC) 367 | 368 | 369 | def _reduce_consecutive_layers(conv_defs, start_id, end_id, multiplier=0.5): 370 | """Reduce the outputs of consecutive layers with multiplier. 371 | 372 | Args: 373 | conv_defs: Mobilenet conv_defs. 374 | start_id: 0-based index of the starting conv_def to be reduced. 375 | end_id: 0-based index of the last conv_def to be reduced. 376 | multiplier: The multiplier by which to reduce the conv_defs. 377 | 378 | Returns: 379 | Mobilenet conv_defs where the output sizes from layers [start_id, end_id], 380 | inclusive, are reduced by multiplier. 381 | 382 | Raises: 383 | ValueError if any layer to be reduced does not have the 'num_outputs' 384 | attribute. 
385 | """ 386 | defs = copy.deepcopy(conv_defs) 387 | for d in defs['spec'][start_id:end_id+1]: 388 | d.params.update({ 389 | 'num_outputs': np.int(np.round(d.params['num_outputs'] * multiplier)) 390 | }) 391 | return defs 392 | 393 | 394 | V3_LARGE_DETECTION = _reduce_consecutive_layers(V3_LARGE, 13, 16) 395 | V3_SMALL_DETECTION = _reduce_consecutive_layers(V3_SMALL, 9, 12) 396 | 397 | 398 | __all__ = ['training_scope', 'mobilenet', 'V3_LARGE', 'V3_SMALL', 'large', 399 | 'small', 'V3_LARGE_DETECTION', 'V3_SMALL_DETECTION'] -------------------------------------------------------------------------------- /lib/networks/model.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import tensorflow as tf 3 | from tensorflow.contrib import slim 4 | from db_config import cfg 5 | 6 | import lib.networks.resnet.resnet_v1 as resnet_v1 7 | import lib.networks.resnet.resnet_v1_tiny as resnet_v1_tiny 8 | 9 | 10 | def unpool(inputs, ratio=2): 11 | return tf.image.resize_bilinear(inputs, size=[tf.shape(inputs)[1] * ratio, tf.shape(inputs)[2] * ratio]) 12 | 13 | 14 | def mean_image_subtraction(images, means=[123.68, 116.78, 103.94]): 15 | ''' 16 | :param images: 17 | :param means: 18 | :return: 19 | ''' 20 | num_channels = images.get_shape().as_list()[-1] 21 | if len(means) != num_channels: 22 | raise ValueError('len(means) must match the number of channels') 23 | channels = tf.split(axis=3, num_or_size_splits=num_channels, value=images) 24 | for i in range(num_channels): 25 | channels[i] -= means[i] 26 | return tf.concat(axis=3, values=channels) 27 | 28 | def backbone(input, weight_decay, is_training, backbone_name=cfg.BACKBONE): 29 | # ['resnet_v1_50', 'resnet_v1_18', 'resnet_v2_50', 'resnet_v2_18', 'mobilenet_v2', 'mobilenet_v3'] 30 | 31 | if backbone_name == 'resnet_v1_50': 32 | with slim.arg_scope(resnet_v1.resnet_arg_scope(weight_decay=weight_decay)): 33 | logits, end_points = resnet_v1.resnet_v1_50(input, is_training=is_training, scope=backbone_name) 34 | return logits, end_points 35 | elif backbone_name == 'resnet_v1_18': 36 | with slim.arg_scope(resnet_v1_tiny.resnet_arg_scope(weight_decay=weight_decay)): 37 | logits, end_points = resnet_v1_tiny.resnet_v1_18(input, is_training=is_training, scope=backbone_name) 38 | return logits, end_points 39 | else: 40 | print('{} is error backbone name, not support!'.format(backbone_name)) 41 | assert 0 42 | 43 | 44 | def model(images, weight_decay=1e-5, is_training=True): 45 | """ 46 | resnet-50 47 | :param images: 48 | :param weight_decay: 49 | :param is_training: 50 | :return: 51 | """ 52 | 53 | images = mean_image_subtraction(images) 54 | 55 | logits, end_points = backbone(images, weight_decay, is_training) 56 | 57 | with tf.variable_scope('feature_fusion', values=[end_points.values]): 58 | batch_norm_params = {'decay': cfg["TRAIN"]["MOVING_AVERAGE_DECAY"], 59 | 'epsilon': 1e-5, 60 | 'scale': True, 61 | 'is_training': is_training} 62 | 63 | with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], 64 | activation_fn=tf.nn.relu, 65 | normalizer_fn=slim.batch_norm, 66 | normalizer_params=batch_norm_params, 67 | weights_regularizer=slim.l2_regularizer(weight_decay)): 68 | 69 | f = [end_points['pool5'], end_points['pool4'], 70 | end_points['pool3'], end_points['pool2']] 71 | 72 | g = [None, None, None, None] 73 | h = [None, None, None, None] 74 | 75 | num_outputs = [None, 128, 64, 32] 76 | 77 | # size = K+(K-1)*(r-1) 78 | if cfg.ASPP_LAYER: 79 | with tf.variable_scope('aspp_layer'): 80 | f_32x = f[0] 81 | 
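# Atrous Spatial Pyramid Pooling on the 1/32 backbone feature map: four
# parallel branches (1x1, 3x3, 3x3 rate=3, 3x3 rate=6) are concatenated and
# fused back to 2048 channels. By the size = K + (K-1)*(r-1) rule above, the
# dilated 3x3 branches have effective kernel sizes of 7x7 (rate=3) and
# 13x13 (rate=6) without adding parameters relative to a plain 3x3 conv.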
f_32x_1 = slim.conv2d(f_32x, 256, 1) 82 | f_32x_2 = slim.conv2d(f_32x, 256, 3) 83 | f_32x_3 = slim.conv2d(f_32x, 256, 3, rate=3) 84 | f_32x_4 = slim.conv2d(f_32x, 256, 3, rate=6) 85 | aspp_32x = tf.concat([f_32x_1, f_32x_2, f_32x_3, f_32x_4], axis=-1) 86 | f[0] = slim.conv2d(aspp_32x, 2048, 1) 87 | 88 | for i in range(len(f)): 89 | if i == 0: 90 | h[i] = f[i] 91 | else: 92 | c1_1 = slim.conv2d(tf.concat([g[i - 1], f[i]], axis=-1), num_outputs[i], 1) 93 | h[i] = slim.conv2d(c1_1, num_outputs[i], 3) 94 | if i <= 2: 95 | g[i] = unpool(h[i]) 96 | else: 97 | g[i] = slim.conv2d(h[i], num_outputs[i], 3) 98 | 99 | with tf.variable_scope('concat_branch'): 100 | features = [g[3], h[2], h[1], h[0]] 101 | 102 | concat_feature = None 103 | 104 | for i, f in enumerate(features): 105 | if i is 0: 106 | conv_f = slim.conv2d(f, 64, 3) 107 | concat_feature = conv_f 108 | else: 109 | up_f = slim.conv2d(f, 64, 3) 110 | up_f = unpool(up_f, 2**i) 111 | concat_feature = tf.concat([concat_feature, up_f], axis=-1) 112 | 113 | final_f = slim.conv2d(concat_feature, 64, 3) 114 | 115 | with tf.variable_scope('binarize_branch'): 116 | b_conv = slim.conv2d(final_f, 64, 3) 117 | b_conv = slim.conv2d_transpose(b_conv, 64, 2, 2) 118 | binarize_map = slim.conv2d_transpose(b_conv, 1, 2, 2, activation_fn=tf.nn.sigmoid) 119 | 120 | with tf.variable_scope('threshold_branch'): 121 | b_conv = slim.conv2d(final_f, 64, 3) 122 | b_conv = slim.conv2d_transpose(b_conv, 256, 2, 2) 123 | threshold_map = slim.conv2d_transpose(b_conv, 1, 2, 2, activation_fn=tf.nn.sigmoid) 124 | 125 | with tf.variable_scope('thresh_binary_branch'): 126 | thresh_binary = tf.reciprocal(1 + tf.exp(-cfg.K * (binarize_map-threshold_map)), name='thresh_binary') 127 | 128 | return binarize_map, threshold_map, thresh_binary 129 | 130 | -------------------------------------------------------------------------------- /lib/networks/resnet/resnet_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains building blocks for various versions of Residual Networks. 16 | Residual networks (ResNets) were proposed in: 17 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 18 | Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015 19 | More variants were introduced in: 20 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 21 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016 22 | We can obtain different ResNet variants by changing the network depth, width, 23 | and form of residual unit. This module implements the infrastructure for 24 | building them. Concrete ResNet units and full ResNet networks are implemented in 25 | the accompanying resnet_v1.py and resnet_v2.py modules. 
26 | Compared to https://github.com/KaimingHe/deep-residual-networks, in the current 27 | implementation we subsample the output activations in the last residual unit of 28 | each block, instead of subsampling the input activations in the first residual 29 | unit of each block. The two implementations give identical results but our 30 | implementation is more memory efficient. 31 | """ 32 | from __future__ import absolute_import 33 | from __future__ import division 34 | from __future__ import print_function 35 | 36 | import collections 37 | import tensorflow as tf 38 | from tensorflow.contrib import slim as contrib_slim 39 | 40 | slim = contrib_slim 41 | 42 | 43 | class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])): 44 | """A named tuple describing a ResNet block. 45 | Its parts are: 46 | scope: The scope of the `Block`. 47 | unit_fn: The ResNet unit function which takes as input a `Tensor` and 48 | returns another `Tensor` with the output of the ResNet unit. 49 | args: A list of length equal to the number of units in the `Block`. The list 50 | contains one (depth, depth_bottleneck, stride) tuple for each unit in the 51 | block to serve as argument to unit_fn. 52 | """ 53 | 54 | 55 | def subsample(inputs, factor, scope=None): 56 | """Subsamples the input along the spatial dimensions. 57 | Args: 58 | inputs: A `Tensor` of size [batch, height_in, width_in, channels]. 59 | factor: The subsampling factor. 60 | scope: Optional variable_scope. 61 | Returns: 62 | output: A `Tensor` of size [batch, height_out, width_out, channels] with the 63 | input, either intact (if factor == 1) or subsampled (if factor > 1). 64 | """ 65 | if factor == 1: 66 | return inputs 67 | else: 68 | return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) 69 | 70 | 71 | def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): 72 | """Strided 2-D convolution with 'SAME' padding. 73 | When stride > 1, then we do explicit zero-padding, followed by conv2d with 74 | 'VALID' padding. 75 | Note that 76 | net = conv2d_same(inputs, num_outputs, 3, stride=stride) 77 | is equivalent to 78 | net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') 79 | net = subsample(net, factor=stride) 80 | whereas 81 | net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') 82 | is different when the input's height or width is even, which is why we add the 83 | current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). 84 | Args: 85 | inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. 86 | num_outputs: An integer, the number of output filters. 87 | kernel_size: An int with the kernel_size of the filters. 88 | stride: An integer, the output stride. 89 | rate: An integer, rate for atrous convolution. 90 | scope: Scope. 91 | Returns: 92 | output: A 4-D tensor of size [batch, height_out, width_out, channels] with 93 | the convolution output. 
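  For example (illustrative), with kernel_size=7, rate=1 and stride=2 the code
  below computes kernel_size_effective = 7, pad_total = 6, pad_beg = 3 and
  pad_end = 3, zero-pads the input by 3 pixels on each spatial border, and then
  applies the stride-2 convolution with 'VALID' padding; this is the path taken
  by the ResNet root convolution conv2d_same(net, 64, 7, stride=2).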
94 | """ 95 | if stride == 1: 96 | return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate, 97 | padding='SAME', scope=scope) 98 | else: 99 | kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) 100 | pad_total = kernel_size_effective - 1 101 | pad_beg = pad_total // 2 102 | pad_end = pad_total - pad_beg 103 | inputs = tf.pad(inputs, 104 | [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) 105 | return slim.conv2d(inputs, num_outputs, kernel_size, stride=stride, 106 | rate=rate, padding='VALID', scope=scope) 107 | 108 | 109 | @slim.add_arg_scope 110 | def stack_blocks_dense(net, blocks, output_stride=None, 111 | store_non_strided_activations=False, 112 | outputs_collections=None): 113 | """Stacks ResNet `Blocks` and controls output feature density. 114 | First, this function creates scopes for the ResNet in the form of 115 | 'block_name/unit_1', 'block_name/unit_2', etc. 116 | Second, this function allows the user to explicitly control the ResNet 117 | output_stride, which is the ratio of the input to output spatial resolution. 118 | This is useful for dense prediction tasks such as semantic segmentation or 119 | object detection. 120 | Most ResNets consist of 4 ResNet blocks and subsample the activations by a 121 | factor of 2 when transitioning between consecutive ResNet blocks. This results 122 | to a nominal ResNet output_stride equal to 8. If we set the output_stride to 123 | half the nominal network stride (e.g., output_stride=4), then we compute 124 | responses twice. 125 | Control of the output feature density is implemented by atrous convolution. 126 | Args: 127 | net: A `Tensor` of size [batch, height, width, channels]. 128 | blocks: A list of length equal to the number of ResNet `Blocks`. Each 129 | element is a ResNet `Block` object describing the units in the `Block`. 130 | output_stride: If `None`, then the output will be computed at the nominal 131 | network stride. If output_stride is not `None`, it specifies the requested 132 | ratio of input to output spatial resolution, which needs to be equal to 133 | the product of unit strides from the start up to some level of the ResNet. 134 | For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1, 135 | then valid values for the output_stride are 1, 2, 6, 24 or None (which 136 | is equivalent to output_stride=24). 137 | store_non_strided_activations: If True, we compute non-strided (undecimated) 138 | activations at the last unit of each block and store them in the 139 | `outputs_collections` before subsampling them. This gives us access to 140 | higher resolution intermediate activations which are useful in some 141 | dense prediction problems but increases 4x the computation and memory cost 142 | at the last unit of each block. 143 | outputs_collections: Collection to add the ResNet block outputs. 144 | Returns: 145 | net: Output tensor with stride equal to the specified output_stride. 146 | Raises: 147 | ValueError: If the target output_stride is not valid. 148 | """ 149 | # The current_stride variable keeps track of the effective stride of the 150 | # activations. This allows us to invoke atrous convolution whenever applying 151 | # the next residual unit would result in the activations having stride larger 152 | # than the target output_stride. 153 | current_stride = 1 154 | 155 | # The atrous convolution rate parameter. 
156 | rate = 1 157 | 158 | for block in blocks: 159 | with tf.variable_scope(block.scope, 'block', [net]) as sc: 160 | block_stride = 1 161 | for i, unit in enumerate(block.args): 162 | if store_non_strided_activations and i == len(block.args) - 1: 163 | # Move stride from the block's last unit to the end of the block. 164 | block_stride = unit.get('stride', 1) 165 | unit = dict(unit, stride=1) 166 | 167 | with tf.variable_scope('unit_%d' % (i + 1), values=[net]): 168 | # If we have reached the target output_stride, then we need to employ 169 | # atrous convolution with stride=1 and multiply the atrous rate by the 170 | # current unit's stride for use in subsequent layers. 171 | if output_stride is not None and current_stride == output_stride: 172 | net = block.unit_fn(net, rate=rate, **dict(unit, stride=1)) 173 | rate *= unit.get('stride', 1) 174 | 175 | else: 176 | net = block.unit_fn(net, rate=1, **unit) 177 | current_stride *= unit.get('stride', 1) 178 | if output_stride is not None and current_stride > output_stride: 179 | raise ValueError('The target output_stride cannot be reached.') 180 | 181 | # Collect activations at the block's end before performing subsampling. 182 | net = slim.utils.collect_named_outputs(outputs_collections, sc.name, net) 183 | 184 | # Subsampling of the block's output activations. 185 | if output_stride is not None and current_stride == output_stride: 186 | rate *= block_stride 187 | else: 188 | net = subsample(net, block_stride) 189 | current_stride *= block_stride 190 | if output_stride is not None and current_stride > output_stride: 191 | raise ValueError('The target output_stride cannot be reached.') 192 | 193 | if output_stride is not None and current_stride != output_stride: 194 | raise ValueError('The target output_stride cannot be reached.') 195 | 196 | return net 197 | 198 | 199 | def resnet_arg_scope(weight_decay=0.0001, 200 | batch_norm_decay=0.997, 201 | batch_norm_epsilon=1e-5, 202 | batch_norm_scale=True, 203 | activation_fn=tf.nn.relu, 204 | use_batch_norm=True, 205 | batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS): 206 | """Defines the default ResNet arg scope. 207 | TODO(gpapan): The batch-normalization related default values above are 208 | appropriate for use in conjunction with the reference ResNet models 209 | released at https://github.com/KaimingHe/deep-residual-networks. When 210 | training ResNets from scratch, they might need to be tuned. 211 | Args: 212 | weight_decay: The weight decay to use for regularizing the model. 213 | batch_norm_decay: The moving average decay when estimating layer activation 214 | statistics in batch normalization. 215 | batch_norm_epsilon: Small constant to prevent division by zero when 216 | normalizing activations by their variance in batch normalization. 217 | batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the 218 | activations in the batch normalization layer. 219 | activation_fn: The activation function which is used in ResNet. 220 | use_batch_norm: Whether or not to use batch normalization. 221 | batch_norm_updates_collections: Collection for the update ops for 222 | batch norm. 223 | Returns: 224 | An `arg_scope` to use for the resnet models. 225 | """ 226 | batch_norm_params = { 227 | 'decay': batch_norm_decay, 228 | 'epsilon': batch_norm_epsilon, 229 | 'scale': batch_norm_scale, 230 | 'updates_collections': batch_norm_updates_collections, 231 | 'fused': None, # Use fused batch norm if possible. 
232 | } 233 | 234 | with slim.arg_scope( 235 | [slim.conv2d], 236 | weights_regularizer=slim.l2_regularizer(weight_decay), 237 | weights_initializer=slim.variance_scaling_initializer(), 238 | activation_fn=activation_fn, 239 | normalizer_fn=slim.batch_norm if use_batch_norm else None, 240 | normalizer_params=batch_norm_params): 241 | with slim.arg_scope([slim.batch_norm], **batch_norm_params): 242 | # The following implies padding='SAME' for pool1, which makes feature 243 | # alignment easier for dense prediction tasks. This is also used in 244 | # https://github.com/facebook/fb.resnet.torch. However the accompanying 245 | # code of 'Deep Residual Learning for Image Recognition' uses 246 | # padding='VALID' for pool1. You can switch to that choice by setting 247 | # slim.arg_scope([slim.max_pool2d], padding='VALID'). 248 | with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc: 249 | return arg_sc -------------------------------------------------------------------------------- /lib/networks/resnet/resnet_v1.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains definitions for the original form of Residual Networks. 16 | The 'v1' residual networks (ResNets) implemented in this module were proposed 17 | by: 18 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 19 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 20 | Other variants were introduced in: 21 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 22 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 23 | The networks defined in this module utilize the bottleneck building block of 24 | [1] with projection shortcuts only for increasing depths. They employ batch 25 | normalization *after* every weight layer. This is the architecture used by 26 | MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and 27 | ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' 28 | architecture and the alternative 'v2' architecture of [2] which uses batch 29 | normalization *before* every weight layer in the so-called full pre-activation 30 | units. 
31 | Typical use: 32 | from tensorflow.contrib.slim.nets import resnet_v1 33 | ResNet-101 for image classification into 1000 classes: 34 | # inputs has shape [batch, 224, 224, 3] 35 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 36 | net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) 37 | ResNet-101 for semantic segmentation into 21 classes: 38 | # inputs has shape [batch, 513, 513, 3] 39 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 40 | net, end_points = resnet_v1.resnet_v1_101(inputs, 41 | 21, 42 | is_training=False, 43 | global_pool=False, 44 | output_stride=16) 45 | """ 46 | from __future__ import absolute_import 47 | from __future__ import division 48 | from __future__ import print_function 49 | 50 | import tensorflow as tf 51 | from tensorflow.contrib import slim as contrib_slim 52 | 53 | from lib.networks.resnet import resnet_utils 54 | 55 | 56 | resnet_arg_scope = resnet_utils.resnet_arg_scope 57 | slim = contrib_slim 58 | 59 | 60 | class NoOpScope(object): 61 | """No-op context manager.""" 62 | 63 | def __enter__(self): 64 | return None 65 | 66 | def __exit__(self, exc_type, exc_value, traceback): 67 | return False 68 | 69 | 70 | @slim.add_arg_scope 71 | def bottleneck(inputs, 72 | depth, 73 | depth_bottleneck, 74 | stride, 75 | rate=1, 76 | outputs_collections=None, 77 | scope=None, 78 | use_bounded_activations=False): 79 | """Bottleneck residual unit variant with BN after convolutions. 80 | This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for 81 | its definition. Note that we use here the bottleneck variant which has an 82 | extra bottleneck layer. 83 | When putting together two consecutive ResNet blocks that use this unit, one 84 | should use stride = 2 in the last unit of the first block. 85 | Args: 86 | inputs: A tensor of size [batch, height, width, channels]. 87 | depth: The depth of the ResNet unit output. 88 | depth_bottleneck: The depth of the bottleneck layers. 89 | stride: The ResNet unit's stride. Determines the amount of downsampling of 90 | the units output compared to its input. 91 | rate: An integer, rate for atrous convolution. 92 | outputs_collections: Collection to add the ResNet unit output. 93 | scope: Optional variable_scope. 94 | use_bounded_activations: Whether or not to use bounded activations. Bounded 95 | activations better lend themselves to quantized inference. 96 | Returns: 97 | The ResNet unit's output. 98 | """ 99 | with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: 100 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) 101 | if depth == depth_in: 102 | shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') 103 | else: 104 | shortcut = slim.conv2d( 105 | inputs, 106 | depth, [1, 1], 107 | stride=stride, 108 | activation_fn=tf.nn.relu6 if use_bounded_activations else None, 109 | scope='shortcut') 110 | 111 | residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1, 112 | scope='conv1') 113 | residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride, 114 | rate=rate, scope='conv2') 115 | residual = slim.conv2d(residual, depth, [1, 1], stride=1, 116 | activation_fn=None, scope='conv3') 117 | 118 | if use_bounded_activations: 119 | # Use clip_by_value to simulate bandpass activation. 
120 | residual = tf.clip_by_value(residual, -6.0, 6.0) 121 | output = tf.nn.relu6(shortcut + residual) 122 | else: 123 | output = tf.nn.relu(shortcut + residual) 124 | 125 | return slim.utils.collect_named_outputs(outputs_collections, 126 | sc.name, 127 | output) 128 | 129 | 130 | def resnet_v1(inputs, 131 | blocks, 132 | num_classes=None, 133 | is_training=True, 134 | global_pool=True, 135 | output_stride=None, 136 | include_root_block=True, 137 | spatial_squeeze=True, 138 | store_non_strided_activations=False, 139 | reuse=None, 140 | scope=None): 141 | """Generator for v1 ResNet models. 142 | This function generates a family of ResNet v1 models. See the resnet_v1_*() 143 | methods for specific model instantiations, obtained by selecting different 144 | block instantiations that produce ResNets of various depths. 145 | Training for image classification on Imagenet is usually done with [224, 224] 146 | inputs, resulting in [7, 7] feature maps at the output of the last ResNet 147 | block for the ResNets defined in [1] that have nominal stride equal to 32. 148 | However, for dense prediction tasks we advise that one uses inputs with 149 | spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In 150 | this case the feature maps at the ResNet output will have spatial shape 151 | [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] 152 | and corners exactly aligned with the input image corners, which greatly 153 | facilitates alignment of the features to the image. Using as input [225, 225] 154 | images results in [8, 8] feature maps at the output of the last ResNet block. 155 | For dense prediction tasks, the ResNet needs to run in fully-convolutional 156 | (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all 157 | have nominal stride equal to 32 and a good choice in FCN mode is to use 158 | output_stride=16 in order to increase the density of the computed features at 159 | small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. 160 | Args: 161 | inputs: A tensor of size [batch, height_in, width_in, channels]. 162 | blocks: A list of length equal to the number of ResNet blocks. Each element 163 | is a resnet_utils.Block object describing the units in the block. 164 | num_classes: Number of predicted classes for classification tasks. 165 | If 0 or None, we return the features before the logit layer. 166 | is_training: whether batch_norm layers are in training mode. If this is set 167 | to None, the callers can specify slim.batch_norm's is_training parameter 168 | from an outer slim.arg_scope. 169 | global_pool: If True, we perform global average pooling before computing the 170 | logits. Set to True for image classification, False for dense prediction. 171 | output_stride: If None, then the output will be computed at the nominal 172 | network stride. If output_stride is not None, it specifies the requested 173 | ratio of input to output spatial resolution. 174 | include_root_block: If True, include the initial convolution followed by 175 | max-pooling, if False excludes it. 176 | spatial_squeeze: if True, logits is of shape [B, C], if false logits is 177 | of shape [B, 1, 1, C], where B is batch_size and C is number of classes. 178 | To use this parameter, the input images must be smaller than 300x300 179 | pixels, in which case the output logit layer does not contain spatial 180 | information and can be removed. 
181 | store_non_strided_activations: If True, we compute non-strided (undecimated) 182 | activations at the last unit of each block and store them in the 183 | `outputs_collections` before subsampling them. This gives us access to 184 | higher resolution intermediate activations which are useful in some 185 | dense prediction problems but increases 4x the computation and memory cost 186 | at the last unit of each block. 187 | reuse: whether or not the network and its variables should be reused. To be 188 | able to reuse 'scope' must be given. 189 | scope: Optional variable_scope. 190 | Returns: 191 | net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. 192 | If global_pool is False, then height_out and width_out are reduced by a 193 | factor of output_stride compared to the respective height_in and width_in, 194 | else both height_out and width_out equal one. If num_classes is 0 or None, 195 | then net is the output of the last ResNet block, potentially after global 196 | average pooling. If num_classes a non-zero integer, net contains the 197 | pre-softmax activations. 198 | end_points: A dictionary from components of the network to the corresponding 199 | activation. 200 | Raises: 201 | ValueError: If the target output_stride is not valid. 202 | """ 203 | with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: 204 | end_points_collection = sc.original_name_scope + '_end_points' 205 | with slim.arg_scope([slim.conv2d, bottleneck, 206 | resnet_utils.stack_blocks_dense], 207 | outputs_collections=end_points_collection): 208 | with (slim.arg_scope([slim.batch_norm], is_training=is_training) 209 | if is_training is not None else NoOpScope()): 210 | net = inputs 211 | if include_root_block: 212 | if output_stride is not None: 213 | if output_stride % 4 != 0: 214 | raise ValueError('The output_stride needs to be a multiple of 4.') 215 | output_stride /= 4 216 | net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1') 217 | net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') 218 | net = slim.utils.collect_named_outputs(end_points_collection, 'pool2', net) 219 | net = resnet_utils.stack_blocks_dense(net, blocks, output_stride, 220 | store_non_strided_activations) 221 | # Convert end_points_collection into a dictionary of end_points. 222 | end_points = slim.utils.convert_collection_to_dict( 223 | end_points_collection) 224 | 225 | try: 226 | end_points['pool3'] = end_points[scope + '/block1'] 227 | end_points['pool4'] = end_points[scope + '/block2'] 228 | except: 229 | end_points['pool3'] = end_points['Detection/' + scope + '/block1'] 230 | end_points['pool4'] = end_points['Detection/' + scope + '/block1'] 231 | end_points['pool5'] = net 232 | 233 | return net, end_points 234 | resnet_v1.default_image_size = 224 235 | 236 | 237 | def resnet_v1_block(scope, base_depth, num_units, stride): 238 | """Helper function for creating a resnet_v1 bottleneck block. 239 | Args: 240 | scope: The scope of the block. 241 | base_depth: The depth of the bottleneck layer for each unit. 242 | num_units: The number of units in the block. 243 | stride: The stride of the block, implemented as a stride in the last unit. 244 | All other units have stride=1. 245 | Returns: 246 | A resnet_v1 bottleneck block. 
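  For example (illustrative), resnet_v1_block('block2', base_depth=128,
  num_units=4, stride=2) returns a Block whose args are three units of
  {'depth': 512, 'depth_bottleneck': 128, 'stride': 1} followed by a single
  unit of {'depth': 512, 'depth_bottleneck': 128, 'stride': 2}, matching the
  resnet_v1_50 definition below.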
247 | """ 248 | return resnet_utils.Block(scope, bottleneck, [{ 249 | 'depth': base_depth * 4, 250 | 'depth_bottleneck': base_depth, 251 | 'stride': 1 252 | }] * (num_units - 1) + [{ 253 | 'depth': base_depth * 4, 254 | 'depth_bottleneck': base_depth, 255 | 'stride': stride 256 | }]) 257 | 258 | 259 | def resnet_v1_50(inputs, 260 | num_classes=None, 261 | is_training=True, 262 | global_pool=True, 263 | output_stride=None, 264 | spatial_squeeze=True, 265 | store_non_strided_activations=False, 266 | min_base_depth=8, 267 | depth_multiplier=1, 268 | reuse=None, 269 | scope='resnet_v1_50'): 270 | """ResNet-50 model of [1]. See resnet_v1() for arg and return description.""" 271 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 272 | blocks = [ 273 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 274 | stride=2), 275 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=4, 276 | stride=2), 277 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=6, 278 | stride=2), 279 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 280 | stride=1), 281 | ] 282 | return resnet_v1(inputs, blocks, num_classes, is_training, 283 | global_pool=global_pool, output_stride=output_stride, 284 | include_root_block=True, spatial_squeeze=spatial_squeeze, 285 | store_non_strided_activations=store_non_strided_activations, 286 | reuse=reuse, scope=scope) 287 | resnet_v1_50.default_image_size = resnet_v1.default_image_size 288 | 289 | 290 | def resnet_v1_101(inputs, 291 | num_classes=None, 292 | is_training=True, 293 | global_pool=True, 294 | output_stride=None, 295 | spatial_squeeze=True, 296 | store_non_strided_activations=False, 297 | min_base_depth=8, 298 | depth_multiplier=1, 299 | reuse=None, 300 | scope='resnet_v1_101'): 301 | """ResNet-101 model of [1]. See resnet_v1() for arg and return description.""" 302 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 303 | blocks = [ 304 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 305 | stride=2), 306 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=4, 307 | stride=2), 308 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=23, 309 | stride=2), 310 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 311 | stride=1), 312 | ] 313 | return resnet_v1(inputs, blocks, num_classes, is_training, 314 | global_pool=global_pool, output_stride=output_stride, 315 | include_root_block=True, spatial_squeeze=spatial_squeeze, 316 | store_non_strided_activations=store_non_strided_activations, 317 | reuse=reuse, scope=scope) 318 | resnet_v1_101.default_image_size = resnet_v1.default_image_size 319 | 320 | 321 | def resnet_v1_152(inputs, 322 | num_classes=None, 323 | is_training=True, 324 | global_pool=True, 325 | output_stride=None, 326 | store_non_strided_activations=False, 327 | spatial_squeeze=True, 328 | min_base_depth=8, 329 | depth_multiplier=1, 330 | reuse=None, 331 | scope='resnet_v1_152'): 332 | """ResNet-152 model of [1]. 
See resnet_v1() for arg and return description.""" 333 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 334 | blocks = [ 335 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 336 | stride=2), 337 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=8, 338 | stride=2), 339 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=36, 340 | stride=2), 341 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 342 | stride=1), 343 | ] 344 | return resnet_v1(inputs, blocks, num_classes, is_training, 345 | global_pool=global_pool, output_stride=output_stride, 346 | include_root_block=True, spatial_squeeze=spatial_squeeze, 347 | store_non_strided_activations=store_non_strided_activations, 348 | reuse=reuse, scope=scope) 349 | resnet_v1_152.default_image_size = resnet_v1.default_image_size 350 | 351 | 352 | def resnet_v1_200(inputs, 353 | num_classes=None, 354 | is_training=True, 355 | global_pool=True, 356 | output_stride=None, 357 | store_non_strided_activations=False, 358 | spatial_squeeze=True, 359 | min_base_depth=8, 360 | depth_multiplier=1, 361 | reuse=None, 362 | scope='resnet_v1_200'): 363 | """ResNet-200 model of [2]. See resnet_v1() for arg and return description.""" 364 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 365 | blocks = [ 366 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 367 | stride=2), 368 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=24, 369 | stride=2), 370 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=36, 371 | stride=2), 372 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 373 | stride=1), 374 | ] 375 | return resnet_v1(inputs, blocks, num_classes, is_training, 376 | global_pool=global_pool, output_stride=output_stride, 377 | include_root_block=True, spatial_squeeze=spatial_squeeze, 378 | store_non_strided_activations=store_non_strided_activations, 379 | reuse=reuse, scope=scope) 380 | resnet_v1_200.default_image_size = resnet_v1.default_image_size -------------------------------------------------------------------------------- /lib/networks/resnet/resnet_v1_tiny.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains definitions for the original form of Residual Networks. 16 | The 'v1' residual networks (ResNets) implemented in this module were proposed 17 | by: 18 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 19 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 20 | Other variants were introduced in: 21 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 22 | Identity Mappings in Deep Residual Networks. 
arXiv: 1603.05027 23 | The networks defined in this module utilize the bottleneck building block of 24 | [1] with projection shortcuts only for increasing depths. They employ batch 25 | normalization *after* every weight layer. This is the architecture used by 26 | MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and 27 | ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' 28 | architecture and the alternative 'v2' architecture of [2] which uses batch 29 | normalization *before* every weight layer in the so-called full pre-activation 30 | units. 31 | Typical use: 32 | from tensorflow.contrib.slim.nets import resnet_v1 33 | ResNet-101 for image classification into 1000 classes: 34 | # inputs has shape [batch, 224, 224, 3] 35 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 36 | net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) 37 | ResNet-101 for semantic segmentation into 21 classes: 38 | # inputs has shape [batch, 513, 513, 3] 39 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 40 | net, end_points = resnet_v1.resnet_v1_101(inputs, 41 | 21, 42 | is_training=False, 43 | global_pool=False, 44 | output_stride=16) 45 | """ 46 | from __future__ import absolute_import 47 | from __future__ import division 48 | from __future__ import print_function 49 | 50 | import tensorflow as tf 51 | from tensorflow.contrib import slim as contrib_slim 52 | 53 | from lib.networks.resnet import resnet_utils 54 | 55 | 56 | resnet_arg_scope = resnet_utils.resnet_arg_scope 57 | slim = contrib_slim 58 | 59 | 60 | class NoOpScope(object): 61 | """No-op context manager.""" 62 | 63 | def __enter__(self): 64 | return None 65 | 66 | def __exit__(self, exc_type, exc_value, traceback): 67 | return False 68 | 69 | 70 | @slim.add_arg_scope 71 | def bottleneck_tiny(inputs, 72 | depth, 73 | depth_bottleneck, 74 | stride, 75 | rate=1, 76 | outputs_collections=None, 77 | scope=None, 78 | use_bounded_activations=False): 79 | """Bottleneck residual unit variant with BN after convolutions. 80 | This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for 81 | its definition. Note that we use here the bottleneck variant which has an 82 | extra bottleneck layer. 83 | When putting together two consecutive ResNet blocks that use this unit, one 84 | should use stride = 2 in the last unit of the first block. 85 | Args: 86 | inputs: A tensor of size [batch, height, width, channels]. 87 | depth: The depth of the ResNet unit output. 88 | depth_bottleneck: The depth of the bottleneck layers. 89 | stride: The ResNet unit's stride. Determines the amount of downsampling of 90 | the units output compared to its input. 91 | rate: An integer, rate for atrous convolution. 92 | outputs_collections: Collection to add the ResNet unit output. 93 | scope: Optional variable_scope. 94 | use_bounded_activations: Whether or not to use bounded activations. Bounded 95 | activations better lend themselves to quantized inference. 96 | Returns: 97 | The ResNet unit's output. 98 | """ 99 | with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: 100 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) 101 | if depth == depth_in: 102 | shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') 103 | else: 104 | shortcut = slim.conv2d( 105 | inputs, 106 | depth, [1, 1], 107 | stride=stride, 108 | activation_fn=tf.nn.relu6 if use_bounded_activations else None, 109 | scope='shortcut') 110 | # zzh: for resnet 18, 24. 
It is diff from 50, 101, 251 etc 111 | residual = resnet_utils.conv2d_same(inputs, depth_bottleneck, 3, 1, 112 | rate=rate, scope='conv1') 113 | residual = resnet_utils.conv2d_same(residual, depth, 3, stride, 114 | rate=rate, scope='conv2') 115 | 116 | # residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1, 117 | # scope='conv1') 118 | # residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride, 119 | # rate=rate, scope='conv2') 120 | # residual = slim.conv2d(residual, depth, [1, 1], stride=1, 121 | # activation_fn=None, scope='conv3') 122 | 123 | 124 | if use_bounded_activations: 125 | # Use clip_by_value to simulate bandpass activation. 126 | residual = tf.clip_by_value(residual, -6.0, 6.0) 127 | output = tf.nn.relu6(shortcut + residual) 128 | else: 129 | output = tf.nn.relu(shortcut + residual) 130 | 131 | return slim.utils.collect_named_outputs(outputs_collections, 132 | sc.name, 133 | output) 134 | 135 | 136 | def resnet_v1_tiny(inputs, 137 | blocks, 138 | num_classes=None, 139 | is_training=True, 140 | global_pool=True, 141 | output_stride=None, 142 | include_root_block=True, 143 | spatial_squeeze=True, 144 | store_non_strided_activations=False, 145 | reuse=None, 146 | scope=None): 147 | """Generator for v1 ResNet models. 148 | This function generates a family of ResNet v1 models. See the resnet_v1_*() 149 | methods for specific model instantiations, obtained by selecting different 150 | block instantiations that produce ResNets of various depths. 151 | Training for image classification on Imagenet is usually done with [224, 224] 152 | inputs, resulting in [7, 7] feature maps at the output of the last ResNet 153 | block for the ResNets defined in [1] that have nominal stride equal to 32. 154 | However, for dense prediction tasks we advise that one uses inputs with 155 | spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In 156 | this case the feature maps at the ResNet output will have spatial shape 157 | [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] 158 | and corners exactly aligned with the input image corners, which greatly 159 | facilitates alignment of the features to the image. Using as input [225, 225] 160 | images results in [8, 8] feature maps at the output of the last ResNet block. 161 | For dense prediction tasks, the ResNet needs to run in fully-convolutional 162 | (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all 163 | have nominal stride equal to 32 and a good choice in FCN mode is to use 164 | output_stride=16 in order to increase the density of the computed features at 165 | small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. 166 | Args: 167 | inputs: A tensor of size [batch, height_in, width_in, channels]. 168 | blocks: A list of length equal to the number of ResNet blocks. Each element 169 | is a resnet_utils.Block object describing the units in the block. 170 | num_classes: Number of predicted classes for classification tasks. 171 | If 0 or None, we return the features before the logit layer. 172 | is_training: whether batch_norm layers are in training mode. If this is set 173 | to None, the callers can specify slim.batch_norm's is_training parameter 174 | from an outer slim.arg_scope. 175 | global_pool: If True, we perform global average pooling before computing the 176 | logits. Set to True for image classification, False for dense prediction. 
177 | output_stride: If None, then the output will be computed at the nominal 178 | network stride. If output_stride is not None, it specifies the requested 179 | ratio of input to output spatial resolution. 180 | include_root_block: If True, include the initial convolution followed by 181 | max-pooling, if False excludes it. 182 | spatial_squeeze: if True, logits is of shape [B, C], if false logits is 183 | of shape [B, 1, 1, C], where B is batch_size and C is number of classes. 184 | To use this parameter, the input images must be smaller than 300x300 185 | pixels, in which case the output logit layer does not contain spatial 186 | information and can be removed. 187 | store_non_strided_activations: If True, we compute non-strided (undecimated) 188 | activations at the last unit of each block and store them in the 189 | `outputs_collections` before subsampling them. This gives us access to 190 | higher resolution intermediate activations which are useful in some 191 | dense prediction problems but increases 4x the computation and memory cost 192 | at the last unit of each block. 193 | reuse: whether or not the network and its variables should be reused. To be 194 | able to reuse 'scope' must be given. 195 | scope: Optional variable_scope. 196 | Returns: 197 | net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. 198 | If global_pool is False, then height_out and width_out are reduced by a 199 | factor of output_stride compared to the respective height_in and width_in, 200 | else both height_out and width_out equal one. If num_classes is 0 or None, 201 | then net is the output of the last ResNet block, potentially after global 202 | average pooling. If num_classes a non-zero integer, net contains the 203 | pre-softmax activations. 204 | end_points: A dictionary from components of the network to the corresponding 205 | activation. 206 | Raises: 207 | ValueError: If the target output_stride is not valid. 208 | """ 209 | with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: 210 | end_points_collection = sc.original_name_scope + '_end_points' 211 | with slim.arg_scope([slim.conv2d, bottleneck_tiny, 212 | resnet_utils.stack_blocks_dense], 213 | outputs_collections=end_points_collection): 214 | with (slim.arg_scope([slim.batch_norm], is_training=is_training) 215 | if is_training is not None else NoOpScope()): 216 | net = inputs 217 | if include_root_block: 218 | if output_stride is not None: 219 | if output_stride % 4 != 0: 220 | raise ValueError('The output_stride needs to be a multiple of 4.') 221 | output_stride /= 4 222 | net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1') 223 | net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') 224 | net = slim.utils.collect_named_outputs(end_points_collection, 'pool2', net) 225 | net = resnet_utils.stack_blocks_dense(net, blocks, output_stride, 226 | store_non_strided_activations) 227 | # Convert end_points_collection into a dictionary of end_points. 
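        # The 'pool2' to 'pool5' keys exposed here are what model.py's feature
        # fusion reads: 'pool2' (collected above after the root max-pool, stride 4),
        # 'pool3' and 'pool4' (the block1 / block2 outputs, strides 8 and 16, set
        # just below) and 'pool5' (the final block output, stride 32).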
228 | end_points = slim.utils.convert_collection_to_dict( 229 | end_points_collection) 230 | 231 | try: 232 | end_points['pool3'] = end_points[scope + '/block1'] 233 | end_points['pool4'] = end_points[scope + '/block2'] 234 | except: 235 | end_points['pool3'] = end_points['Detection/' + scope + '/block1'] 236 | end_points['pool4'] = end_points['Detection/' + scope + '/block1'] 237 | end_points['pool5'] = net 238 | 239 | return net, end_points 240 | resnet_v1_tiny.default_image_size = 224 241 | 242 | 243 | def resnet_v1_block(scope, base_depth, num_units, stride): 244 | """Helper function for creating a resnet_v1 bottleneck block. 245 | Args: 246 | scope: The scope of the block. 247 | base_depth: The depth of the bottleneck layer for each unit. 248 | num_units: The number of units in the block. 249 | stride: The stride of the block, implemented as a stride in the last unit. 250 | All other units have stride=1. 251 | Returns: 252 | A resnet_v1 bottleneck block. 253 | """ 254 | return resnet_utils.Block(scope, bottleneck_tiny, [{ 255 | 'depth': base_depth * 4, 256 | 'depth_bottleneck': base_depth, 257 | 'stride': 1 258 | }] * (num_units - 1) + [{ 259 | 'depth': base_depth * 4, 260 | 'depth_bottleneck': base_depth, 261 | 'stride': stride 262 | }]) 263 | 264 | def resnet_v1_18(inputs, 265 | num_classes=None, 266 | is_training=True, 267 | global_pool=True, 268 | output_stride=None, 269 | spatial_squeeze=True, 270 | store_non_strided_activations=False, 271 | min_base_depth=8, 272 | depth_multiplier=1, 273 | reuse=None, 274 | scope='resnet_v1_18'): 275 | """ResNet-18 model of [1]. See resnet_v1() for arg and return description.""" 276 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 277 | blocks = [ 278 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=2, 279 | stride=2), 280 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=2, 281 | stride=2), 282 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=2, 283 | stride=2), 284 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=2, 285 | stride=1), 286 | ] 287 | return resnet_v1_tiny(inputs, blocks, num_classes, is_training, 288 | global_pool=global_pool, output_stride=output_stride, 289 | include_root_block=True, spatial_squeeze=spatial_squeeze, 290 | store_non_strided_activations=store_non_strided_activations, 291 | reuse=reuse, scope=scope) 292 | resnet_v1_18.default_image_size = resnet_v1_tiny.default_image_size 293 | 294 | def resnet_v1_34(inputs, 295 | num_classes=None, 296 | is_training=True, 297 | global_pool=True, 298 | output_stride=None, 299 | spatial_squeeze=True, 300 | store_non_strided_activations=False, 301 | min_base_depth=8, 302 | depth_multiplier=1, 303 | reuse=None, 304 | scope='resnet_v1_18'): 305 | """ResNet-18 model of [1]. 
See resnet_v1() for arg and return description.""" 306 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 307 | blocks = [ 308 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 309 | stride=2), 310 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=4, 311 | stride=2), 312 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=6, 313 | stride=2), 314 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 315 | stride=1), 316 | ] 317 | return resnet_v1_tiny(inputs, blocks, num_classes, is_training, 318 | global_pool=global_pool, output_stride=output_stride, 319 | include_root_block=True, spatial_squeeze=spatial_squeeze, 320 | store_non_strided_activations=store_non_strided_activations, 321 | reuse=reuse, scope=scope) 322 | resnet_v1_34.default_image_size = resnet_v1_tiny.default_image_size 323 | -------------------------------------------------------------------------------- /lib/networks/resnet/resnet_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains definitions for the preactivation form of Residual Networks. 16 | Residual networks (ResNets) were originally proposed in: 17 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 18 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 19 | The full preactivation 'v2' ResNet variant implemented in this module was 20 | introduced by: 21 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 22 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 23 | The key difference of the full preactivation 'v2' variant compared to the 24 | 'v1' variant in [1] is the use of batch normalization before every weight layer. 
25 | Typical use: 26 | from tensorflow.contrib.slim.nets import resnet_v2 27 | ResNet-101 for image classification into 1000 classes: 28 | # inputs has shape [batch, 224, 224, 3] 29 | with slim.arg_scope(resnet_v2.resnet_arg_scope()): 30 | net, end_points = resnet_v2.resnet_v2_101(inputs, 1000, is_training=False) 31 | ResNet-101 for semantic segmentation into 21 classes: 32 | # inputs has shape [batch, 513, 513, 3] 33 | with slim.arg_scope(resnet_v2.resnet_arg_scope()): 34 | net, end_points = resnet_v2.resnet_v2_101(inputs, 35 | 21, 36 | is_training=False, 37 | global_pool=False, 38 | output_stride=16) 39 | """ 40 | from __future__ import absolute_import 41 | from __future__ import division 42 | from __future__ import print_function 43 | 44 | import tensorflow as tf 45 | from tensorflow.contrib import slim as contrib_slim 46 | 47 | from lib.networks.resnet import resnet_utils 48 | 49 | slim = contrib_slim 50 | resnet_arg_scope = resnet_utils.resnet_arg_scope 51 | 52 | 53 | @slim.add_arg_scope 54 | def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1, 55 | outputs_collections=None, scope=None): 56 | """Bottleneck residual unit variant with BN before convolutions. 57 | This is the full preactivation residual unit variant proposed in [2]. See 58 | Fig. 1(b) of [2] for its definition. Note that we use here the bottleneck 59 | variant which has an extra bottleneck layer. 60 | When putting together two consecutive ResNet blocks that use this unit, one 61 | should use stride = 2 in the last unit of the first block. 62 | Args: 63 | inputs: A tensor of size [batch, height, width, channels]. 64 | depth: The depth of the ResNet unit output. 65 | depth_bottleneck: The depth of the bottleneck layers. 66 | stride: The ResNet unit's stride. Determines the amount of downsampling of 67 | the units output compared to its input. 68 | rate: An integer, rate for atrous convolution. 69 | outputs_collections: Collection to add the ResNet unit output. 70 | scope: Optional variable_scope. 71 | Returns: 72 | The ResNet unit's output. 73 | """ 74 | with tf.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc: 75 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) 76 | preact = slim.batch_norm(inputs, activation_fn=tf.nn.relu, scope='preact') 77 | if depth == depth_in: 78 | shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') 79 | else: 80 | shortcut = slim.conv2d(preact, depth, [1, 1], stride=stride, 81 | normalizer_fn=None, activation_fn=None, 82 | scope='shortcut') 83 | 84 | residual = slim.conv2d(preact, depth_bottleneck, [1, 1], stride=1, 85 | scope='conv1') 86 | residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride, 87 | rate=rate, scope='conv2') 88 | residual = slim.conv2d(residual, depth, [1, 1], stride=1, 89 | normalizer_fn=None, activation_fn=None, 90 | scope='conv3') 91 | 92 | output = shortcut + residual 93 | 94 | return slim.utils.collect_named_outputs(outputs_collections, 95 | sc.name, 96 | output) 97 | 98 | 99 | def resnet_v2(inputs, 100 | blocks, 101 | num_classes=None, 102 | is_training=True, 103 | global_pool=True, 104 | output_stride=None, 105 | include_root_block=True, 106 | spatial_squeeze=True, 107 | reuse=None, 108 | scope=None): 109 | """Generator for v2 (preactivation) ResNet models. 110 | This function generates a family of ResNet v2 models. See the resnet_v2_*() 111 | methods for specific model instantiations, obtained by selecting different 112 | block instantiations that produce ResNets of various depths. 
113 | Training for image classification on Imagenet is usually done with [224, 224] 114 | inputs, resulting in [7, 7] feature maps at the output of the last ResNet 115 | block for the ResNets defined in [1] that have nominal stride equal to 32. 116 | However, for dense prediction tasks we advise that one uses inputs with 117 | spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In 118 | this case the feature maps at the ResNet output will have spatial shape 119 | [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] 120 | and corners exactly aligned with the input image corners, which greatly 121 | facilitates alignment of the features to the image. Using as input [225, 225] 122 | images results in [8, 8] feature maps at the output of the last ResNet block. 123 | For dense prediction tasks, the ResNet needs to run in fully-convolutional 124 | (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all 125 | have nominal stride equal to 32 and a good choice in FCN mode is to use 126 | output_stride=16 in order to increase the density of the computed features at 127 | small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. 128 | Args: 129 | inputs: A tensor of size [batch, height_in, width_in, channels]. 130 | blocks: A list of length equal to the number of ResNet blocks. Each element 131 | is a resnet_utils.Block object describing the units in the block. 132 | num_classes: Number of predicted classes for classification tasks. 133 | If 0 or None, we return the features before the logit layer. 134 | is_training: whether batch_norm layers are in training mode. 135 | global_pool: If True, we perform global average pooling before computing the 136 | logits. Set to True for image classification, False for dense prediction. 137 | output_stride: If None, then the output will be computed at the nominal 138 | network stride. If output_stride is not None, it specifies the requested 139 | ratio of input to output spatial resolution. 140 | include_root_block: If True, include the initial convolution followed by 141 | max-pooling, if False excludes it. If excluded, `inputs` should be the 142 | results of an activation-less convolution. 143 | spatial_squeeze: if True, logits is of shape [B, C], if false logits is 144 | of shape [B, 1, 1, C], where B is batch_size and C is number of classes. 145 | To use this parameter, the input images must be smaller than 300x300 146 | pixels, in which case the output logit layer does not contain spatial 147 | information and can be removed. 148 | reuse: whether or not the network and its variables should be reused. To be 149 | able to reuse 'scope' must be given. 150 | scope: Optional variable_scope. 151 | Returns: 152 | net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. 153 | If global_pool is False, then height_out and width_out are reduced by a 154 | factor of output_stride compared to the respective height_in and width_in, 155 | else both height_out and width_out equal one. If num_classes is 0 or None, 156 | then net is the output of the last ResNet block, potentially after global 157 | average pooling. If num_classes is a non-zero integer, net contains the 158 | pre-softmax activations. 159 | end_points: A dictionary from components of the network to the corresponding 160 | activation. 161 | Raises: 162 | ValueError: If the target output_stride is not valid. 
163 | """ 164 | with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc: 165 | end_points_collection = sc.original_name_scope + '_end_points' 166 | with slim.arg_scope([slim.conv2d, bottleneck, 167 | resnet_utils.stack_blocks_dense], 168 | outputs_collections=end_points_collection): 169 | with slim.arg_scope([slim.batch_norm], is_training=is_training): 170 | net = inputs 171 | if include_root_block: 172 | if output_stride is not None: 173 | if output_stride % 4 != 0: 174 | raise ValueError('The output_stride needs to be a multiple of 4.') 175 | output_stride /= 4 176 | # We do not include batch normalization or activation functions in 177 | # conv1 because the first ResNet unit will perform these. Cf. 178 | # Appendix of [2]. 179 | with slim.arg_scope([slim.conv2d], 180 | activation_fn=None, normalizer_fn=None): 181 | net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1') 182 | net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') 183 | net = slim.utils.collect_named_outputs(end_points_collection, 'pool2', net) 184 | net = resnet_utils.stack_blocks_dense(net, blocks, output_stride) 185 | # This is needed because the pre-activation variant does not have batch 186 | # normalization or activation functions in the residual unit output. See 187 | # Appendix of [2]. 188 | net = slim.batch_norm(net, activation_fn=tf.nn.relu, scope='postnorm') 189 | # Convert end_points_collection into a dictionary of end_points. 190 | end_points = slim.utils.convert_collection_to_dict( 191 | end_points_collection) 192 | 193 | try: 194 | end_points['pool3'] = end_points[scope + '/block1'] 195 | end_points['pool4'] = end_points[scope + '/block2'] 196 | except: 197 | end_points['pool3'] = end_points['Detection/' + scope + '/block1'] 198 | end_points['pool4'] = end_points['Detection/' + scope + '/block1'] 199 | end_points['pool5'] = net 200 | 201 | # if global_pool: 202 | # # Global average pooling. 203 | # net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True) 204 | # end_points['global_pool'] = net 205 | # if num_classes: 206 | # net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, 207 | # normalizer_fn=None, scope='logits') 208 | # end_points[sc.name + '/logits'] = net 209 | # if spatial_squeeze: 210 | # net = tf.squeeze(net, [1, 2], name='SpatialSqueeze') 211 | # end_points[sc.name + '/spatial_squeeze'] = net 212 | # end_points['predictions'] = slim.softmax(net, scope='predictions') 213 | return net, end_points 214 | resnet_v2.default_image_size = 224 215 | 216 | 217 | def resnet_v2_block(scope, base_depth, num_units, stride): 218 | """Helper function for creating a resnet_v2 bottleneck block. 219 | Args: 220 | scope: The scope of the block. 221 | base_depth: The depth of the bottleneck layer for each unit. 222 | num_units: The number of units in the block. 223 | stride: The stride of the block, implemented as a stride in the last unit. 224 | All other units have stride=1. 225 | Returns: 226 | A resnet_v2 bottleneck block. 
227 | """ 228 | return resnet_utils.Block(scope, bottleneck, [{ 229 | 'depth': base_depth * 4, 230 | 'depth_bottleneck': base_depth, 231 | 'stride': 1 232 | }] * (num_units - 1) + [{ 233 | 'depth': base_depth * 4, 234 | 'depth_bottleneck': base_depth, 235 | 'stride': stride 236 | }]) 237 | resnet_v2.default_image_size = 224 238 | 239 | 240 | def resnet_v2_50(inputs, 241 | num_classes=None, 242 | is_training=True, 243 | global_pool=True, 244 | output_stride=None, 245 | spatial_squeeze=True, 246 | reuse=None, 247 | scope='resnet_v2_50'): 248 | """ResNet-50 model of [1]. See resnet_v2() for arg and return description.""" 249 | blocks = [ 250 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2), 251 | resnet_v2_block('block2', base_depth=128, num_units=4, stride=2), 252 | resnet_v2_block('block3', base_depth=256, num_units=6, stride=2), 253 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1), 254 | ] 255 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training, 256 | global_pool=global_pool, output_stride=output_stride, 257 | include_root_block=True, spatial_squeeze=spatial_squeeze, 258 | reuse=reuse, scope=scope) 259 | resnet_v2_50.default_image_size = resnet_v2.default_image_size 260 | 261 | 262 | def resnet_v2_101(inputs, 263 | num_classes=None, 264 | is_training=True, 265 | global_pool=True, 266 | output_stride=None, 267 | spatial_squeeze=True, 268 | reuse=None, 269 | scope='resnet_v2_101'): 270 | """ResNet-101 model of [1]. See resnet_v2() for arg and return description.""" 271 | blocks = [ 272 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2), 273 | resnet_v2_block('block2', base_depth=128, num_units=4, stride=2), 274 | resnet_v2_block('block3', base_depth=256, num_units=23, stride=2), 275 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1), 276 | ] 277 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training, 278 | global_pool=global_pool, output_stride=output_stride, 279 | include_root_block=True, spatial_squeeze=spatial_squeeze, 280 | reuse=reuse, scope=scope) 281 | resnet_v2_101.default_image_size = resnet_v2.default_image_size 282 | 283 | 284 | def resnet_v2_152(inputs, 285 | num_classes=None, 286 | is_training=True, 287 | global_pool=True, 288 | output_stride=None, 289 | spatial_squeeze=True, 290 | reuse=None, 291 | scope='resnet_v2_152'): 292 | """ResNet-152 model of [1]. See resnet_v2() for arg and return description.""" 293 | blocks = [ 294 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2), 295 | resnet_v2_block('block2', base_depth=128, num_units=8, stride=2), 296 | resnet_v2_block('block3', base_depth=256, num_units=36, stride=2), 297 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1), 298 | ] 299 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training, 300 | global_pool=global_pool, output_stride=output_stride, 301 | include_root_block=True, spatial_squeeze=spatial_squeeze, 302 | reuse=reuse, scope=scope) 303 | resnet_v2_152.default_image_size = resnet_v2.default_image_size 304 | 305 | 306 | def resnet_v2_200(inputs, 307 | num_classes=None, 308 | is_training=True, 309 | global_pool=True, 310 | output_stride=None, 311 | spatial_squeeze=True, 312 | reuse=None, 313 | scope='resnet_v2_200'): 314 | """ResNet-200 model of [2]. 
See resnet_v2() for arg and return description.""" 315 | blocks = [ 316 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2), 317 | resnet_v2_block('block2', base_depth=128, num_units=24, stride=2), 318 | resnet_v2_block('block3', base_depth=256, num_units=36, stride=2), 319 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1), 320 | ] 321 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training, 322 | global_pool=global_pool, output_stride=output_stride, 323 | include_root_block=True, spatial_squeeze=spatial_squeeze, 324 | reuse=reuse, scope=scope) 325 | resnet_v2_200.default_image_size = resnet_v2.default_image_size -------------------------------------------------------------------------------- /lib/postprocess/post_process.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import pyclipper 4 | 5 | from shapely.geometry import Polygon 6 | from db_config import cfg 7 | 8 | class SegDetectorRepresenter(): 9 | def __init__(self, thresh=0.3, box_thresh=0.7, max_candidates=1000): 10 | self.min_size = 1 11 | self.thresh = thresh 12 | self.box_thresh = box_thresh 13 | self.max_candidates = max_candidates 14 | 15 | def __call__(self, input_batch, score_maps, is_output_polygon=False): 16 | segmentation = self._binarize(score_maps) 17 | boxes_batch = [] 18 | scores_batch = [] 19 | for batch_index in range(len(input_batch)): 20 | height, width, _ = input_batch[batch_index].shape 21 | if is_output_polygon: 22 | boxes, scores = self._polygons_from_bitmap(score_maps[batch_index], segmentation[batch_index], width, height) 23 | else: 24 | boxes, scores = self._boxes_from_bitmap(score_maps[batch_index], segmentation[batch_index], width, height) 25 | boxes_batch.append(boxes) 26 | scores_batch.append(scores) 27 | return boxes_batch, scores_batch 28 | 29 | def _binarize(self, pred): 30 | return pred > self.thresh 31 | 32 | def _polygons_from_bitmap(self, pred, bitmap, dest_width, dest_height): 33 | ''' 34 | _bitmap: single map with shape (H, W), 35 | whose values are binarized as {0, 1} 36 | ''' 37 | 38 | assert len(bitmap.shape) == 3 39 | height, width, _ = bitmap.shape 40 | boxes = [] 41 | scores = [] 42 | 43 | contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) 44 | for contour in contours[:self.max_candidates]: 45 | epsilon = cfg.EPSILON_RATIO * cv2.arcLength(contour, True) 46 | approx = cv2.approxPolyDP(contour, epsilon, True) 47 | points = approx.reshape((-1, 2)) 48 | if points.shape[0] < 4: 49 | continue 50 | # print('poly contour shape', contour.shape) 51 | contour = contour.reshape([-1, 2]) 52 | score = self._box_score_fast(pred, contour) 53 | if self.box_thresh > score: 54 | continue 55 | # print('points', points) 56 | if points.shape[0] > 2: 57 | box = self._unclip(points, unclip_ratio=2.0) 58 | # print('bbox', box) 59 | if len(box) != 1: 60 | continue 61 | else: 62 | continue 63 | # print('box', box.shape) 64 | box = box.reshape(-1, 2) 65 | # print('re', box.shape) 66 | _, sside = self._get_mini_boxes(box.reshape((-1, 1, 2))) 67 | if sside < self.min_size + 2: 68 | continue 69 | 70 | if not isinstance(dest_width, int): 71 | dest_width = dest_width.item() 72 | dest_height = dest_height.item() 73 | 74 | box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) 75 | box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) 76 | boxes.append(box.astype(np.float)) 77 | scores.append(score) 
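        # Each kept polygon has been rescaled from bitmap coordinates to the original image size (dest_width, dest_height); its score is the mean of the probability map inside the detected contour (see _box_score_fast).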
78 | return np.array(boxes, ), scores 79 | 80 | def _boxes_from_bitmap(self, pred, bitmap, dest_width, dest_height): 81 | ''' 82 | _bitmap: single map with shape (H, W), 83 | whose values are binarized as {0, 1} 84 | ''' 85 | 86 | assert len(bitmap.shape) == 3 87 | height, width, _ = bitmap.shape 88 | contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) 89 | num_contours = min(len(contours), self.max_candidates) 90 | boxes = np.zeros((num_contours, 4, 2), dtype=np.float32) 91 | scores = np.zeros((num_contours,), dtype=np.float32) 92 | 93 | for index in range(num_contours): 94 | contour = contours[index].squeeze(1) 95 | points, sside = self._get_mini_boxes(contour) 96 | if sside < self.min_size: 97 | continue 98 | points = np.array(points) 99 | # print('bbox contour shape', contour.shape) 100 | score = self._box_score_fast(pred, contour) 101 | if self.box_thresh > score: 102 | continue 103 | 104 | box = self._unclip(points).reshape(-1, 1, 2) 105 | box, sside = self._get_mini_boxes(box) 106 | if sside < self.min_size + 2: 107 | continue 108 | box = np.array(box) 109 | if not isinstance(dest_width, int): 110 | dest_width = dest_width.item() 111 | dest_height = dest_height.item() 112 | 113 | box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) 114 | box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) 115 | boxes[index, :, :] = box.astype(np.int16) 116 | scores[index] = score 117 | return boxes, scores 118 | 119 | def _unclip(self, box, unclip_ratio=1.5): 120 | poly = Polygon(box) 121 | distance = poly.area * unclip_ratio / poly.length 122 | offset = pyclipper.PyclipperOffset() 123 | offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) 124 | expanded = np.array(offset.Execute(distance)) 125 | return expanded 126 | 127 | def _get_mini_boxes(self, contour): 128 | # print(contour.shape) 129 | bounding_box = cv2.minAreaRect(contour) 130 | points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) 131 | 132 | index_1, index_2, index_3, index_4 = 0, 1, 2, 3 133 | if points[1][1] > points[0][1]: 134 | index_1 = 0 135 | index_4 = 1 136 | else: 137 | index_1 = 1 138 | index_4 = 0 139 | if points[3][1] > points[2][1]: 140 | index_2 = 2 141 | index_3 = 3 142 | else: 143 | index_2 = 3 144 | index_3 = 2 145 | 146 | box = [points[index_1], points[index_2], points[index_3], points[index_4]] 147 | return box, min(bounding_box[1]) 148 | 149 | def _box_score_fast(self, bitmap, _box): 150 | h, w = bitmap.shape[:2] 151 | box = _box.copy() 152 | xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1) 153 | xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1) 154 | ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1) 155 | ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1) 156 | 157 | mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) 158 | box[:, 0] = box[:, 0] - xmin 159 | box[:, 1] = box[:, 1] - ymin 160 | cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) 161 | return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] 162 | -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import cv2 3 | import os 4 | import shutil 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from shapely.geometry import Polygon, MultiPoint 9 | 10 | import 
lib.networks.model as model 11 | 12 | def quad_iou(_gt_bbox, _pre_bbox): 13 | 14 | gt_poly = Polygon(_gt_bbox).convex_hull 15 | pre_poly = Polygon(_pre_bbox).convex_hull 16 | 17 | union_poly = np.concatenate((_gt_bbox, _pre_bbox)) 18 | 19 | if not gt_poly.intersects(pre_poly): 20 | iou = 0 21 | return iou 22 | else: 23 | inter_area = gt_poly.intersection(pre_poly).area 24 | union_area = MultiPoint(union_poly).convex_hull.area 25 | 26 | if union_area == 0: 27 | iou = 0 28 | else: 29 | iou = float(inter_area) / union_area 30 | 31 | return iou 32 | 33 | def polygon_riou(pred_box, gt_box): 34 | """ 35 | :param pred_box: list [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] 36 | :param gt_box: list [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] 37 | :return: 38 | """ 39 | pred_polygon_points = np.array(pred_box).reshape(-1, 2) 40 | pred_poly = Polygon(pred_polygon_points).convex_hull 41 | 42 | gt_polygon_points = np.array(gt_box).reshape(-1, 2) 43 | 44 | gt_poly = Polygon(gt_polygon_points).convex_hull 45 | if not pred_poly.intersects(gt_poly): 46 | iou = 0 47 | else: 48 | inter_area = pred_poly.intersection(gt_poly).area 49 | union_area = gt_poly.area 50 | if union_area == 0: 51 | iou = 0 52 | else: 53 | iou = float(inter_area) / union_area 54 | return iou 55 | 56 | def compute_f1_score(precision, recall): 57 | if precision == 0 or recall == 0: 58 | return 0.0 59 | else: 60 | return 2.0 * (precision * recall) / (precision + recall) 61 | 62 | def load_ctw1500_labels(path): 63 | """ 64 | load pts 65 | :param path: 66 | :return: polys shape [N, 14, 2] 67 | """ 68 | assert os.path.exists(path), '{} is not exits'.format(path) 69 | polys = [] 70 | tags = [] 71 | with open(path, 'r') as f: 72 | lines = f.readlines() 73 | for line in lines: 74 | parts = line.strip().split(',') 75 | x = float(parts[0]) 76 | y = float(parts[1]) 77 | pts = [float(i) for i in parts[4:32]] 78 | poly = np.array(pts) + [x, y] * 14 79 | polys.append(poly.reshape([-1, 2])) 80 | tags.append(False) 81 | return np.array(polys, np.float), tags 82 | 83 | def load_icdar_labels(path): 84 | pass 85 | 86 | def make_dir(dir): 87 | if os.path.exists(dir): 88 | shutil.rmtree(dir) 89 | os.makedirs(dir) 90 | 91 | 92 | def resize_img(img, max_size=736): 93 | h, w, _ = img.shape 94 | 95 | if max(h, w) > max_size: 96 | ratio = float(max_size) / h if h > w else float(max_size) / w 97 | else: 98 | ratio = 1. 
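    # The detector's backbone downsamples by a factor of 32, so the target height and width are forced to multiples of 32 before resizing.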
99 | 100 | resize_h = int(ratio * h) 101 | resize_w = int(ratio * w) 102 | 103 | resize_h = resize_h if resize_h % 32 == 0 else abs(resize_h // 32 - 1) * 32 104 | resize_w = resize_w if resize_w % 32 == 0 else abs(resize_w // 32 - 1) * 32 105 | resized_img = cv2.resize(img, (int(resize_w), int(resize_h))) 106 | 107 | ratio_h = resize_h / float(h) 108 | ratio_w = resize_w / float(w) 109 | 110 | return resized_img, (ratio_h, ratio_w) 111 | 112 | def ckpt2pb(ckptpath): 113 | 114 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 115 | tf.reset_default_graph() 116 | input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images') 117 | global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) 118 | 119 | binarize_map, threshold_map, thresh_binary = model.model(input_images, is_training=False) 120 | 121 | variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step) 122 | saver = tf.train.Saver(variable_averages.variables_to_restore()) 123 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) 124 | gpu_config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options, allow_soft_placement=True) 125 | sess = tf.Session(config=gpu_config) 126 | saver.restore(sess, ckptpath) 127 | 128 | from tensorflow.python.framework import graph_util 129 | constant_graph = graph_util.convert_variables_to_constants( 130 | sess, 131 | sess.graph_def, 132 | ['feature_fusion/binarize_branch/Conv2d_transpose_1/Sigmoid']) 133 | 134 | with tf.gfile.FastGFile('db.pb', mode='wb') as f: 135 | f.write(constant_graph.SerializeToString()) 136 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | imgaug 3 | pyclipper 4 | easydict 5 | Shapely 6 | tqdm 7 | tensorflow 8 | imageio 9 | opencv_python 10 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import time 3 | import numpy as np 4 | import logging 5 | import os 6 | import tensorflow as tf 7 | from tensorflow.contrib import slim 8 | 9 | from db_config import cfg 10 | 11 | import lib.networks.model as model 12 | from lib.networks.losses import compute_loss, compute_acc 13 | from lib.dataset.dataloader import get_batch 14 | 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | def make_dir(dir): 19 | if not os.path.exists(dir): 20 | os.makedirs(dir) 21 | 22 | def tower_loss(images, gt_score_maps, gt_threshold_map, gt_score_mask, 23 | gt_thresh_mask, reuse_variables): 24 | 25 | with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables): 26 | binarize_map, threshold_map, thresh_binary = model.model(images, is_training=True) 27 | 28 | model_loss = compute_loss(binarize_map, threshold_map, thresh_binary, 29 | gt_score_maps, gt_threshold_map, gt_score_mask, gt_thresh_mask) 30 | 31 | total_loss = tf.add_n([model_loss] + tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 32 | 33 | # add summary 34 | if reuse_variables is None: 35 | tf.summary.image('gt/input_imgs', images) 36 | tf.summary.image('gt/score_map', gt_score_maps) 37 | tf.summary.image('gt/threshold_map', gt_threshold_map * 255) 38 | tf.summary.image('gt/score_mask', gt_score_mask) 39 | tf.summary.image('gt/thresh_mask', gt_thresh_mask) 40 | 41 | tf.summary.image('pred/binarize_map', binarize_map) 42 | 
tf.summary.image('pred/threshold_map', threshold_map * 255) 43 | tf.summary.image('pred/thresh_binary', thresh_binary) 44 | 45 | tf.summary.scalar('model_loss', model_loss) 46 | tf.summary.scalar('total_loss', total_loss) 47 | 48 | return total_loss, model_loss, binarize_map, threshold_map, thresh_binary 49 | 50 | 51 | def average_gradients(tower_grads): 52 | average_grads = [] 53 | for grad_and_vars in zip(*tower_grads): 54 | grads = [] 55 | for g, _ in grad_and_vars: 56 | expanded_g = tf.expand_dims(g, 0) 57 | grads.append(expanded_g) 58 | 59 | grad = tf.concat(grads, 0) 60 | grad = tf.reduce_mean(grad, 0) 61 | 62 | v = grad_and_vars[0][1] 63 | grad_and_var = (grad, v) 64 | average_grads.append(grad_and_var) 65 | 66 | return average_grads 67 | 68 | 69 | def _train_logger_init(): 70 | """ 71 | 初始化log日志 72 | :return: 73 | """ 74 | train_logger = logging.getLogger('train') 75 | train_logger.setLevel(logging.DEBUG) 76 | 77 | # 添加文件输出 78 | log_file = os.path.join(cfg["TRAIN"]["TRAIN_LOGS"], time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + '.logs') 79 | file_handler = logging.FileHandler(log_file, mode='w') 80 | file_handler.setLevel(logging.DEBUG) 81 | file_formatter = logging.Formatter('%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s') 82 | file_handler.setFormatter(file_formatter) 83 | train_logger.addHandler(file_handler) 84 | 85 | # 添加控制台输出 86 | consol_handler = logging.StreamHandler() 87 | consol_handler.setLevel(logging.DEBUG) 88 | consol_formatter = logging.Formatter('%(message)s') 89 | consol_handler.setFormatter(consol_formatter) 90 | train_logger.addHandler(consol_handler) 91 | return train_logger 92 | 93 | 94 | def main(): 95 | import os 96 | os.environ['CUDA_VISIBLE_DEVICES'] = cfg.TRAIN.VIS_GPU 97 | if not tf.gfile.Exists(cfg["TRAIN"]["CHECKPOINTS_OUTPUT_DIR"]): 98 | tf.gfile.MkDir(cfg["TRAIN"]["CHECKPOINTS_OUTPUT_DIR"]) 99 | 100 | train_logger = _train_logger_init() 101 | 102 | input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images') 103 | input_score_maps = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_score_maps') 104 | input_threshold_maps = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_threshold_maps') 105 | 106 | input_score_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_score_masks') 107 | input_threshold_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_threshold_masks') 108 | 109 | global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) 110 | 111 | learning_rate = tf.train.exponential_decay(cfg["TRAIN"]["LEARNING_RATE"], global_step, decay_steps=10000, 112 | decay_rate=0.94, staircase=True) 113 | 114 | if cfg.TRAIN.OPT == 'adam': 115 | # learning_rate = tf.constant(cfg["TRAIN"]["LEARNING_RATE"], tf.float32) 116 | opt = tf.train.AdamOptimizer(learning_rate) 117 | elif cfg.TRAIN.OPT == 'momentum': 118 | opt = tf.train.MomentumOptimizer(learning_rate, 0.9) 119 | else: 120 | assert 0, 'error optimzer' 121 | print('use ', cfg.TRAIN.OPT) 122 | 123 | # add summary 124 | tf.summary.scalar('learning_rate', learning_rate) 125 | 126 | gpus = [str(i) for i in range(len(cfg.TRAIN.VIS_GPU.split(',')))] 127 | input_images_split = tf.split(input_images, len(gpus)) 128 | input_score_maps_split = tf.split(input_score_maps, len(gpus)) 129 | input_threshold_maps_split = tf.split(input_threshold_maps, len(gpus)) 130 | input_score_masks_split = tf.split(input_score_masks, 
len(gpus)) 131 | input_threshold_masks_split = tf.split(input_threshold_masks, len(gpus)) 132 | 133 | 134 | tower_grads = [] 135 | reuse_variables = None 136 | total_binarize_acc = 0 137 | total_thresh_binary_acc = 0 138 | for i, gpu_id in enumerate(gpus): 139 | print('gpu_id', gpu_id) 140 | with tf.device('/gpu:' + gpu_id): 141 | with tf.name_scope('model_' + gpu_id) as scope: 142 | gt_imgs = input_images_split[i] 143 | gt_scores = input_score_maps_split[i] 144 | gt_thresholds = input_threshold_maps_split[i] 145 | gt_score_masks = input_score_masks_split[i] 146 | gt_threshold_masks = input_threshold_masks_split[i] 147 | total_loss, model_loss, binarize_map, threshold_map, thresh_binary = \ 148 | tower_loss(gt_imgs, gt_scores, gt_thresholds, gt_score_masks, gt_threshold_masks, reuse_variables) 149 | binarize_acc, thresh_binary_acc = compute_acc(binarize_map, threshold_map, thresh_binary, 150 | gt_scores, gt_thresholds, gt_score_masks, gt_threshold_masks) 151 | total_binarize_acc += binarize_acc 152 | total_thresh_binary_acc += thresh_binary_acc 153 | reuse_variables = True 154 | 155 | batch_norm_updates_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)) 156 | 157 | grads = opt.compute_gradients(total_loss) 158 | tower_grads.append(grads) 159 | 160 | grads = average_gradients(tower_grads) 161 | apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) 162 | 163 | avg_binarize_acc = total_binarize_acc / len(gpus) 164 | avg_thresh_binary_acc = total_thresh_binary_acc / len(gpus) 165 | 166 | summary_op = tf.summary.merge_all() 167 | 168 | variable_averages = tf.train.ExponentialMovingAverage(cfg["TRAIN"]["MOVING_AVERAGE_DECAY"], global_step) 169 | 170 | variables_averages_op = variable_averages.apply(tf.trainable_variables()) 171 | 172 | with tf.control_dependencies([variables_averages_op, apply_gradient_op, batch_norm_updates_op]): 173 | train_op = tf.no_op(name='train_op') 174 | 175 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=cfg.TRAIN.SAVE_MAX) 176 | 177 | 178 | train_logs_dir = os.path.join(cfg.TRAIN.TRAIN_LOGS, 'train') 179 | val_logs_dir = os.path.join(cfg.TRAIN.TRAIN_LOGS, 'val') 180 | 181 | make_dir(train_logs_dir) 182 | make_dir(val_logs_dir) 183 | 184 | train_summary_writer = tf.summary.FileWriter(train_logs_dir, tf.get_default_graph()) 185 | val_summary_writer = tf.summary.FileWriter(val_logs_dir, tf.get_default_graph()) 186 | 187 | 188 | init = tf.global_variables_initializer() 189 | 190 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 191 | try: 192 | 193 | if cfg["TRAIN"]["RESTORE"]: 194 | train_logger.info('continue training from previous checkpoint') 195 | ckpt = tf.train.get_checkpoint_state(cfg["TRAIN"]["RESTORE_CKPT_PATH"]) 196 | train_logger.info('restore model path: %s', ckpt.model_checkpoint_path) 197 | saver.restore(sess, ckpt.model_checkpoint_path) 198 | train_logger.info("done") 199 | elif cfg["TRAIN"]["PRETRAINED_MODEL_PATH"] is not None: 200 | sess.run(init) 201 | print(cfg["TRAIN"]["PRETRAINED_MODEL_PATH"]) 202 | train_logger.info('load pretrain model: %s', cfg["TRAIN"]["PRETRAINED_MODEL_PATH"]) 203 | variable_restore_op = slim.assign_from_checkpoint_fn(cfg["TRAIN"]["PRETRAINED_MODEL_PATH"], 204 | slim.get_trainable_variables(), 205 | ignore_missing_vars=True) 206 | variable_restore_op(sess) 207 | train_logger.info("done") 208 | 209 | else: 210 | sess.run(init) 211 | except Exception as e: 212 | raise RuntimeError('failed to load model weights: {}'.format(e)) 213 | 214 | train_data_generator = get_batch(num_workers=cfg.TRAIN.NUM_READERS, 215 | 
img_dir=cfg.TRAIN.IMG_DIR, 216 | label_dir=cfg.TRAIN.LABEL_DIR, 217 | batchsize=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(gpus)) 218 | 219 | val_data_generator = get_batch(num_workers=10, 220 | img_dir=cfg.EVAL.IMG_DIR, 221 | label_dir=cfg.EVAL.LABEL_DIR, 222 | batchsize=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(gpus)) 223 | 224 | test_data_generator = get_batch(num_workers=1, 225 | img_dir=cfg.EVAL.IMG_DIR, 226 | label_dir=cfg.EVAL.LABEL_DIR, 227 | batchsize=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(gpus), 228 | is_eval=True) 229 | 230 | test_epoch = 0 231 | 232 | start = time.time() 233 | for step in range(cfg["TRAIN"]["MAX_STEPS"]): 234 | train_data = next(train_data_generator) 235 | 236 | train_feed_dict = {input_images: train_data[0], 237 | input_score_maps: train_data[1], 238 | input_threshold_maps: train_data[3], 239 | input_score_masks: train_data[2], 240 | input_threshold_masks: train_data[4]} 241 | 242 | ml, tl, _ = sess.run([model_loss, total_loss, train_op], feed_dict=train_feed_dict) 243 | if np.isnan(tl): 244 | train_logger.info('Loss diverged, stop training') 245 | break 246 | 247 | if step % 10 == 0: 248 | avg_time_per_step = (time.time() - start) / 10 249 | avg_examples_per_second = (10 * cfg["TRAIN"]["BATCH_SIZE_PER_GPU"] * len(gpus)) / (time.time() - start) 250 | start = time.time() 251 | train_logger.info( 252 | '{}->Step {:06d}, model loss {:.4f}, total loss {:.4f}, {:.2f} seconds/step, {:.2f} examples/second'.format( 253 | cfg.TRAIN.VERSION, step, ml, tl, avg_time_per_step, avg_examples_per_second)) 254 | 255 | if step % cfg["TRAIN"]["SAVE_CHECKPOINT_STEPS"] == 0: 256 | saver.save(sess, os.path.join(cfg["TRAIN"]["CHECKPOINTS_OUTPUT_DIR"], 257 | 'DB_' + cfg.BACKBONE + '_' + cfg.TRAIN.VERSION + '_model.ckpt'), 258 | global_step=global_step) 259 | 260 | if step % cfg["TRAIN"]["SAVE_SUMMARY_STEPS"] == 0: 261 | _, tl, train_summary_str = sess.run([train_op, total_loss, summary_op], feed_dict=train_feed_dict) 262 | train_summary_writer.add_summary(train_summary_str, global_step=step) 263 | 264 | val_data = next(val_data_generator) 265 | val_feed_dict = {input_images: val_data[0], 266 | input_score_maps: val_data[1], 267 | input_threshold_maps: val_data[3], 268 | input_score_masks: val_data[2], 269 | input_threshold_masks: val_data[4]} 270 | eval_summary_str = sess.run(summary_op, feed_dict=val_feed_dict) 271 | 272 | val_summary_writer.add_summary(eval_summary_str, global_step=step) 273 | 274 | if step % cfg.EVAL.TEST_STEP == 0 and step != 0: 275 | temp_epoch = test_epoch 276 | train_logger.info('~~~~~~~~~~~~~~~~~~start to test~~~~~~~~~~~~~~~~~~~~~') 277 | avg_bc = [] 278 | avg_tbc = [] 279 | while temp_epoch==test_epoch: 280 | test_data = next(test_data_generator) 281 | test_feed_dict = {input_images: test_data[0], 282 | input_score_maps: test_data[1], 283 | input_threshold_maps: test_data[3], 284 | input_score_masks: test_data[2], 285 | input_threshold_masks: test_data[4]} 286 | test_epoch = test_data[5] 287 | bc, tbc = sess.run([avg_binarize_acc, avg_thresh_binary_acc], 288 | feed_dict=test_feed_dict) 289 | 290 | avg_bc.append(bc) 291 | avg_tbc.append(tbc) 292 | 293 | train_logger.info('avg binarize acc is :{}'.format(sum(avg_bc)/len(avg_bc))) 294 | train_logger.info('avg thresh binary acc is :{}'.format(sum(avg_tbc)/len(avg_tbc))) 295 | 296 | 297 | if __name__ == '__main__': 298 | 299 | main() 300 | 301 | --------------------------------------------------------------------------------