├── .gitignore ├── README.md ├── __init__.py ├── __pycache__ ├── config.cpython-36.pyc └── nms.cpython-36.pyc ├── db_config.py ├── evaluate.py ├── figures ├── 1039_bboxshow.jpg ├── 1039_binarize_map.jpg ├── 1039_polyshow.jpg ├── 1039_thresh_binary.jpg ├── 1039_threshold_map.jpg ├── bacc.png ├── bloss.png ├── mloss.png ├── net.png ├── org.jpg ├── tbacc.png ├── tbloss.png ├── tloss.png └── ttloss.png ├── inference.py ├── lib ├── __init__.py ├── dataset │ ├── __init__.py │ ├── __pycache__ │ │ ├── data_util.cpython-36.pyc │ │ ├── data_utils.cpython-36.pyc │ │ └── dataload.cpython-36.pyc │ ├── dataloader.py │ ├── generator_enqueuer.py │ ├── img_aug.py │ └── label_maker.py ├── networks │ ├── losses.py │ ├── mobilenet │ │ ├── conv_blocks.py │ │ ├── mobilenet.py │ │ ├── mobilenet_v2.py │ │ └── mobilenet_v3.py │ ├── model.py │ └── resnet │ │ ├── resnet_utils.py │ │ ├── resnet_v1.py │ │ ├── resnet_v1_tiny.py │ │ └── resnet_v2.py ├── postprocess │ └── post_process.py └── utils.py ├── requirements.txt └── train.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # pycharm
2 | .idea
3 | *.xml
4 | .DS_Store
5 |
6 | # model
7 | #checkpoints
8 | #checkpoint
9 | *.pb
10 | *.npy
11 | *ckpt*
12 |
13 | # image
14 | #*.png
15 | *.jpg
16 | *.JPG
17 | *.jpeg
18 |
19 | # txt result
20 | *.txt
21 |
22 | # compiled files
23 | *.pyc
24 |
25 |
26 | # logs output
27 | events*
28 | *.pkl
29 |
30 | # mAP
31 | *.json
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DB: Real-time Scene Text Detection with Differentiable Binarization
2 |
3 |
4 | ## Introduction
5 | This is a TensorFlow implementation of ["Real-time Scene Text Detection with Differentiable Binarization"](https://arxiv.org/abs/1911.08947).
6 |
7 | Part of the code is inherited from [DB](https://github.com/MhLiao/DB).
8 |
9 | ![net](figures/net.png)
10 |
11 |
12 | ## ToDo List
13 |
14 | - [x] Release trained models
15 | - [x] Training code
16 | - [x] Inference code
17 | - [x] Multi-GPU training
18 | - [x] TensorBoard support
19 | - [x] Experiment with additional training losses
20 | - [ ] Eval code
21 | - [x] Data augmentation (crop and random image aug)
22 | - [x] More backbones
23 | - [x] Add dilated conv (ASPP layer)
24 | - [ ] Deformable Convolutional Networks
25 |
26 |
27 | ## Install
28 |
29 |     pip install -r requirements.txt
30 |
31 |
32 | ## Test
33 |
34 | ### 1. Download a trained model.
35 |
36 | | Model | Download link |
37 | |------------ |------- |
38 | | ResNet-50| [BaiduYun](https://pan.baidu.com/s/1Pfwl8M6aBwuUpJbP2jVFuw), [GoogleDrive](https://drive.google.com/drive/folders/1uJL6sf6EP6ekK_4XLNGLt1U9EGRJ0eDO?usp=sharing)|
39 | | ResNet-50-ASPP |[BaiduYun](https://pan.baidu.com/s/1OlMbhLSaQYb4U1VZZGabHgf), [GoogleDrive](https://drive.google.com/open?id=1s91HWS4dtXCFv5x5-YlCaj-KbobnEEUuf)|
40 | ### 2. Configure the network
41 | Edit `db_config.py`:
42 |
43 |     cfg.BACKBONE = 'resnet_v1_50'
44 |     # if the trained model name does not contain 'aspp', this should be False
45 |     cfg.ASPP_LAYER = False
46 |
47 | ### 3. Run inference on an image.
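
Run the CLI below, or call the `DB` wrapper from `inference.py` directly. A minimal sketch (checkpoint and image paths are placeholders):

    from inference import DB

    db = DB(ckpt_path='path/to/model.ckpt', gpuid='0')
    boxes, scores, (net_time, post_time) = db.detect_img('img.jpg', ispoly=True, show_res=True)
    print('%d regions, net %.0f ms, post %.0f ms' % (len(boxes), net_time * 1000, post_time * 1000))

`boxes` holds the detected polygons mapped back to the original image resolution; with `show_res=True` the score maps and a polygon overlay are also written to `./show/`.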
48 |
49 |     python inference.py --gpuid='0' --ckptpath='path' --imgpath='img.jpg'
50 |
51 |
52 | ## Sample results
53 |
54 | | original image | poly show | bbox show |
55 | |------------ |------- |------- |
56 | | ![poly_img](figures/org.jpg) | ![poly_img](figures/1039_polyshow.jpg) | ![bbox_img](figures/1039_bboxshow.jpg) |
57 | | binarize_map | threshold_map | thresh_binary |
58 | | ![bin_map](figures/1039_binarize_map.jpg) | ![thres_map](figures/1039_threshold_map.jpg) | ![bin_thres_map](figures/1039_thresh_binary.jpg) |
59 |
60 |
61 | ## Dataset
62 | This repo is trained on the CTW1500 dataset.
63 | Download it from [BaiduYun](https://pan.baidu.com/s/1yG_191LemrQa7K0h7Wispw) (key:yjiz) or
64 | [OneDrive](https://1drv.ms/u/s!Aplwt7jiPGKilH4XzZPoKrO7Aulk).
65 |
66 |
67 | ## Training the model
68 | #### 1. Set the CTW1500 training image and label paths.
69 |
70 | Edit `db_config.py`:
71 |
72 |     # Train data config
73 |     cfg.TRAIN.IMG_DIR = '/path/ctw1500/train/text_image'
74 |     cfg.TRAIN.LABEL_DIR = '/path/ctw1500/train/text_label_curve'
75 |
76 |     # Val or test data config
77 |     cfg.EVAL.IMG_DIR = '/path/ctw1500/test/text_image'
78 |     cfg.EVAL.LABEL_DIR = '/path/ctw1500/test/text_label_circum'
79 |
80 |
81 | #### 2. Configure the network and multi-GPU training.
82 |
83 | Edit `db_config.py`:
84 |
85 |     # only 'resnet_v1_50' and 'resnet_v1_18' are supported
86 |     cfg.BACKBONE = 'resnet_v1_50'
87 |     # set to True to train the ASPP variant
88 |     cfg.ASPP_LAYER = False
89 |     cfg.TRAIN.VIS_GPU = '5,6' # single GPU -> '0'
90 |
91 |
92 | #### 3. Set paths for training logs and checkpoints.
93 |
94 | Edit `db_config.py`:
95 |
96 |     cfg.TRAIN.TRAIN_LOGS = '/path/tf_logs'
97 |     cfg.TRAIN.CHECKPOINTS_OUTPUT_DIR = '/path/ckpt'
98 |
99 | #### 4. Start from a pretrained model or restore a checkpoint.
100 |
101 | To fine-tune from a pretrained model, edit `db_config.py`:
102 |
103 |     cfg.TRAIN.RESTORE = False
104 |     cfg.TRAIN.PRETRAINED_MODEL_PATH = 'pretrain model path'
105 |
106 | To restore training from a checkpoint, edit `db_config.py`:
107 |
108 |     cfg.TRAIN.RESTORE = True
109 |     cfg.TRAIN.RESTORE_CKPT_PATH = 'checkpoint path'
110 |
111 | #### 5. Start training.
112 |
113 |     python train.py
114 |
115 | #### 6. TensorBoard
116 |
117 |     cd 'tensorboard path'
118 |     tensorboard --logdir=./
119 |
120 | Red curves are training logs; blue curves are validation logs.
121 |
122 | Loss curves
123 |
124 | | binarize loss | threshold loss |threshold binary loss |
125 | |------------ |------- |------- |
126 | | ![binarize_loss](figures/bloss.png) | ![threshold loss](figures/tloss.png) |![thresh_binary_loss](figures/tbloss.png) |
127 | | model_loss | total_loss | |
128 | | ![model_loss](figures/mloss.png) | ![total_loss](figures/ttloss.png) | |
129 |
130 |
131 | Accuracy curves
132 |
133 | | binarize acc | threshold binary acc |
134 | |------------ |------- |
135 | | ![binarize acc](figures/bacc.png) | ![threshold binary acc](figures/tbacc.png) |
136 |
137 |
138 |
139 | ## Experiment
140 |
141 | Tested on an RTX 2080 Ti.
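(The FPS column in the table below is consistent with 1000 / (inference time + post-process time); e.g. 1000 / (13.3 + 2.9) ≈ 61.7 for the first row.)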
142 | 143 | | BackBone | ASPP | Input Size | Infernce Time(ms) | PostProcess Time(ms) | FPS | 144 | |------------ |------ |-------- |------- |------- |------- | 145 | | ResNet-50 | × | 320 | 13.3 | 2.9 | 61.7 | 146 | | ResNet-50 | × | 512 | 19.2 | 4.5 | 42.2 | 147 | | ResNet-50 | × | 640 | 28.9 | 5.2 | 29.3 | 148 | | ResNet-50 | × | 736 | 33.2 | 5.7 | 25.7 | 149 | | ResNet-18 | × | 320 | 12.2 | 2.9 | 66.2 | 150 | | ResNet-18 | × | 512 | 16.9 | 4.5 | 46.7 | 151 | | ResNet-18 | × | 736 | 32.7 | 5.7 | 26 | 152 | | ResNet-50 | √ | 640 | 32.6 | --- | --- | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/__init__.py -------------------------------------------------------------------------------- /__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/nms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/__pycache__/nms.cpython-36.pyc -------------------------------------------------------------------------------- /db_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from easydict import EasyDict as edict 3 | 4 | cfg = edict() 5 | 6 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~inference~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 7 | cfg.MEANS = [123.68, 116.78, 103.94] 8 | cfg.INPUT_MAX_SIZE = 640 9 | cfg.K = 10 10 | cfg.EPSILON_RATIO = 0.001 11 | cfg.SHRINK_RATIO = 0.4 12 | cfg.THRESH_MIN = 0.3 13 | cfg.THRESH_MAX = 0.7 14 | cfg.FILTER_MIN_AREA = 1e-4 15 | 16 | # ['resnet_v1_50', 'resnet_v1_18', 'resnet_v2_50', 'resnet_v2_18', 'mobilenet_v2', 'mobilenet_v3'] 17 | cfg.BACKBONE = 'resnet_v1_50' 18 | cfg.ASPP_LAYER = False 19 | # ~~~~~~~~~~~~~~~~~~z~~~~~~~~~~~~~train config~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | cfg.TRAIN = edict() 22 | cfg.TRAIN.VERSION = 'aspp' 23 | # 多gpu训练 24 | cfg.TRAIN.VIS_GPU = '3,4' 25 | cfg.TRAIN.BATCH_SIZE_PER_GPU = 2 26 | cfg.TRAIN.LOSS_ALPHA = 1.0 27 | cfg.TRAIN.LOSS_BETA = 10.0 28 | 29 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~dataload & aug~~~~~~~~~~~~~~~~~~~~~~~~~~ 30 | cfg.TRAIN.IMG_DIR = '/hostpersistent/zzh/dataset/open_data/ctw1500/train/text_image' 31 | cfg.TRAIN.LABEL_DIR = '/hostpersistent/zzh/dataset/open_data/ctw1500/train/text_label_curve' 32 | cfg.TRAIN.IMG_SIZE = 640 33 | cfg.TRAIN.MIN_TEXT_SIZE = 1 34 | cfg.TRAIN.MIN_AREA = 1 35 | cfg.TRAIN.IMG_SCALE = [0.5, 1, 1, 1, 1.5, 2.0] 36 | cfg.TRAIN.CROP_PROB = 0.9 37 | cfg.TRAIN.MIN_CROP_SIDE_RATIO = 0.001 38 | cfg.TRAIN.NUM_READERS = 20 39 | cfg.TRAIN.DATA_AUG_PROB = 0.0 40 | cfg.TRAIN.AUG_TOOL = ['GaussianBlur', 41 | 'AverageBlur', 42 | 'MedianBlur', 43 | 'BilateralBlur', 44 | 'MotionBlur', 45 | #'ElasticTransformation', 46 | #'PerspectiveTransform', 47 | ] 48 | 49 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~save ckpt and log~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 50 | cfg.TRAIN.MAX_STEPS = 10000000 51 | cfg.TRAIN.SAVE_CHECKPOINT_STEPS = 2000 52 | cfg.TRAIN.SAVE_SUMMARY_STEPS = 100 53 | cfg.TRAIN.SAVE_MAX = 20 54 | cfg.TRAIN.TRAIN_LOGS = 
os.path.join('/hostpersistent/zzh/lab/DB-tf/', 'tf_logs') 55 | cfg.TRAIN.CHECKPOINTS_OUTPUT_DIR = os.path.join('/hostpersistent/zzh/lab/DB-tf/', 'ckpt') 56 | 57 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~restore and pretrain~~~~~~~~~~~~~~~~~~~~~ 58 | cfg.TRAIN.RESTORE = None 59 | cfg.TRAIN.RESTORE_CKPT_PATH = os.path.join('/hostpersistent/zzh/lab/DB-tf/', 'ckpt') 60 | cfg.TRAIN.PRETRAINED_MODEL_PATH = '/hostpersistent/zzh/lab/DB-tf/ckpt/DB_resnet_v1_50_1223_model.ckpt-121201' 61 | 62 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~super em~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 63 | cfg.TRAIN.LEARNING_RATE = 0.0001 64 | cfg.TRAIN.OPT = 'adam'#'momentum'# 65 | cfg.TRAIN.MOVING_AVERAGE_DECAY = 0.997 66 | 67 | 68 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eval ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 69 | cfg.EVAL = edict() 70 | cfg.EVAL.IMG_DIR = '/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_image' 71 | cfg.EVAL.LABEL_DIR = '/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_label_circum' 72 | cfg.EVAL.NUM_READERS = 1 73 | cfg.EVAL.TEST_STEP = 5000 74 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import os 4 | import cv2 5 | import tqdm 6 | import numpy as np 7 | 8 | from inference import DB 9 | from db_config import cfg 10 | from lib.utils import quad_iou, compute_f1_score, load_ctw1500_labels, make_dir 11 | 12 | 13 | def load_pred_labels(path): 14 | pass 15 | 16 | def evaluate(gt_care_list, gt_dontcare_list, pred_list, overlap=0.5): 17 | """ 18 | 19 | :param gt_care_list: [-1, M, 2] 20 | :param gt_dontcare_list: [-1, M, 2] 21 | :param pred_list: [-1, M, 2] 22 | :param overlap: 23 | :return: 24 | """ 25 | 26 | pred_care_list =[] 27 | pred_dontcare_list = [] 28 | 29 | if len(gt_dontcare_list) != 0: 30 | for pred_box in pred_list: 31 | flag = False 32 | for gt_box in gt_dontcare_list: 33 | if quad_iou(gt_box, pred_box) > overlap: 34 | flag = True 35 | break 36 | 37 | if not flag: 38 | pred_care_list.append(pred_box) 39 | else: 40 | pred_dontcare_list.append(pred_box) 41 | else: 42 | pred_care_list = pred_list 43 | 44 | gt_care_flag_list = [False] * len(gt_care_list) 45 | pred_care_flag_list = [False] * len(pred_care_list) 46 | pairs_list = [] 47 | gt_not_pair_list = [] 48 | pred_not_pair_list = [] 49 | 50 | for gt_i, gt_box in enumerate(gt_care_list): 51 | for pred_i, pred_box in enumerate(pred_care_list): 52 | if pred_care_flag_list[pred_i]: 53 | continue 54 | else: 55 | iou = quad_iou(gt_box, pred_box) 56 | if iou > overlap: 57 | pair_dict = {} 58 | pair_dict['gt'] = gt_box 59 | pair_dict['pred'] = pred_box 60 | pair_dict['iou'] = iou 61 | pairs_list.append(pair_dict) 62 | pred_care_flag_list[pred_i] = True 63 | gt_care_flag_list[gt_i] = True 64 | 65 | TP = len(pairs_list) 66 | 67 | if len(gt_care_list) == 0: 68 | recall = 1.0 69 | precision = 1.0 if len(pred_care_list) == 0 else 0.0 70 | elif len(pred_care_list) == 0: 71 | recall = 0.0 72 | precision = 0.0 73 | else: 74 | recall = 1.0 * TP / len(gt_care_list) 75 | precision = 1.0 * TP / len(pred_care_list) 76 | 77 | f1_score = compute_f1_score(precision, recall) 78 | 79 | return precision, recall, f1_score, TP, len(gt_care_list), len(pred_care_list), pairs_list 80 | 81 | 82 | def evaluate_all(gt_file_dir, gt_img_dir, ckpt_path, gpuid='0'): 83 | db = DB(ckpt_path, gpuid) 84 | 85 | img_list = os.listdir(gt_img_dir) 86 | 87 | show = './eva' 88 | make_dir(show) 89 | 90 | total_TP = 0 91 | total_gt_care_num = 0 92 | 
total_pred_care_num = 0 93 | for img_name in tqdm.tqdm(img_list): 94 | img = cv2.imread(os.path.join(gt_img_dir, img_name)) 95 | 96 | pred_box_list, pred_score_list, _ = db.detect_img(os.path.join(gt_img_dir, img_name), 97 | ispoly=True, 98 | show_res=False) 99 | 100 | gt_file_name = os.path.splitext(img_name)[0] + '.txt' 101 | 102 | gt_boxes, tags = load_ctw1500_labels(os.path.join(gt_file_dir, gt_file_name)) 103 | 104 | gt_care_list = [] 105 | gt_dontcare_list = [] 106 | 107 | for i, box in enumerate(gt_boxes): 108 | box = box.reshape((-1, 2)).tolist() 109 | if tags[i] == False: 110 | gt_care_list.append(box) 111 | else: 112 | gt_dontcare_list.append(box) 113 | 114 | precision, recall, f1_score, TP, gt_care_num, pred_care_num, pairs_list = evaluate(gt_care_list, 115 | gt_dontcare_list, 116 | pred_box_list, 117 | overlap=0.5) 118 | 119 | for pair in pairs_list: 120 | cv2.polylines(img, [np.array(pair['gt'], np.int).reshape([-1, 1, 2])], True, (0, 255, 0)) 121 | cv2.polylines(img, [np.array(pair['pred'], np.int).reshape([-1, 1, 2])], True, (255, 0, 0)) 122 | 123 | cv2.imwrite(os.path.join(show, img_name), img) 124 | 125 | total_TP += TP 126 | total_gt_care_num += gt_care_num 127 | total_pred_care_num += pred_care_num 128 | 129 | total_precision = float(total_TP) / total_pred_care_num 130 | total_recall = float(total_TP) / total_gt_care_num 131 | total_f1_score = compute_f1_score(total_precision, total_recall) 132 | 133 | return total_precision, total_recall, total_f1_score 134 | 135 | if __name__ == '__main__': 136 | 137 | ckpt_path = '/hostpersistent/zzh/lab/DB-tf/ckpt/DB_resnet_v1_50_aspp_model.ckpt-303001' 138 | gt_img_dir = cfg.EVAL.IMG_DIR 139 | gt_file_dir = cfg.EVAL.LABEL_DIR 140 | 141 | precision, recall, f1_score = evaluate_all(gt_file_dir, gt_img_dir, ckpt_path) 142 | print(precision, recall, f1_score) 143 | 144 | 145 | 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /figures/1039_bboxshow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_bboxshow.jpg -------------------------------------------------------------------------------- /figures/1039_binarize_map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_binarize_map.jpg -------------------------------------------------------------------------------- /figures/1039_polyshow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_polyshow.jpg -------------------------------------------------------------------------------- /figures/1039_thresh_binary.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_thresh_binary.jpg -------------------------------------------------------------------------------- /figures/1039_threshold_map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/1039_threshold_map.jpg -------------------------------------------------------------------------------- /figures/bacc.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/bacc.png -------------------------------------------------------------------------------- /figures/bloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/bloss.png -------------------------------------------------------------------------------- /figures/mloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/mloss.png -------------------------------------------------------------------------------- /figures/net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/net.png -------------------------------------------------------------------------------- /figures/org.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/org.jpg -------------------------------------------------------------------------------- /figures/tbacc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/tbacc.png -------------------------------------------------------------------------------- /figures/tbloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/tbloss.png -------------------------------------------------------------------------------- /figures/tloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/tloss.png -------------------------------------------------------------------------------- /figures/ttloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/figures/ttloss.png -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import time 4 | import tqdm 5 | import argparse 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from db_config import cfg 10 | from shapely.geometry import Polygon 11 | from lib.postprocess.post_process import SegDetectorRepresenter 12 | import lib.networks.model as model 13 | 14 | 15 | def get_args(): 16 | parser = argparse.ArgumentParser(description='DB-tf') 17 | parser.add_argument('--ckptpath', default='/hostpersistent/zzh/lab/DB-tf/ckpt/DB_resnet_v1_50_1223_model.ckpt-121201', 18 | type=str, 19 | help='load model') 20 | parser.add_argument('--imgpath', default='/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_image/1012.jpg', 21 | type=str) 22 | parser.add_argument('--gpuid', default='0', 23 | type=str) 24 | parser.add_argument('--ispoly', default=True, 25 | type=bool) 26 | 
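    # note: argparse's type=bool does not parse 'False' from the command line as
    # expected; any non-empty string (including 'False') evaluates to True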
parser.add_argument('--show_res', default=True, 27 | type=bool) 28 | 29 | args = parser.parse_args() 30 | 31 | return args 32 | 33 | def make_dir(dir): 34 | if not os.path.exists(dir): 35 | os.makedirs(dir) 36 | 37 | class DB(): 38 | 39 | def __init__(self, ckpt_path, gpuid='0'): 40 | os.environ['CUDA_VISIBLE_DEVICES'] = gpuid 41 | tf.reset_default_graph() 42 | self._input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images') 43 | global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) 44 | 45 | self._binarize_map, self._threshold_map, self._thresh_binary = model.model(self._input_images, is_training=False) 46 | 47 | variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step) 48 | saver = tf.train.Saver(variable_averages.variables_to_restore()) 49 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) 50 | gpu_config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options, allow_soft_placement=True) 51 | self.sess = tf.Session(config=gpu_config) 52 | saver.restore(self.sess, ckpt_path) 53 | self.decoder = SegDetectorRepresenter() 54 | print('restore model from:', ckpt_path) 55 | 56 | def __del__(self): 57 | self.sess.close() 58 | 59 | def detect_img(self, img_path, ispoly=True, show_res=True): 60 | img = cv2.imread(img_path) 61 | h, w, _ = img.shape 62 | resized_img, ratio, size = self._resize_img(img) 63 | 64 | s = time.time() 65 | binarize_map, threshold_map, thresh_binary = self.sess.run([self._binarize_map, self._threshold_map, self._thresh_binary], 66 | feed_dict={self._input_images: [resized_img]}) 67 | net_time = time.time()-s 68 | 69 | s = time.time() 70 | boxes, scores = self.decoder([resized_img], binarize_map, ispoly) 71 | boxes = boxes[0] 72 | area = h * w 73 | res_boxes = [] 74 | res_scores = [] 75 | for i, box in enumerate(boxes): 76 | box[:, 0] *= ratio[1] 77 | box[:, 1] *= ratio[0] 78 | if Polygon(box).convex_hull.area > cfg.FILTER_MIN_AREA*area: 79 | res_boxes.append(box) 80 | res_scores.append(scores[0][i]) 81 | post_time = time.time()-s 82 | 83 | if show_res: 84 | img_name = os.path.splitext(os.path.split(img_path)[-1])[0] 85 | make_dir('./show') 86 | cv2.imwrite('show/' + img_name + '_binarize_map.jpg', binarize_map[0][0:size[0], 0:size[1], :]*255) 87 | cv2.imwrite('show/' + img_name + '_threshold_map.jpg', threshold_map[0][0:size[0], 0:size[1], :]*255) 88 | cv2.imwrite('show/' + img_name + '_thresh_binary.jpg', thresh_binary[0][0:size[0], 0:size[1], :]*255) 89 | for box in res_boxes: 90 | cv2.polylines(img, [box.astype(np.int).reshape([-1, 1, 2])], True, (0, 255, 0)) 91 | # print(Polygon(box).convex_hull.area, Polygon(box).convex_hull.area/area) 92 | cv2.imwrite('show/' + img_name + '_show.jpg', img) 93 | 94 | return res_boxes, res_scores, (net_time, post_time) 95 | 96 | 97 | def detect_batch(self, batch): 98 | pass 99 | 100 | def _resize_img(self, img, max_size=640): 101 | h, w, _ = img.shape 102 | 103 | ratio = float(max(h, w)) / max_size 104 | 105 | new_h = int((h / ratio // 32) * 32) 106 | new_w = int((w / ratio // 32) * 32) 107 | 108 | resized_img = cv2.resize(img, dsize=(new_w, new_h)) 109 | 110 | input_img = np.zeros([max_size, max_size, 3]) 111 | input_img[0:new_h, 0:new_w, :] = resized_img 112 | 113 | ratio_w = w / new_w 114 | ratio_h = h / new_h 115 | 116 | return input_img, (ratio_h, ratio_w), (new_h, new_w) 117 | 118 | 119 | if __name__ == "__main__": 120 | args = get_args() 121 | 122 | db = DB(args.ckptpath, args.gpuid) 123 | 124 | 
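    # detect_img returns (boxes, scores, (net_time, post_time)): the polygon boxes
    # are already rescaled to the original image size and the times are in seconds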
db.detect_img(args.imgpath, args.ispoly, args.show_res) 125 | 126 | img_list = os.listdir('/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_image/') 127 | 128 | net_all = 0 129 | post_all = 0 130 | pipe_all = 0 131 | 132 | for i in tqdm.tqdm(img_list): 133 | _, _, (net_time, post_time) = db.detect_img(os.path.join('/hostpersistent/zzh/dataset/open_data/ctw1500/test/text_image/',i), args.ispoly, show_res=True) 134 | net_all += net_time 135 | post_all += post_time 136 | pipe_all += (net_time + post_time) 137 | 138 | print('net:', net_all/len(img_list)) 139 | print('post:', post_all/len(img_list)) 140 | print('pipe:', pipe_all/len(img_list)) -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/__init__.py -------------------------------------------------------------------------------- /lib/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/dataset/__init__.py -------------------------------------------------------------------------------- /lib/dataset/__pycache__/data_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/dataset/__pycache__/data_util.cpython-36.pyc -------------------------------------------------------------------------------- /lib/dataset/__pycache__/data_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/dataset/__pycache__/data_utils.cpython-36.pyc -------------------------------------------------------------------------------- /lib/dataset/__pycache__/dataload.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamrishab/DB-tf/5f62e068ca23a4afb026359b25e914de5d0c852e/lib/dataset/__pycache__/dataload.cpython-36.pyc -------------------------------------------------------------------------------- /lib/dataset/dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import tqdm 4 | import time 5 | import random 6 | import numpy as np 7 | 8 | from db_config import cfg 9 | from lib.dataset.label_maker import make_border_map, make_score_map 10 | from lib.dataset.generator_enqueuer import GeneratorEnqueuer 11 | from lib.dataset.img_aug import crop_area, det_aug 12 | from lib.utils import resize_img, load_ctw1500_labels, load_icdar_labels 13 | 14 | def load_labels(gt_path, data_name='ctw1500'): 15 | if data_name == 'ctw1500': 16 | return load_ctw1500_labels(gt_path) 17 | elif data_name == 'icdar': 18 | return load_icdar_labels 19 | 20 | def make_train_labels(polys, tags, h, w): 21 | """ 22 | 23 | :param polys: numpy [N, 2] 24 | :param tags: 25 | :param h: 26 | :param w: 27 | :return: 28 | """ 29 | 30 | threshold_map, thresh_mask = make_border_map(polys, tags, h, w) 31 | score_map, score_mask = make_score_map(polys, tags, h, w) 32 | 33 | return score_map, score_mask, threshold_map, thresh_mask 34 | 35 | def generator(batchsize, img_dir, label_dir, 
random_scale=np.array(cfg.TRAIN.IMG_SCALE), is_eval=False): 36 | 37 | img_list = os.listdir(img_dir) 38 | 39 | epoch = 0 40 | while True: 41 | train_imgs = [] 42 | train_score_maps = [] 43 | train_socre_masks = [] 44 | train_thresh_maps = [] 45 | train_thresh_masks = [] 46 | 47 | np.random.shuffle(img_list) 48 | 49 | for img_name in img_list: 50 | try: 51 | img_path = os.path.join(img_dir, img_name) 52 | label_path = os.path.join(label_dir, os.path.splitext(img_name)[0] + '.txt') 53 | 54 | img_input = np.zeros([cfg.TRAIN.IMG_SIZE, cfg.TRAIN.IMG_SIZE, 3], dtype=np.float32) 55 | 56 | img = cv2.imread(img_path)[:,:, ::-1] 57 | img, (ratio_h, ratio_w) = resize_img(img, cfg.TRAIN.IMG_SIZE) 58 | 59 | if random.random() < cfg.TRAIN.DATA_AUG_PROB and not is_eval: 60 | img = det_aug(img) 61 | 62 | polys, tags = load_labels(label_path) 63 | polys[:, :, 0] *= ratio_w 64 | polys[:, :, 1] *= ratio_h 65 | 66 | if (random.random() < cfg.TRAIN.CROP_PROB) and (not is_eval): 67 | img, polys, tags = crop_area(img, polys, tags) 68 | img, (ratio_h, ratio_w) = resize_img(img, cfg.TRAIN.IMG_SIZE) 69 | polys[:, :, 0] *= ratio_w 70 | polys[:, :, 1] *= ratio_h 71 | 72 | h, w, _ = img.shape 73 | img_input[:h, :w, :] = img 74 | h, w, _ = img_input.shape 75 | 76 | score_map, score_mask, threshold_map, thresh_mask = make_train_labels(polys, tags, h, w) 77 | 78 | train_imgs.append(img_input) 79 | train_score_maps.append(score_map[:, :, np.newaxis]) 80 | train_socre_masks.append(score_mask[:, :, np.newaxis]) 81 | train_thresh_maps.append(threshold_map[:, :, np.newaxis]) 82 | train_thresh_masks.append(thresh_mask[:, :, np.newaxis]) 83 | 84 | if len(train_imgs) == batchsize: 85 | if is_eval: 86 | yield train_imgs, train_score_maps, train_socre_masks, train_thresh_maps, train_thresh_masks, epoch 87 | else: 88 | yield train_imgs, train_score_maps, train_socre_masks, train_thresh_maps, train_thresh_masks 89 | train_imgs = [] 90 | train_score_maps = [] 91 | train_socre_masks = [] 92 | train_thresh_maps = [] 93 | train_thresh_masks = [] 94 | 95 | except Exception as e: 96 | import traceback 97 | traceback.print_exc() 98 | print(img_path) 99 | # print(polys[0]) 100 | # img_input = img_input.astype(np.int) 101 | # for poly in polys: 102 | # poly = np.array(poly, dtype=np.int) 103 | # cv2.polylines(img_input, [poly.reshape((-1, 1, 2))], True, (0, 255, 0)) 104 | # cv2.imwrite(img_name, img_input) 105 | continue 106 | epoch += 1 107 | 108 | 109 | def get_batch(num_workers, **kwargs): 110 | try: 111 | enqueuer = GeneratorEnqueuer(generator(**kwargs), use_multiprocessing=True) 112 | print('Generator use 10 batches for buffering, this may take a while, you can tune this yourself.') 113 | enqueuer.start(max_queue_size=10, workers=num_workers) 114 | generator_output = None 115 | while True: 116 | while enqueuer.is_running(): 117 | if not enqueuer.queue.empty(): 118 | generator_output = enqueuer.queue.get() 119 | break 120 | else: 121 | time.sleep(0.01) 122 | yield generator_output 123 | generator_output = None 124 | finally: 125 | if enqueuer is not None: 126 | enqueuer.stop() 127 | 128 | 129 | if __name__ =='__main__': 130 | img_dir = '/Users/zhangzihao/AI/research/datasets/ctw1500/train/text_image' 131 | label_dir = '/Users/zhangzihao/AI/research/datasets/ctw1500/train/text_label_curve' 132 | 133 | 134 | img_list = os.listdir(img_dir) 135 | label_list = os.listdir(label_dir) 136 | # np.random.shuffle(img_list) 137 | print(img_list[0]) 138 | img = cv2.imread(os.path.join(img_dir, img_list[0])) 139 | h, w, _ = img.shape 140 | polys, tags 
= load_labels(os.path.join(label_dir, os.path.splitext(img_list[0])[0] + '.txt')) 141 | threshold_map, thresh_mask = make_border_map(polys, tags, h, w) 142 | score_map, score_mask = make_score_map(polys, tags, h, w) 143 | 144 | # 145 | # for poly in polys: 146 | # poly = np.array(poly, dtype=np.int) 147 | # cv2.polylines(img, [poly.reshape((-1, 1, 2))], True, (0, 255, 0)) 148 | # 149 | # 150 | # threshold_map, thresh_mask = make_border_map(polys, tags, h, w) 151 | # 152 | # s = time.time() 153 | # score_map, score_mask = make_score_map(polys, tags, h, w) 154 | # print(time.time()-s) 155 | # 156 | # cv2.imwrite('s.jpg', score_map*255) 157 | # cv2.imwrite('t.jpg', threshold_map*255) 158 | # cv2.imwrite('sm.jpg', score_mask*255) 159 | # 160 | # cv2.imwrite('o.jpg', img) 161 | -------------------------------------------------------------------------------- /lib/dataset/generator_enqueuer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | this file is modified from keras implemention of data process multi-threading, 3 | see https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py 4 | ''' 5 | import time 6 | import numpy as np 7 | import threading 8 | import multiprocessing 9 | try: 10 | import queue 11 | except ImportError: 12 | import Queue as queue 13 | 14 | 15 | class GeneratorEnqueuer(): 16 | """Builds a queue out of a data generator. 17 | 18 | Used in `fit_generator`, `evaluate_generator`, `predict_generator`. 19 | 20 | # Arguments 21 | generator: a generator function which endlessly yields data 22 | use_multiprocessing: use multiprocessing if True, otherwise threading 23 | wait_time: time to sleep in-between calls to `put()` 24 | random_seed: Initial seed for workers, 25 | will be incremented by one for each workers. 26 | """ 27 | 28 | def __init__(self, generator, 29 | use_multiprocessing=False, 30 | wait_time=0.05, 31 | random_seed=None): 32 | self.wait_time = wait_time 33 | self._generator = generator 34 | self._use_multiprocessing = use_multiprocessing 35 | self._threads = [] 36 | self._stop_event = None 37 | self.queue = None 38 | self.random_seed = random_seed 39 | 40 | def start(self, workers=1, max_queue_size=10): 41 | """Kicks off threads which add data from the generator into the queue. 
42 | 43 | # Arguments 44 | workers: number of worker threads 45 | max_queue_size: queue size 46 | (when full, threads could block on `put()`) 47 | """ 48 | 49 | def data_generator_task(): 50 | while not self._stop_event.is_set(): 51 | try: 52 | if self._use_multiprocessing or self.queue.qsize() < max_queue_size: 53 | generator_output = next(self._generator) 54 | self.queue.put(generator_output) 55 | else: 56 | time.sleep(self.wait_time) 57 | except Exception: 58 | self._stop_event.set() 59 | raise 60 | 61 | try: 62 | if self._use_multiprocessing: 63 | self.queue = multiprocessing.Queue(maxsize=max_queue_size) 64 | self._stop_event = multiprocessing.Event() 65 | else: 66 | self.queue = queue.Queue() 67 | self._stop_event = threading.Event() 68 | 69 | for _ in range(workers): 70 | if self._use_multiprocessing: 71 | # Reset random seed else all children processes 72 | # share the same seed 73 | np.random.seed(self.random_seed) 74 | thread = multiprocessing.Process(target=data_generator_task) 75 | thread.daemon = True 76 | if self.random_seed is not None: 77 | self.random_seed += 1 78 | else: 79 | thread = threading.Thread(target=data_generator_task) 80 | self._threads.append(thread) 81 | thread.start() 82 | except: 83 | self.stop() 84 | raise 85 | 86 | def is_running(self): 87 | return self._stop_event is not None and not self._stop_event.is_set() 88 | 89 | def stop(self, timeout=None): 90 | """Stops running threads and wait for them to exit, if necessary. 91 | 92 | Should be called by the same thread which called `start()`. 93 | 94 | # Arguments 95 | timeout: maximum time to wait on `thread.join()`. 96 | """ 97 | if self.is_running(): 98 | self._stop_event.set() 99 | 100 | for thread in self._threads: 101 | if thread.is_alive(): 102 | if self._use_multiprocessing: 103 | thread.terminate() 104 | else: 105 | thread.join(timeout) 106 | 107 | if self._use_multiprocessing: 108 | if self.queue is not None: 109 | self.queue.close() 110 | 111 | self._threads = [] 112 | self._stop_event = None 113 | self.queue = None 114 | 115 | def get(self): 116 | """Creates a generator to extract data from the queue. 117 | 118 | Skip the data if it is `None`. 
119 | 120 | # Returns 121 | A generator 122 | """ 123 | while self.is_running(): 124 | if not self.queue.empty(): 125 | inputs = self.queue.get() 126 | if inputs is not None: 127 | yield inputs 128 | else: 129 | time.sleep(self.wait_time) -------------------------------------------------------------------------------- /lib/dataset/img_aug.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import json 4 | import cv2 5 | import random 6 | import numpy as np 7 | import imageio 8 | import imgaug as ia 9 | import imgaug.augmenters as iaa 10 | from imgaug.augmentables.polys import Polygon 11 | from db_config import cfg 12 | 13 | 14 | def crop_area(im, polys, tags, crop_background=False, max_tries=50): 15 | ''' 16 | make random crop from the input image 17 | :param im: 18 | :param polys: 19 | :param tags: 20 | :param crop_background: 21 | :param max_tries: 22 | :return: 23 | ''' 24 | h, w, _ = im.shape 25 | pad_h = h // 10 26 | pad_w = w // 10 27 | h_array = np.zeros((h + pad_h * 2), dtype=np.int32) 28 | w_array = np.zeros((w + pad_w * 2), dtype=np.int32) 29 | for poly in polys: 30 | poly = np.round(poly, decimals=0).astype(np.int32) 31 | minx = np.min(poly[:, 0]) 32 | maxx = np.max(poly[:, 0]) 33 | w_array[minx + pad_w:maxx + pad_w] = 1 34 | miny = np.min(poly[:, 1]) 35 | maxy = np.max(poly[:, 1]) 36 | h_array[miny + pad_h:maxy + pad_h] = 1 37 | # ensure the cropped area not across a text 38 | h_axis = np.where(h_array == 0)[0] 39 | w_axis = np.where(w_array == 0)[0] 40 | if len(h_axis) == 0 or len(w_axis) == 0: 41 | return im, polys, tags 42 | for i in range(max_tries): 43 | xx = np.random.choice(w_axis, size=2) 44 | xmin = np.min(xx) - pad_w 45 | xmax = np.max(xx) - pad_w 46 | xmin = np.clip(xmin, 0, w - 1) 47 | xmax = np.clip(xmax, 0, w - 1) 48 | yy = np.random.choice(h_axis, size=2) 49 | ymin = np.min(yy) - pad_h 50 | ymax = np.max(yy) - pad_h 51 | ymin = np.clip(ymin, 0, h - 1) 52 | ymax = np.clip(ymax, 0, h - 1) 53 | 54 | if xmax - xmin < cfg.TRAIN.MIN_CROP_SIDE_RATIO * w or \ 55 | ymax - ymin < cfg.TRAIN.MIN_CROP_SIDE_RATIO * h: 56 | continue 57 | 58 | if polys.shape[0] != 0: 59 | poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \ 60 | & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax) 61 | selected_polys = np.where(np.sum(poly_axis_in_area, axis=1) == 4)[0] 62 | else: 63 | selected_polys = [] 64 | if len(selected_polys) == 0: 65 | # no text in this area 66 | if crop_background: 67 | return im[ymin:ymax + 1, xmin:xmax + 1, :], polys[selected_polys], tags[selected_polys] 68 | else: 69 | continue 70 | im = im[ymin:ymax + 1, xmin:xmax + 1, :] 71 | polys = polys[selected_polys] 72 | tags = tags[selected_polys] 73 | polys[:, :, 0] -= xmin 74 | polys[:, :, 1] -= ymin 75 | return im, polys, tags 76 | 77 | return im, polys, tags 78 | 79 | 80 | 81 | def det_aug(image, polys_np=None): 82 | """ 83 | 随机对图像做以下的增强操作 84 | :param image: cv2 read 85 | :param polys_np:[N, 4, 2] 86 | :return: 87 | """ 88 | aug_sample = random.sample(cfg.TRAIN.AUG_TOOL, 1)[0] #从数组中随机取出一个增强的功能 89 | 90 | ###################################################################################################### 91 | # blur-模糊 92 | aug = None 93 | # 高斯滤波 sigma 为1-10的保留小数点后一位的float的随机值,可根据情况调整 94 | if aug_sample == 'GaussianBlur': 95 | sigma = random.uniform(1, 2) 96 | sigma = round(8, 10) 97 | aug = iaa.GaussianBlur(sigma) 98 | 99 | # 平均模糊 k 为1-10的随机 奇 数,范围根据情况调整 100 | if aug_sample == 'AverageBlur': 101 | k = random.randint(8, 
10) * 2 + 1 102 | aug = iaa.AverageBlur(k) 103 | 104 | # 中值滤波 k 为1-10的随机 奇 数,范围根据情况调整 105 | if aug_sample == 'MedianBlur': 106 | k = random.randint(8, 10) * 2 + 1 107 | aug = iaa.MedianBlur(k) 108 | 109 | # 双边滤波 d=1 为 奇 数, sigma_color=(10, 250), sigma_space=(10, 250) 110 | if aug_sample == 'BilateralBlur': 111 | d = random.randint(0, 2) * 2 + 1 112 | sigma_color = random.randint(10, 250) 113 | sigma_space = random.randint(10, 250) 114 | aug = iaa.BilateralBlur(d, sigma_color, sigma_space) 115 | 116 | # 运动模糊 k=5 一定大于3 的 奇 数, angle=(0, 360), direction=(-1.0, 1.0) 117 | if aug_sample == 'MotionBlur': 118 | k = random.randint(15, 20) * 2 + 1 119 | angle = random.randint(0, 360) 120 | direction = random.uniform(-1, 1) 121 | direction = round(direction, 1) 122 | aug = iaa.MotionBlur(k, angle, direction) 123 | 124 | ###################################################################################################### 125 | # geometric 几何学 126 | 127 | # 弹性变换 128 | if aug_sample == 'ElasticTransformation': 129 | alpha = random.uniform(10, 20) 130 | alpha = round(alpha, 1) 131 | sigma = random.uniform(5, 10) 132 | sigma = round(sigma, 1) 133 | # print(alpha, sigma) 134 | aug = iaa.ElasticTransformation(alpha, sigma) 135 | 136 | # 透视 137 | if aug_sample == 'PerspectiveTransform': 138 | scale = random.uniform(0, 0.2) 139 | scale = round(scale, 3) 140 | aug = iaa.PerspectiveTransform(scale) 141 | 142 | # 旋转角度 143 | # if aug_sample == 'Affine_rot': 144 | # rotate = random.randint(-20, 20) 145 | # aug = iaa.Affine(rotate=rotate) 146 | 147 | # 缩放 148 | # if aug_sample == 'Affine_scale': 149 | # scale = random.uniform(0, 2) 150 | # scale = round(scale, 1) 151 | # aug = iaa.Affine(scale=scale) 152 | ###################################################################################################### 153 | # flip 镜像 154 | 155 | # 水平镜像 156 | # if aug_sample == 'Fliplr': 157 | # aug = iaa.Fliplr(1) 158 | # 159 | # 垂直镜像 160 | # if aug_sample == 'Flipud': 161 | # aug = iaa.Flipud(1) 162 | 163 | ###################################################################################################### 164 | # size 尺寸 165 | 166 | # if aug_sample == 'CropAndPad': 167 | # top = random.randint(0, 10) 168 | # right = random.randint(0, 10) 169 | # bottom = random.randint(0, 10) 170 | # left = random.randint(0, 10) 171 | # aug = iaa.CropAndPad(px=(top, right, bottom, left)) # 上 右 下 左 各crop多少像素,然后进行padding 172 | 173 | if aug_sample == 'Crop': 174 | top = random.randint(0, 10) 175 | right = random.randint(0, 10) 176 | bottom = random.randint(0, 10) 177 | left = random.randint(0, 10) 178 | aug = iaa.Crop(px=(top, right, bottom, left)) # 上 右 下 左 179 | 180 | if aug_sample == 'Pad': 181 | top = random.randint(0, 10) 182 | right = random.randint(0, 10) 183 | bottom = random.randint(0, 10) 184 | left = random.randint(0, 10) 185 | aug = iaa.Pad(px=(top, right, bottom, left)) # 上 右 下 左 186 | 187 | # if aug_sample == 'PadToFixedSize': 188 | # height = image.shape[0] + 32 189 | # width = image.shape[1] + 100 190 | # aug = iaa.PadToFixedSize(width=width, height=height)z 191 | 192 | # if aug_sample == 'CropToFixedSize': 193 | # height = image.shape[0] - 32 194 | # width = image.shape[1] - 100 195 | # aug = iaa.CropToFixedSize(width=width, height=height) 196 | 197 | if polys_np is not None: 198 | if aug is not None: 199 | # print(aug_sample) 200 | h, w, _ = image.shape 201 | boxes_info_list = [] 202 | for box in polys_np: 203 | boxes_info_list.append(Polygon(box)) 204 | 205 | psoi = ia.PolygonsOnImage(boxes_info_list, shape=image.shape) # 
生成单个图像上所有多边形的对象 206 | image, psoi_aug = aug(image=image, polygons=psoi) 207 | 208 | pts_list = [] 209 | for each_poly in psoi_aug.polygons: 210 | pts_list.append(np.array(each_poly.exterior).reshape((4, 2))) 211 | return image, np.array(pts_list, np.float32).reshape((-1, 4, 2)) 212 | else: 213 | 214 | return image, polys_np 215 | else: 216 | image = aug(image=image) 217 | return image 218 | -------------------------------------------------------------------------------- /lib/dataset/label_maker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from shapely.geometry import Polygon 5 | import pyclipper 6 | from db_config import cfg 7 | 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | def _distance(xs, ys, point_1, point_2): 12 | ''' 13 | compute the distance from point to a line 14 | ys: coordinates in the first axis 15 | xs: coordinates in the second axis 16 | point_1, point_2: (x, y), the end of the line 17 | ''' 18 | square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1]) 19 | square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1]) 20 | square_distance = np.square(point_1[0] - point_2[0]) + np.square(point_1[1] - point_2[1]) 21 | 22 | cosin = (square_distance - square_distance_1 - square_distance_2) / (2 * np.sqrt(square_distance_1 * square_distance_2)) 23 | square_sin = 1 - np.square(cosin) 24 | square_sin = np.nan_to_num(square_sin) 25 | 26 | result = np.sqrt(square_distance_1 * square_distance_2 * square_sin / square_distance) 27 | result[cosin < 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin < 0] 28 | return result 29 | 30 | def _extend_line(point1, point2, result): 31 | ex_point_1 = (int(round(point1[0] + (point1[0] - point2[0]) * (1 + cfg.SHRINK_RATIO))), 32 | int(round(point1[1] + (point1[1] - point2[1]) * (1 + cfg.SHRINK_RATIO)))) 33 | cv2.line(result, tuple(ex_point_1), tuple(point1), 4096.0, 1, lineType=cv2.LINE_AA, shift=0) 34 | ex_point_2 = (int(round(point2[0] + (point2[0] - point1[0]) * (1 + cfg.SHRINK_RATIO))), 35 | int(round(point2[1] + (point2[1] - point1[1]) * (1 + cfg.SHRINK_RATIO)))) 36 | cv2.line(result, tuple(ex_point_2), tuple(point2), 4096.0, 1, lineType=cv2.LINE_AA, shift=0) 37 | return ex_point_1, ex_point_2 38 | 39 | def _validate_polygons(polys, tags, h, w): 40 | 41 | if len(polys) == 0: 42 | return polys, tags 43 | for poly in polys: 44 | poly[:, 0] = np.clip(poly[:, 0], 0, w - 1) 45 | poly[:, 1] = np.clip(poly[:, 1], 0, h - 1) 46 | 47 | for i in range(len(polys)): 48 | area = Polygon(polys[i]).convex_hull.area 49 | # area = _polygon_area(polys[i]) 50 | # if abs(area) < 1: 51 | # tags[i] = True 52 | # if area > 0: 53 | # polys[i] = polys[i][::-1, :] 54 | if area <= cfg.TRAIN.MIN_AREA: 55 | tags[i] = True 56 | return polys, tags 57 | 58 | def _polygon_area(poly): 59 | edge = 0 60 | for i in range(poly.shape[0]): 61 | next_index = (i + 1) % poly.shape[0] 62 | edge += (poly[next_index, 0] - poly[i, 0]) * (poly[next_index, 1] - poly[i, 1]) 63 | return edge / 2. 
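
# Both make_score_map (shrink) and _draw_border_map (dilate) below offset each polygon
# by the distance D = A * (1 - r^2) / L, with A the polygon area, L its perimeter and
# r = cfg.SHRINK_RATIO, as in the DB paper. A toy sketch of that offset step
# (illustration only, the box and ratio are made-up values):
#
#   import pyclipper
#   from shapely.geometry import Polygon
#   box = [(0, 0), (100, 0), (100, 40), (0, 40)]
#   r = 0.4
#   d = Polygon(box).area * (1 - r ** 2) / Polygon(box).length
#   pco = pyclipper.PyclipperOffset()
#   pco.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
#   shrunk = pco.Execute(-d)    # the score map shrinks (-d); the border map dilates (+d)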
64 | 65 | 66 | def make_score_map(text_polys, tags, h, w): 67 | min_text_size = cfg.TRAIN.MIN_TEXT_SIZE 68 | shrink_ratio = cfg.SHRINK_RATIO 69 | 70 | text_polys, ignore_tags = _validate_polygons(text_polys, tags, h, w) 71 | score_map = np.zeros((h, w), dtype=np.float32) 72 | mask = np.ones((h, w), dtype=np.float32) 73 | 74 | for i in range(len(text_polys)): 75 | polygon = text_polys[i] 76 | height = max(polygon[:, 1]) - min(polygon[:, 1]) 77 | width = max(polygon[:, 0]) - min(polygon[:, 0]) 78 | if ignore_tags[i] or min(height, width) < min_text_size: 79 | cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0) 80 | ignore_tags[i] = True 81 | else: 82 | polygon_shape = Polygon(polygon) 83 | distance = polygon_shape.area * (1 - np.power(shrink_ratio, 2)) / polygon_shape.length 84 | subject = [tuple(l) for l in text_polys[i]] 85 | padding = pyclipper.PyclipperOffset() 86 | padding.AddPath(subject, pyclipper.JT_ROUND, 87 | pyclipper.ET_CLOSEDPOLYGON) 88 | shrinked = padding.Execute(-distance) 89 | if shrinked == []: 90 | cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0) 91 | ignore_tags[i] = True 92 | continue 93 | shrinked = np.array(shrinked[0]).reshape(-1, 2) 94 | cv2.fillPoly(score_map, [shrinked.astype(np.int32)], 1) 95 | 96 | return score_map, mask 97 | 98 | def make_border_map(text_polys, tags, h, w): 99 | 100 | canvas = np.zeros([h, w], dtype=np.float32) 101 | mask = np.zeros([h, w], dtype=np.float32) 102 | 103 | for i in range(len(text_polys)): 104 | if tags[i]: 105 | continue 106 | canvas, mask = _draw_border_map(text_polys[i], canvas, mask) 107 | threshold_map = canvas * (cfg.THRESH_MAX - cfg.THRESH_MIN) + cfg.THRESH_MIN 108 | 109 | return threshold_map, mask 110 | 111 | def _draw_border_map(poly, canvas, mask): 112 | poly = np.array(poly).copy() 113 | assert poly.ndim == 2 114 | assert poly.shape[1] == 2 115 | 116 | poly_shape = Polygon(poly) 117 | if poly_shape.area <= 0: 118 | return 119 | distance = poly_shape.area * (1 - np.power(cfg.SHRINK_RATIO, 2)) / poly_shape.length 120 | subject = [tuple(l) for l in poly] 121 | padding = pyclipper.PyclipperOffset() 122 | padding.AddPath(subject, pyclipper.JT_ROUND, 123 | pyclipper.ET_CLOSEDPOLYGON) 124 | 125 | padded_polygon = np.array(padding.Execute(distance)[0]) 126 | cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0) 127 | 128 | xmin = padded_polygon[:, 0].min() 129 | xmax = padded_polygon[:, 0].max() 130 | ymin = padded_polygon[:, 1].min() 131 | ymax = padded_polygon[:, 1].max() 132 | width = xmax - xmin + 1 133 | height = ymax - ymin + 1 134 | 135 | poly[:, 0] = poly[:, 0] - xmin 136 | poly[:, 1] = poly[:, 1] - ymin 137 | 138 | xs = np.broadcast_to( 139 | np.linspace(0, width - 1, num=width).reshape(1, width), (height, width)) 140 | ys = np.broadcast_to( 141 | np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width)) 142 | 143 | distance_map = np.zeros( 144 | (poly.shape[0], height, width), dtype=np.float32) 145 | for i in range(poly.shape[0]): 146 | j = (i + 1) % poly.shape[0] 147 | absolute_distance = _distance(xs, ys, poly[i], poly[j]) 148 | distance_map[i] = np.clip(absolute_distance / distance, 0, 1) 149 | distance_map = distance_map.min(axis=0) 150 | 151 | xmin_valid = min(max(0, xmin), canvas.shape[1] - 1) 152 | xmax_valid = min(max(0, xmax), canvas.shape[1] - 1) 153 | ymin_valid = min(max(0, ymin), canvas.shape[0] - 1) 154 | ymax_valid = min(max(0, ymax), canvas.shape[0] - 1) 155 | # print(xmin_valid, xmax_valid, ymin_valid, ymax_valid) 156 | # print(xmin, xmax, ymin, 
ymax) 157 | # print(distance_map.shape) 158 | # print(distance_map[ 159 | # ymin_valid - ymin:ymax_valid - ymax + height, 160 | # xmin_valid - xmin:xmax_valid - xmax + width].shape) 161 | canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax( 162 | 1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height, xmin_valid - xmin:xmax_valid - xmax + width], 163 | canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]) 164 | 165 | return canvas, mask -------------------------------------------------------------------------------- /lib/networks/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from db_config import cfg 3 | 4 | 5 | def dice_coefficient_loss(y_true_cls, y_pred_cls, 6 | training_mask): 7 | ''' 8 | dice loss 9 | :param y_true_cls: 10 | :param y_pred_cls: 11 | :param training_mask: 12 | :return: 13 | ''' 14 | eps = 1e-6 15 | intersection = tf.reduce_sum(y_true_cls * y_pred_cls * training_mask) 16 | union = tf.reduce_sum(y_true_cls * training_mask) + tf.reduce_sum(y_pred_cls * training_mask) + eps 17 | loss = 1. - (2 * intersection / union) 18 | return loss 19 | 20 | 21 | def balance_cross_entropy_loss(gt, pred, mask, 22 | negative_ratio=3.0, eps=1e-6): 23 | positive = gt * mask 24 | negative = (1 - gt) * mask 25 | positive_count = tf.reduce_sum(positive) 26 | negative_count = tf.minimum(tf.reduce_sum(negative), positive_count * negative_ratio) 27 | negative_count = tf.cast(negative_count, tf.int32) 28 | gt = tf.reshape(gt, [-1, 1]) 29 | pred = tf.reshape(pred, [-1, 1]) 30 | cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=gt, logits=pred) 31 | positive_loss = cross_entropy * positive 32 | negative_loss = cross_entropy * negative 33 | negative_loss, _ = tf.nn.top_k(tf.reshape(negative_loss, [-1]), negative_count) 34 | 35 | negative_count = tf.cast(negative_count, tf.float32) 36 | balance_loss = (tf.reduce_sum(positive_loss) + tf.reduce_sum(negative_loss)) / (positive_count + negative_count + eps) 37 | 38 | return balance_loss 39 | 40 | def softmax_cross_entropy_loss(y_true_cls, y_pred_cls, training_mask): 41 | ''' 42 | softmax_cross_entropy(SCE) loss 43 | :param y_true_cls:[bs,w,h,N] 44 | :param y_pred_cls:[bs,w,h,N] 45 | :param training_mask: 46 | :return: 47 | ''' 48 | re_mask = 1 - training_mask 49 | zero_mask = tf.zeros(tf.shape(re_mask)) 50 | add_mask = tf.concat((re_mask, zero_mask, zero_mask), axis=3) 51 | 52 | y_true_cls = y_true_cls * training_mask + add_mask 53 | y_pred_cls = y_pred_cls * training_mask + add_mask 54 | 55 | y_true_cls = tf.reshape(y_true_cls, [-1, tf.shape(y_true_cls)[-1]]) 56 | y_pred_cls = tf.reshape(y_pred_cls, [-1, tf.shape(y_true_cls)[-1]]) 57 | 58 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_true_cls, logits=y_pred_cls) 59 | cls_loss = tf.reduce_mean(cross_entropy) 60 | 61 | return cls_loss 62 | 63 | def l1_loss(pred, gt, mask): 64 | 65 | loss = tf.reduce_mean(tf.abs(pred - gt) * mask) + 1e-6 66 | 67 | return loss 68 | 69 | 70 | def smooth_l1_loss(pred, gt, mask, sigma=1.0): 71 | ''' 72 | 73 | :param pred: 74 | :param gt: shape is same as pred 75 | :param sigma: 76 | :return: 77 | ''' 78 | sigma2 = sigma**2 79 | 80 | diff = pred * mask - gt 81 | 82 | with tf.name_scope('smooth_l1_loss'): 83 | deltas_abs = tf.abs(diff) 84 | smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0 / sigma2), tf.float32) 85 | return tf.reduce_mean(tf.square(diff) * 0.5 * sigma2 * smoothL1_sign + \ 86 | (deltas_abs - 0.5 / sigma2) * 
tf.abs(smoothL1_sign - 1)) 87 | 88 | def compute_cls_acc(pred, gt, mask): 89 | 90 | zero = tf.zeros_like(pred, tf.float32) 91 | one = tf.ones_like(pred, tf.float32) 92 | 93 | pred = tf.where(pred < 0.3, x=zero, y=one) 94 | acc = tf.reduce_mean(tf.cast(tf.equal(pred * mask, gt * mask), tf.float32)) 95 | 96 | return acc 97 | 98 | 99 | def compute_loss(binarize_map, threshold_map, thresh_binary, 100 | gt_score_maps, gt_threshold_map, gt_score_mask, gt_thresh_mask): 101 | 102 | binarize_loss = dice_coefficient_loss(gt_score_maps, binarize_map, gt_score_mask) 103 | threshold_loss = l1_loss(threshold_map, gt_threshold_map, gt_thresh_mask) 104 | thresh_binary_loss = dice_coefficient_loss(gt_score_maps, thresh_binary, gt_score_mask) 105 | 106 | model_loss = cfg.TRAIN.LOSS_ALPHA * binarize_loss + cfg.TRAIN.LOSS_BETA * threshold_loss + thresh_binary_loss 107 | 108 | tf.summary.scalar('losses/binarize_loss', binarize_loss) 109 | tf.summary.scalar('losses/threshold_loss', threshold_loss) 110 | tf.summary.scalar('losses/thresh_binary_loss', thresh_binary_loss) 111 | return model_loss 112 | 113 | def compute_acc(binarize_map, threshold_map, thresh_binary, 114 | gt_score_maps, gt_threshold_map, gt_score_mask, gt_thresh_mask): 115 | binarize_acc = compute_cls_acc(binarize_map, gt_score_maps, gt_score_mask) 116 | thresh_binary_acc = compute_cls_acc(thresh_binary, gt_score_maps, gt_score_mask) 117 | 118 | tf.summary.scalar('acc/binarize_acc', binarize_acc) 119 | tf.summary.scalar('acc/thresh_binary_acc', thresh_binary_acc) 120 | 121 | return binarize_acc, thresh_binary_acc 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /lib/networks/mobilenet/conv_blocks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Convolution blocks for mobilenet.""" 16 | import contextlib 17 | import functools 18 | 19 | import tensorflow as tf 20 | from tensorflow.contrib import slim as contrib_slim 21 | 22 | slim = contrib_slim 23 | 24 | 25 | def _fixed_padding(inputs, kernel_size, rate=1): 26 | """Pads the input along the spatial dimensions independently of input size. 27 | Pads the input such that if it was used in a convolution with 'VALID' padding, 28 | the output would have the same dimensions as if the unpadded input was used 29 | in a convolution with 'SAME' padding. 30 | Args: 31 | inputs: A tensor of size [batch, height_in, width_in, channels]. 32 | kernel_size: The kernel to be used in the conv2d or max_pool2d operation. 33 | rate: An integer, rate for atrous convolution. 34 | Returns: 35 | output: A tensor of size [batch, height_out, width_out, channels] with the 36 | input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). 
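  For example, kernel_size=[3, 3] with rate=1 pads the input by one pixel on each
  side, and rate=2 pads it by two pixels on each side.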
37 | """ 38 | kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), 39 | kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] 40 | pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] 41 | pad_beg = [pad_total[0] // 2, pad_total[1] // 2] 42 | pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] 43 | padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], 44 | [pad_beg[1], pad_end[1]], [0, 0]]) 45 | return padded_inputs 46 | 47 | 48 | def _make_divisible(v, divisor, min_value=None): 49 | if min_value is None: 50 | min_value = divisor 51 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 52 | # Make sure that round down does not go down by more than 10%. 53 | if new_v < 0.9 * v: 54 | new_v += divisor 55 | return new_v 56 | 57 | 58 | def _split_divisible(num, num_ways, divisible_by=8): 59 | """Evenly splits num, num_ways so each piece is a multiple of divisible_by.""" 60 | assert num % divisible_by == 0 61 | assert num / num_ways >= divisible_by 62 | # Note: want to round down, we adjust each split to match the total. 63 | base = num // num_ways // divisible_by * divisible_by 64 | result = [] 65 | accumulated = 0 66 | for i in range(num_ways): 67 | r = base 68 | while accumulated + r < num * (i + 1) / num_ways: 69 | r += divisible_by 70 | result.append(r) 71 | accumulated += r 72 | assert accumulated == num 73 | return result 74 | 75 | 76 | @contextlib.contextmanager 77 | def _v1_compatible_scope_naming(scope): 78 | if scope is None: # Create uniqified separable blocks. 79 | with tf.variable_scope(None, default_name='separable') as s, \ 80 | tf.name_scope(s.original_name_scope): 81 | yield '' 82 | else: 83 | # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts. 84 | # which provide numbered scopes. 85 | scope += '_' 86 | yield scope 87 | 88 | 89 | @slim.add_arg_scope 90 | def split_separable_conv2d(input_tensor, 91 | num_outputs, 92 | scope=None, 93 | normalizer_fn=None, 94 | stride=1, 95 | rate=1, 96 | endpoints=None, 97 | use_explicit_padding=False): 98 | """Separable mobilenet V1 style convolution. 99 | Depthwise convolution, with default non-linearity, 100 | followed by 1x1 depthwise convolution. This is similar to 101 | slim.separable_conv2d, but differs in tha it applies batch 102 | normalization and non-linearity to depthwise. This matches 103 | the basic building of Mobilenet Paper 104 | (https://arxiv.org/abs/1704.04861) 105 | Args: 106 | input_tensor: input 107 | num_outputs: number of outputs 108 | scope: optional name of the scope. Note if provided it will use 109 | scope_depthwise for deptwhise, and scope_pointwise for pointwise. 110 | normalizer_fn: which normalizer function to use for depthwise/pointwise 111 | stride: stride 112 | rate: output rate (also known as dilation rate) 113 | endpoints: optional, if provided, will export additional tensors to it. 114 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 115 | inputs so that the output dimensions are the same as if 'SAME' padding 116 | were used. 
117 | Returns: 118 | output tesnor 119 | """ 120 | 121 | with _v1_compatible_scope_naming(scope) as scope: 122 | dw_scope = scope + 'depthwise' 123 | endpoints = endpoints if endpoints is not None else {} 124 | kernel_size = [3, 3] 125 | padding = 'SAME' 126 | if use_explicit_padding: 127 | padding = 'VALID' 128 | input_tensor = _fixed_padding(input_tensor, kernel_size, rate) 129 | net = slim.separable_conv2d( 130 | input_tensor, 131 | None, 132 | kernel_size, 133 | depth_multiplier=1, 134 | stride=stride, 135 | rate=rate, 136 | normalizer_fn=normalizer_fn, 137 | padding=padding, 138 | scope=dw_scope) 139 | 140 | endpoints[dw_scope] = net 141 | 142 | pw_scope = scope + 'pointwise' 143 | net = slim.conv2d( 144 | net, 145 | num_outputs, [1, 1], 146 | stride=1, 147 | normalizer_fn=normalizer_fn, 148 | scope=pw_scope) 149 | endpoints[pw_scope] = net 150 | return net 151 | 152 | 153 | def expand_input_by_factor(n, divisible_by=8): 154 | return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by) 155 | 156 | 157 | def split_conv(input_tensor, 158 | num_outputs, 159 | num_ways, 160 | scope, 161 | divisible_by=8, 162 | **kwargs): 163 | """Creates a split convolution. 164 | Split convolution splits the input and output into 165 | 'num_blocks' blocks of approximately the same size each, 166 | and only connects $i$-th input to $i$ output. 167 | Args: 168 | input_tensor: input tensor 169 | num_outputs: number of output filters 170 | num_ways: num blocks to split by. 171 | scope: scope for all the operators. 172 | divisible_by: make sure that every part is divisiable by this. 173 | **kwargs: will be passed directly into conv2d operator 174 | Returns: 175 | tensor 176 | """ 177 | b = input_tensor.get_shape().as_list()[3] 178 | 179 | if num_ways == 1 or min(b // num_ways, 180 | num_outputs // num_ways) < divisible_by: 181 | # Don't do any splitting if we end up with less than 8 filters 182 | # on either side. 183 | return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs) 184 | 185 | outs = [] 186 | input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by) 187 | output_splits = _split_divisible( 188 | num_outputs, num_ways, divisible_by=divisible_by) 189 | inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope) 190 | base = scope 191 | for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)): 192 | scope = base + '_part_%d' % (i,) 193 | n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs) 194 | n = tf.identity(n, scope + '_output') 195 | outs.append(n) 196 | return tf.concat(outs, 3, name=scope + '_concat') 197 | 198 | 199 | @slim.add_arg_scope 200 | def expanded_conv(input_tensor, 201 | num_outputs, 202 | expansion_size=expand_input_by_factor(6), 203 | stride=1, 204 | rate=1, 205 | kernel_size=(3, 3), 206 | residual=True, 207 | normalizer_fn=None, 208 | split_projection=1, 209 | split_expansion=1, 210 | split_divisible_by=8, 211 | expansion_transform=None, 212 | depthwise_location='expansion', 213 | depthwise_channel_multiplier=1, 214 | endpoints=None, 215 | use_explicit_padding=False, 216 | padding='SAME', 217 | inner_activation_fn=None, 218 | depthwise_activation_fn=None, 219 | project_activation_fn=tf.identity, 220 | depthwise_fn=slim.separable_conv2d, 221 | expansion_fn=split_conv, 222 | projection_fn=split_conv, 223 | scope=None): 224 | """Depthwise Convolution Block with expansion. 
225 | Builds a composite convolution that has the following structure 226 | expansion (1x1) -> depthwise (kernel_size) -> projection (1x1) 227 | Args: 228 | input_tensor: input 229 | num_outputs: number of outputs in the final layer. 230 | expansion_size: the size of expansion, could be a constant or a callable. 231 | If latter it will be provided 'num_inputs' as an input. For forward 232 | compatibility it should accept arbitrary keyword arguments. 233 | Default will expand the input by factor of 6. 234 | stride: depthwise stride 235 | rate: depthwise rate 236 | kernel_size: depthwise kernel 237 | residual: whether to include residual connection between input 238 | and output. 239 | normalizer_fn: batchnorm or otherwise 240 | split_projection: how many ways to split projection operator 241 | (that is conv expansion->bottleneck) 242 | split_expansion: how many ways to split expansion op 243 | (that is conv bottleneck->expansion) ops will keep depth divisible 244 | by this value. 245 | split_divisible_by: make sure every split group is divisible by this number. 246 | expansion_transform: Optional function that takes expansion 247 | as a single input and returns output. 248 | depthwise_location: where to put depthwise covnvolutions supported 249 | values None, 'input', 'output', 'expansion' 250 | depthwise_channel_multiplier: depthwise channel multiplier: 251 | each input will replicated (with different filters) 252 | that many times. So if input had c channels, 253 | output will have c x depthwise_channel_multpilier. 254 | endpoints: An optional dictionary into which intermediate endpoints are 255 | placed. The keys "expansion_output", "depthwise_output", 256 | "projection_output" and "expansion_transform" are always populated, even 257 | if the corresponding functions are not invoked. 258 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 259 | inputs so that the output dimensions are the same as if 'SAME' padding 260 | were used. 261 | padding: Padding type to use if `use_explicit_padding` is not set. 262 | inner_activation_fn: activation function to use in all inner convolutions. 263 | If none, will rely on slim default scopes. 264 | depthwise_activation_fn: activation function to use for deptwhise only. 265 | If not provided will rely on slim default scopes. If both 266 | inner_activation_fn and depthwise_activation_fn are provided, 267 | depthwise_activation_fn takes precedence over inner_activation_fn. 268 | project_activation_fn: activation function for the project layer. 269 | (note this layer is not affected by inner_activation_fn) 270 | depthwise_fn: Depthwise convolution function. 271 | expansion_fn: Expansion convolution function. If use custom function then 272 | "split_expansion" and "split_divisible_by" will be ignored. 273 | projection_fn: Projection convolution function. If use custom function then 274 | "split_projection" and "split_divisible_by" will be ignored. 275 | scope: optional scope. 
276 | Returns: 277 | Tensor of depth num_outputs 278 | Raises: 279 | TypeError: on inval 280 | """ 281 | conv_defaults = {} 282 | dw_defaults = {} 283 | if inner_activation_fn is not None: 284 | conv_defaults['activation_fn'] = inner_activation_fn 285 | dw_defaults['activation_fn'] = inner_activation_fn 286 | if depthwise_activation_fn is not None: 287 | dw_defaults['activation_fn'] = depthwise_activation_fn 288 | # pylint: disable=g-backslash-continuation 289 | with tf.variable_scope(scope, default_name='expanded_conv') as s, \ 290 | tf.name_scope(s.original_name_scope), \ 291 | slim.arg_scope((slim.conv2d,), **conv_defaults), \ 292 | slim.arg_scope((slim.separable_conv2d,), **dw_defaults): 293 | prev_depth = input_tensor.get_shape().as_list()[3] 294 | if depthwise_location not in [None, 'input', 'output', 'expansion']: 295 | raise TypeError('%r is unknown value for depthwise_location' % 296 | depthwise_location) 297 | if use_explicit_padding: 298 | if padding != 'SAME': 299 | raise TypeError('`use_explicit_padding` should only be used with ' 300 | '"SAME" padding.') 301 | padding = 'VALID' 302 | depthwise_func = functools.partial( 303 | depthwise_fn, 304 | num_outputs=None, 305 | kernel_size=kernel_size, 306 | depth_multiplier=depthwise_channel_multiplier, 307 | stride=stride, 308 | rate=rate, 309 | normalizer_fn=normalizer_fn, 310 | padding=padding, 311 | scope='depthwise') 312 | # b1 -> b2 * r -> b2 313 | # i -> (o * r) (bottleneck) -> o 314 | input_tensor = tf.identity(input_tensor, 'input') 315 | net = input_tensor 316 | 317 | if depthwise_location == 'input': 318 | if use_explicit_padding: 319 | net = _fixed_padding(net, kernel_size, rate) 320 | net = depthwise_func(net, activation_fn=None) 321 | net = tf.identity(net, name='depthwise_output') 322 | if endpoints is not None: 323 | endpoints['depthwise_output'] = net 324 | 325 | if callable(expansion_size): 326 | inner_size = expansion_size(num_inputs=prev_depth) 327 | else: 328 | inner_size = expansion_size 329 | 330 | if inner_size > net.shape[3]: 331 | if expansion_fn == split_conv: 332 | expansion_fn = functools.partial( 333 | expansion_fn, 334 | num_ways=split_expansion, 335 | divisible_by=split_divisible_by, 336 | stride=1) 337 | net = expansion_fn( 338 | net, 339 | inner_size, 340 | scope='expand', 341 | normalizer_fn=normalizer_fn) 342 | net = tf.identity(net, 'expansion_output') 343 | if endpoints is not None: 344 | endpoints['expansion_output'] = net 345 | 346 | if depthwise_location == 'expansion': 347 | if use_explicit_padding: 348 | net = _fixed_padding(net, kernel_size, rate) 349 | net = depthwise_func(net) 350 | net = tf.identity(net, name='depthwise_output') 351 | if endpoints is not None: 352 | endpoints['depthwise_output'] = net 353 | 354 | if expansion_transform: 355 | net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor) 356 | # Note in contrast with expansion, we always have 357 | # projection to produce the desired output size. 
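# The 1x1 projection below is the last stage of this block's
# expansion -> depthwise -> projection structure. It is linear by default
# (project_activation_fn=tf.identity), i.e. the "linear bottleneck" of
# MobileNet V2: with 32 input channels and the default expansion factor of 6
# the block computes 32 -> 192 (1x1 expand) -> 192 (3x3 depthwise) -> 32
# (1x1 project), and the residual connection at the end of the function is
# only added when stride == 1 and the input/output depths match.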
358 | if projection_fn == split_conv: 359 | projection_fn = functools.partial( 360 | projection_fn, 361 | num_ways=split_projection, 362 | divisible_by=split_divisible_by, 363 | stride=1) 364 | net = projection_fn( 365 | net, 366 | num_outputs, 367 | scope='project', 368 | normalizer_fn=normalizer_fn, 369 | activation_fn=project_activation_fn) 370 | if endpoints is not None: 371 | endpoints['projection_output'] = net 372 | if depthwise_location == 'output': 373 | if use_explicit_padding: 374 | net = _fixed_padding(net, kernel_size, rate) 375 | net = depthwise_func(net, activation_fn=None) 376 | net = tf.identity(net, name='depthwise_output') 377 | if endpoints is not None: 378 | endpoints['depthwise_output'] = net 379 | 380 | if callable(residual): # custom residual 381 | net = residual(input_tensor=input_tensor, output_tensor=net) 382 | elif (residual and 383 | # stride check enforces that we don't add residuals when spatial 384 | # dimensions are None 385 | stride == 1 and 386 | # Depth matches 387 | net.get_shape().as_list()[3] == 388 | input_tensor.get_shape().as_list()[3]): 389 | net += input_tensor 390 | return tf.identity(net, name='output') 391 | 392 | 393 | @slim.add_arg_scope 394 | def squeeze_excite(input_tensor, 395 | divisible_by=8, 396 | squeeze_factor=3, 397 | inner_activation_fn=tf.nn.relu, 398 | gating_fn=tf.sigmoid, 399 | squeeze_input_tensor=None, 400 | pool=None): 401 | """Squeeze excite block for Mobilenet V3. 402 | Args: 403 | input_tensor: input tensor to apply SE block to. 404 | divisible_by: ensures all inner dimensions are divisible by this number. 405 | squeeze_factor: the factor of squeezing in the inner fully connected layer 406 | inner_activation_fn: non-linearity to be used in inner layer. 407 | gating_fn: non-linearity to be used for final gating function 408 | squeeze_input_tensor: custom tensor to use for computing gating activation. 409 | If provided the result will be input_tensor * SE(squeeze_input_tensor) 410 | instead of input_tensor * SE(input_tensor). 411 | pool: if number is provided will average pool with that kernel size 412 | to compute inner tensor, followed by bilinear upsampling. 413 | Returns: 414 | Gated input_tensor. (e.g. X * SE(X)) 415 | """ 416 | with tf.variable_scope('squeeze_excite'): 417 | if squeeze_input_tensor is None: 418 | squeeze_input_tensor = input_tensor 419 | input_size = input_tensor.shape.as_list()[1:3] 420 | pool_height, pool_width = squeeze_input_tensor.shape.as_list()[1:3] 421 | stride = 1 422 | if pool is not None and pool_height >= pool: 423 | pool_height, pool_width, stride = pool, pool, pool 424 | input_channels = squeeze_input_tensor.shape.as_list()[3] 425 | output_channels = input_tensor.shape.as_list()[3] 426 | squeeze_channels = _make_divisible( 427 | input_channels / squeeze_factor, divisor=divisible_by) 428 | 429 | pooled = tf.nn.avg_pool(squeeze_input_tensor, 430 | (1, pool_height, pool_width, 1), 431 | strides=(1, stride, stride, 1), 432 | padding='VALID') 433 | squeeze = slim.conv2d( 434 | pooled, 435 | kernel_size=(1, 1), 436 | num_outputs=squeeze_channels, 437 | normalizer_fn=None, 438 | activation_fn=inner_activation_fn) 439 | excite_outputs = output_channels 440 | excite = slim.conv2d(squeeze, num_outputs=excite_outputs, 441 | kernel_size=[1, 1], 442 | normalizer_fn=None, 443 | activation_fn=gating_fn) 444 | if pool is not None: 445 | # Note: As of 03/20/2019 only BILINEAR (the default) with 446 | # align_corners=True has gradients implemented in TPU. 
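# When a pool size was given, the gate above was computed on an average-pooled
# grid, so it is bilinearly resized back to the input resolution here before
# the elementwise multiply; without pooling, excite is already 1x1xC and
# gates input_tensor by broadcasting.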
447 | excite = tf.image.resize_images( 448 | excite, input_size, 449 | align_corners=True) 450 | result = input_tensor * excite 451 | return result -------------------------------------------------------------------------------- /lib/networks/mobilenet/mobilenet.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Mobilenet Base Class.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | import collections 21 | import contextlib 22 | import copy 23 | import os 24 | 25 | import tensorflow as tf 26 | from tensorflow.contrib import slim as contrib_slim 27 | 28 | slim = contrib_slim 29 | 30 | 31 | @slim.add_arg_scope 32 | def apply_activation(x, name=None, activation_fn=None): 33 | return activation_fn(x, name=name) if activation_fn else x 34 | 35 | 36 | def _fixed_padding(inputs, kernel_size, rate=1): 37 | """Pads the input along the spatial dimensions independently of input size. 38 | 39 | Pads the input such that if it was used in a convolution with 'VALID' padding, 40 | the output would have the same dimensions as if the unpadded input was used 41 | in a convolution with 'SAME' padding. 42 | 43 | Args: 44 | inputs: A tensor of size [batch, height_in, width_in, channels]. 45 | kernel_size: The kernel to be used in the conv2d or max_pool2d operation. 46 | rate: An integer, rate for atrous convolution. 47 | 48 | Returns: 49 | output: A tensor of size [batch, height_out, width_out, channels] with the 50 | input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). 51 | """ 52 | kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), 53 | kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] 54 | pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] 55 | pad_beg = [pad_total[0] // 2, pad_total[1] // 2] 56 | pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] 57 | padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], 58 | [pad_beg[1], pad_end[1]], [0, 0]]) 59 | return padded_inputs 60 | 61 | 62 | def _make_divisible(v, divisor, min_value=None): 63 | if min_value is None: 64 | min_value = divisor 65 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 66 | # Make sure that round down does not go down by more than 10%. 67 | if new_v < 0.9 * v: 68 | new_v += divisor 69 | return int(new_v) 70 | 71 | 72 | @contextlib.contextmanager 73 | def _set_arg_scope_defaults(defaults): 74 | """Sets arg scope defaults for all items present in defaults. 75 | 76 | Args: 77 | defaults: dictionary/list of pairs, containing a mapping from 78 | function to a dictionary of default args. 79 | 80 | Yields: 81 | context manager where all defaults are set. 
82 | """ 83 | if hasattr(defaults, 'items'): 84 | items = list(defaults.items()) 85 | else: 86 | items = defaults 87 | if not items: 88 | yield 89 | else: 90 | func, default_arg = items[0] 91 | with slim.arg_scope(func, **default_arg): 92 | with _set_arg_scope_defaults(items[1:]): 93 | yield 94 | 95 | 96 | @slim.add_arg_scope 97 | def depth_multiplier(output_params, 98 | multiplier, 99 | divisible_by=8, 100 | min_depth=8, 101 | **unused_kwargs): 102 | if 'num_outputs' not in output_params: 103 | return 104 | d = output_params['num_outputs'] 105 | output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by, 106 | min_depth) 107 | 108 | 109 | _Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func']) 110 | 111 | 112 | def op(opfunc, multiplier_func=depth_multiplier, **params): 113 | multiplier = params.pop('multiplier_transform', multiplier_func) 114 | return _Op(opfunc, params=params, multiplier_func=multiplier) 115 | 116 | 117 | class NoOpScope(object): 118 | """No-op context manager.""" 119 | 120 | def __enter__(self): 121 | return None 122 | 123 | def __exit__(self, exc_type, exc_value, traceback): 124 | return False 125 | 126 | 127 | def safe_arg_scope(funcs, **kwargs): 128 | """Returns `slim.arg_scope` with all None arguments removed. 129 | 130 | Arguments: 131 | funcs: Functions to pass to `arg_scope`. 132 | **kwargs: Arguments to pass to `arg_scope`. 133 | 134 | Returns: 135 | arg_scope or No-op context manager. 136 | 137 | Note: can be useful if None value should be interpreted as "do not overwrite 138 | this parameter value". 139 | """ 140 | filtered_args = {name: value for name, value in kwargs.items() 141 | if value is not None} 142 | if filtered_args: 143 | return slim.arg_scope(funcs, **filtered_args) 144 | else: 145 | return NoOpScope() 146 | 147 | 148 | @slim.add_arg_scope 149 | def mobilenet_base( # pylint: disable=invalid-name 150 | inputs, 151 | conv_defs, 152 | multiplier=1.0, 153 | final_endpoint=None, 154 | output_stride=None, 155 | use_explicit_padding=False, 156 | scope=None, 157 | is_training=False): 158 | """Mobilenet base network. 159 | 160 | Constructs a network from inputs to the given final endpoint. By default 161 | the network is constructed in inference mode. To create network 162 | in training mode use: 163 | 164 | with slim.arg_scope(mobilenet.training_scope()): 165 | logits, endpoints = mobilenet_base(...) 166 | 167 | Args: 168 | inputs: a tensor of shape [batch_size, height, width, channels]. 169 | conv_defs: A list of op(...) layers specifying the net architecture. 170 | multiplier: Float multiplier for the depth (number of channels) 171 | for all convolution ops. The value must be greater than zero. Typical 172 | usage will be to set this value in (0, 1) to reduce the number of 173 | parameters or computation cost of the model. 174 | final_endpoint: The name of last layer, for early termination for 175 | for V1-based networks: last layer is "layer_14", for V2: "layer_20" 176 | output_stride: An integer that specifies the requested ratio of input to 177 | output spatial resolution. If not None, then we invoke atrous convolution 178 | if necessary to prevent the network from reducing the spatial resolution 179 | of the activation maps. Allowed values are 1 or any even number, excluding 180 | zero. Typical values are 8 (accurate fully convolutional mode), 16 181 | (fast fully convolutional mode), and 32 (classification mode). 
182 | 183 | NOTE- output_stride relies on all consequent operators to support dilated 184 | operators via "rate" parameter. This might require wrapping non-conv 185 | operators to operate properly. 186 | 187 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 188 | inputs so that the output dimensions are the same as if 'SAME' padding 189 | were used. 190 | scope: optional variable scope. 191 | is_training: How to setup batch_norm and other ops. Note: most of the time 192 | this does not need be set directly. Use mobilenet.training_scope() to set 193 | up training instead. This parameter is here for backward compatibility 194 | only. It is safe to set it to the value matching 195 | training_scope(is_training=...). It is also safe to explicitly set 196 | it to False, even if there is outer training_scope set to to training. 197 | (The network will be built in inference mode). If this is set to None, 198 | no arg_scope is added for slim.batch_norm's is_training parameter. 199 | 200 | Returns: 201 | tensor_out: output tensor. 202 | end_points: a set of activations for external use, for example summaries or 203 | losses. 204 | 205 | Raises: 206 | ValueError: depth_multiplier <= 0, or the target output_stride is not 207 | allowed. 208 | """ 209 | if multiplier <= 0: 210 | raise ValueError('multiplier is not greater than zero.') 211 | 212 | # Set conv defs defaults and overrides. 213 | conv_defs_defaults = conv_defs.get('defaults', {}) 214 | conv_defs_overrides = conv_defs.get('overrides', {}) 215 | if use_explicit_padding: 216 | conv_defs_overrides = copy.deepcopy(conv_defs_overrides) 217 | conv_defs_overrides[ 218 | (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'} 219 | 220 | if output_stride is not None: 221 | if output_stride == 0 or (output_stride > 1 and output_stride % 2): 222 | raise ValueError('Output stride must be None, 1 or a multiple of 2.') 223 | 224 | # a) Set the tensorflow scope 225 | # b) set padding to default: note we might consider removing this 226 | # since it is also set by mobilenet_scope 227 | # c) set all defaults 228 | # d) set all extra overrides. 229 | # pylint: disable=g-backslash-continuation 230 | with _scope_all(scope, default_scope='Mobilenet'), \ 231 | safe_arg_scope([slim.batch_norm], is_training=is_training), \ 232 | _set_arg_scope_defaults(conv_defs_defaults), \ 233 | _set_arg_scope_defaults(conv_defs_overrides): 234 | # The current_stride variable keeps track of the output stride of the 235 | # activations, i.e., the running product of convolution strides up to the 236 | # current network layer. This allows us to invoke atrous convolution 237 | # whenever applying the next convolution would result in the activations 238 | # having output stride larger than the target output_stride. 239 | current_stride = 1 240 | 241 | # The atrous convolution rate parameter. 242 | rate = 1 243 | 244 | net = inputs 245 | # Insert default parameters before the base scope which includes 246 | # any custom overrides set in mobilenet. 247 | end_points = {} 248 | scopes = {} 249 | for i, opdef in enumerate(conv_defs['spec']): 250 | params = dict(opdef.params) 251 | opdef.multiplier_func(params, multiplier) 252 | stride = params.get('stride', 1) 253 | if output_stride is not None and current_stride == output_stride: 254 | # If we have reached the target output_stride, then we need to employ 255 | # atrous convolution with stride=1 and multiply the atrous rate by the 256 | # current unit's stride for use in subsequent layers. 
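# Example of this bookkeeping: with output_stride=16, once the running
# product of strides reaches 16 every remaining stride-2 op is applied with
# stride 1 and its stride is folded into the atrous rate instead
# (rate: 1 -> 2 -> 4 ...), so the feature map stops shrinking while the
# receptive field keeps growing.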
257 | layer_stride = 1 258 | layer_rate = rate 259 | rate *= stride 260 | else: 261 | layer_stride = stride 262 | layer_rate = 1 263 | current_stride *= stride 264 | # Update params. 265 | params['stride'] = layer_stride 266 | # Only insert rate to params if rate > 1 and kernel size is not [1, 1]. 267 | if layer_rate > 1: 268 | if tuple(params.get('kernel_size', [])) != (1, 1): 269 | # We will apply atrous rate in the following cases: 270 | # 1) When kernel_size is not in params, the operation then uses 271 | # default kernel size 3x3. 272 | # 2) When kernel_size is in params, and if the kernel_size is not 273 | # equal to (1, 1) (there is no need to apply atrous convolution to 274 | # any 1x1 convolution). 275 | params['rate'] = layer_rate 276 | # Set padding 277 | if use_explicit_padding: 278 | if 'kernel_size' in params: 279 | net = _fixed_padding(net, params['kernel_size'], layer_rate) 280 | else: 281 | params['use_explicit_padding'] = True 282 | 283 | end_point = 'layer_%d' % (i + 1) 284 | try: 285 | net = opdef.op(net, **params) 286 | except Exception: 287 | print('Failed to create op %i: %r params: %r' % (i, opdef, params)) 288 | raise 289 | end_points[end_point] = net 290 | scope = os.path.dirname(net.name) 291 | scopes[scope] = end_point 292 | if final_endpoint is not None and end_point == final_endpoint: 293 | break 294 | 295 | # Add all tensors that end with 'output' to 296 | # endpoints 297 | for t in net.graph.get_operations(): 298 | scope = os.path.dirname(t.name) 299 | bn = os.path.basename(t.name) 300 | if scope in scopes and t.name.endswith('output'): 301 | end_points[scopes[scope] + '/' + bn] = t.outputs[0] 302 | return net, end_points 303 | 304 | 305 | @contextlib.contextmanager 306 | def _scope_all(scope, default_scope=None): 307 | with tf.variable_scope(scope, default_name=default_scope) as s,\ 308 | tf.name_scope(s.original_name_scope): 309 | yield s 310 | 311 | 312 | @slim.add_arg_scope 313 | def mobilenet(inputs, 314 | num_classes=1001, 315 | prediction_fn=slim.softmax, 316 | reuse=None, 317 | scope='Mobilenet', 318 | base_only=False, 319 | **mobilenet_args): 320 | """Mobilenet model for classification, supports both V1 and V2. 321 | 322 | Note: default mode is inference, use mobilenet.training_scope to create 323 | training network. 324 | 325 | 326 | Args: 327 | inputs: a tensor of shape [batch_size, height, width, channels]. 328 | num_classes: number of predicted classes. If 0 or None, the logits layer 329 | is omitted and the input features to the logits layer (before dropout) 330 | are returned instead. 331 | prediction_fn: a function to get predictions out of logits 332 | (default softmax). 333 | reuse: whether or not the network and its variables should be reused. To be 334 | able to reuse 'scope' must be given. 335 | scope: Optional variable_scope. 336 | base_only: if True will only create the base of the network (no pooling 337 | and no logits). 338 | **mobilenet_args: passed to mobilenet_base verbatim. 339 | - conv_defs: list of conv defs 340 | - multiplier: Float multiplier for the depth (number of channels) 341 | for all convolution ops. The value must be greater than zero. Typical 342 | usage will be to set this value in (0, 1) to reduce the number of 343 | parameters or computation cost of the model. 344 | - output_stride: will ensure that the last layer has at most total stride. 345 | If the architecture calls for more stride than that provided 346 | (e.g. 
output_stride=16, but the architecture has 5 stride=2 operators), 347 | it will replace output_stride with fractional convolutions using Atrous 348 | Convolutions. 349 | 350 | Returns: 351 | logits: the pre-softmax activations, a tensor of size 352 | [batch_size, num_classes] 353 | end_points: a dictionary from components of the network to the corresponding 354 | activation tensor. 355 | 356 | Raises: 357 | ValueError: Input rank is invalid. 358 | """ 359 | is_training = mobilenet_args.get('is_training', False) 360 | input_shape = inputs.get_shape().as_list() 361 | if len(input_shape) != 4: 362 | raise ValueError('Expected rank 4 input, was: %d' % len(input_shape)) 363 | 364 | with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as scope: 365 | inputs = tf.identity(inputs, 'input') 366 | net, end_points = mobilenet_base(inputs, scope=scope, **mobilenet_args) 367 | if base_only: 368 | return net, end_points 369 | 370 | net = tf.identity(net, name='embedding') 371 | 372 | with tf.variable_scope('Logits'): 373 | net = global_pool(net) 374 | end_points['global_pool'] = net 375 | if not num_classes: 376 | return net, end_points 377 | net = slim.dropout(net, scope='Dropout', is_training=is_training) 378 | # 1 x 1 x num_classes 379 | # Note: legacy scope name. 380 | logits = slim.conv2d( 381 | net, 382 | num_classes, [1, 1], 383 | activation_fn=None, 384 | normalizer_fn=None, 385 | biases_initializer=tf.zeros_initializer(), 386 | scope='Conv2d_1c_1x1') 387 | 388 | logits = tf.squeeze(logits, [1, 2]) 389 | 390 | logits = tf.identity(logits, name='output') 391 | end_points['Logits'] = logits 392 | if prediction_fn: 393 | end_points['Predictions'] = prediction_fn(logits, 'Predictions') 394 | return logits, end_points 395 | 396 | 397 | def global_pool(input_tensor, pool_op=tf.nn.avg_pool): 398 | """Applies avg pool to produce 1x1 output. 399 | 400 | NOTE: This function is funcitonally equivalenet to reduce_mean, but it has 401 | baked in average pool which has better support across hardware. 402 | 403 | Args: 404 | input_tensor: input tensor 405 | pool_op: pooling op (avg pool is default) 406 | Returns: 407 | a tensor batch_size x 1 x 1 x depth. 408 | """ 409 | shape = input_tensor.get_shape().as_list() 410 | if shape[1] is None or shape[2] is None: 411 | kernel_size = tf.convert_to_tensor( 412 | [1, tf.shape(input_tensor)[1], 413 | tf.shape(input_tensor)[2], 1]) 414 | else: 415 | kernel_size = [1, shape[1], shape[2], 1] 416 | output = pool_op( 417 | input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID') 418 | # Recover output shape, for unknown shape. 419 | output.set_shape([None, 1, 1, None]) 420 | return output 421 | 422 | 423 | def training_scope(is_training=True, 424 | weight_decay=0.00004, 425 | stddev=0.09, 426 | dropout_keep_prob=0.8, 427 | bn_decay=0.997): 428 | """Defines Mobilenet training scope. 429 | 430 | Usage: 431 | with tf.contrib.slim.arg_scope(mobilenet.training_scope()): 432 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 433 | 434 | # the network created will be trainble with dropout/batch norm 435 | # initialized appropriately. 436 | Args: 437 | is_training: if set to False this will ensure that all customizations are 438 | set to non-training mode. This might be helpful for code that is reused 439 | across both training/evaluation, but most of the time training_scope with 440 | value False is not needed. If this is set to None, the parameters is not 441 | added to the batch_norm arg_scope. 
442 | 443 | weight_decay: The weight decay to use for regularizing the model. 444 | stddev: Standard deviation for initialization, if negative uses xavier. 445 | dropout_keep_prob: dropout keep probability (not set if equals to None). 446 | bn_decay: decay for the batch norm moving averages (not set if equals to 447 | None). 448 | 449 | Returns: 450 | An argument scope to use via arg_scope. 451 | """ 452 | # Note: do not introduce parameters that would change the inference 453 | # model here (for example whether to use bias), modify conv_def instead. 454 | batch_norm_params = { 455 | 'decay': bn_decay, 456 | 'is_training': is_training 457 | } 458 | if stddev < 0: 459 | weight_intitializer = slim.initializers.xavier_initializer() 460 | else: 461 | weight_intitializer = tf.truncated_normal_initializer(stddev=stddev) 462 | 463 | # Set weight_decay for weights in Conv and FC layers. 464 | with slim.arg_scope( 465 | [slim.conv2d, slim.fully_connected, slim.separable_conv2d], 466 | weights_initializer=weight_intitializer, 467 | normalizer_fn=slim.batch_norm), \ 468 | slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training),\ 469 | safe_arg_scope([slim.batch_norm], **batch_norm_params), \ 470 | safe_arg_scope([slim.dropout], is_training=is_training, 471 | keep_prob=dropout_keep_prob), \ 472 | slim.arg_scope([slim.conv2d], \ 473 | weights_regularizer=slim.l2_regularizer(weight_decay)), \ 474 | slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s: 475 | return s -------------------------------------------------------------------------------- /lib/networks/mobilenet/mobilenet_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implementation of Mobilenet V2. 16 | 17 | Architecture: https://arxiv.org/abs/1801.04381 18 | 19 | The base model gives 72.2% accuracy on ImageNet, with 300MMadds, 20 | 3.4 M parameters. 21 | """ 22 | 23 | from __future__ import absolute_import 24 | from __future__ import division 25 | from __future__ import print_function 26 | 27 | import copy 28 | import functools 29 | 30 | import tensorflow as tf 31 | from tensorflow.contrib import layers as contrib_layers 32 | from tensorflow.contrib import slim as contrib_slim 33 | 34 | from lib.networks.mobilenet import conv_blocks as ops 35 | from lib.networks.mobilenet import mobilenet as lib 36 | 37 | slim = contrib_slim 38 | op = lib.op 39 | 40 | expand_input = ops.expand_input_by_factor 41 | 42 | # pyformat: disable 43 | # Architecture: https://arxiv.org/abs/1801.04381 44 | V2_DEF = dict( 45 | defaults={ 46 | # Note: these parameters of batch norm affect the architecture 47 | # that's why they are here and not in training_scope. 
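# V2_DEF has two parts: this 'defaults' dict maps op types to the default
# arguments applied via arg_scope inside mobilenet_base, and 'spec' below is
# the ordered op list that makes up the body: a stride-2 stem conv, 17
# inverted-residual expanded_conv blocks and a final 1x1 conv to 1280
# channels.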
48 | (slim.batch_norm,): {'center': True, 'scale': True}, 49 | (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { 50 | 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 51 | }, 52 | (ops.expanded_conv,): { 53 | 'expansion_size': expand_input(6), 54 | 'split_expansion': 1, 55 | 'normalizer_fn': slim.batch_norm, 56 | 'residual': True 57 | }, 58 | (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} 59 | }, 60 | spec=[ 61 | op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]), 62 | op(ops.expanded_conv, 63 | expansion_size=expand_input(1, divisible_by=1), 64 | num_outputs=16), 65 | op(ops.expanded_conv, stride=2, num_outputs=24), 66 | op(ops.expanded_conv, stride=1, num_outputs=24), 67 | op(ops.expanded_conv, stride=2, num_outputs=32), 68 | op(ops.expanded_conv, stride=1, num_outputs=32), 69 | op(ops.expanded_conv, stride=1, num_outputs=32), 70 | op(ops.expanded_conv, stride=2, num_outputs=64), 71 | op(ops.expanded_conv, stride=1, num_outputs=64), 72 | op(ops.expanded_conv, stride=1, num_outputs=64), 73 | op(ops.expanded_conv, stride=1, num_outputs=64), 74 | op(ops.expanded_conv, stride=1, num_outputs=96), 75 | op(ops.expanded_conv, stride=1, num_outputs=96), 76 | op(ops.expanded_conv, stride=1, num_outputs=96), 77 | op(ops.expanded_conv, stride=2, num_outputs=160), 78 | op(ops.expanded_conv, stride=1, num_outputs=160), 79 | op(ops.expanded_conv, stride=1, num_outputs=160), 80 | op(ops.expanded_conv, stride=1, num_outputs=320), 81 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) 82 | ], 83 | ) 84 | # pyformat: enable 85 | 86 | # Mobilenet v2 Definition with group normalization. 87 | V2_DEF_GROUP_NORM = copy.deepcopy(V2_DEF) 88 | V2_DEF_GROUP_NORM['defaults'] = { 89 | (contrib_slim.conv2d, contrib_slim.fully_connected, 90 | contrib_slim.separable_conv2d): { 91 | 'normalizer_fn': contrib_layers.group_norm, # pylint: disable=C0330 92 | 'activation_fn': tf.nn.relu6, # pylint: disable=C0330 93 | }, # pylint: disable=C0330 94 | (ops.expanded_conv,): { 95 | 'expansion_size': ops.expand_input_by_factor(6), 96 | 'split_expansion': 1, 97 | 'normalizer_fn': contrib_layers.group_norm, 98 | 'residual': True 99 | }, 100 | (contrib_slim.conv2d, contrib_slim.separable_conv2d): { 101 | 'padding': 'SAME' 102 | } 103 | } 104 | 105 | 106 | @slim.add_arg_scope 107 | def mobilenet(input_tensor, 108 | num_classes=1001, 109 | depth_multiplier=1.0, 110 | scope='MobilenetV2', 111 | conv_defs=None, 112 | finegrain_classification_mode=False, 113 | min_depth=None, 114 | divisible_by=None, 115 | activation_fn=None, 116 | **kwargs): 117 | """Creates mobilenet V2 network. 118 | 119 | Inference mode is created by default. To create training use training_scope 120 | below. 121 | 122 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): 123 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 124 | 125 | Args: 126 | input_tensor: The input tensor 127 | num_classes: number of classes 128 | depth_multiplier: The multiplier applied to scale number of 129 | channels in each layer. 130 | scope: Scope of the operator 131 | conv_defs: Allows to override default conv def. 132 | finegrain_classification_mode: When set to True, the model 133 | will keep the last layer large even for small multipliers. Following 134 | https://arxiv.org/abs/1801.04381 135 | suggests that it improves performance for ImageNet-type of problems. 136 | *Note* ignored if final_endpoint makes the builder exit earlier. 
137 | min_depth: If provided, will ensure that all layers will have that 138 | many channels after application of depth multiplier. 139 | divisible_by: If provided will ensure that all layers # channels 140 | will be divisible by this number. 141 | activation_fn: Activation function to use, defaults to tf.nn.relu6 if not 142 | specified. 143 | **kwargs: passed directly to mobilenet.mobilenet: 144 | prediction_fn- what prediction function to use. 145 | reuse-: whether to reuse variables (if reuse set to true, scope 146 | must be given). 147 | Returns: 148 | logits/endpoints pair 149 | 150 | Raises: 151 | ValueError: On invalid arguments 152 | """ 153 | if conv_defs is None: 154 | conv_defs = V2_DEF 155 | if 'multiplier' in kwargs: 156 | raise ValueError('mobilenetv2 doesn\'t support generic ' 157 | 'multiplier parameter use "depth_multiplier" instead.') 158 | if finegrain_classification_mode: 159 | conv_defs = copy.deepcopy(conv_defs) 160 | if depth_multiplier < 1: 161 | conv_defs['spec'][-1].params['num_outputs'] /= depth_multiplier 162 | if activation_fn: 163 | conv_defs = copy.deepcopy(conv_defs) 164 | defaults = conv_defs['defaults'] 165 | conv_defaults = ( 166 | defaults[(slim.conv2d, slim.fully_connected, slim.separable_conv2d)]) 167 | conv_defaults['activation_fn'] = activation_fn 168 | 169 | depth_args = {} 170 | # NB: do not set depth_args unless they are provided to avoid overriding 171 | # whatever default depth_multiplier might have thanks to arg_scope. 172 | if min_depth is not None: 173 | depth_args['min_depth'] = min_depth 174 | if divisible_by is not None: 175 | depth_args['divisible_by'] = divisible_by 176 | 177 | with slim.arg_scope((lib.depth_multiplier,), **depth_args): 178 | return lib.mobilenet( 179 | input_tensor, 180 | num_classes=num_classes, 181 | conv_defs=conv_defs, 182 | scope=scope, 183 | multiplier=depth_multiplier, 184 | **kwargs) 185 | 186 | mobilenet.default_image_size = 224 187 | 188 | 189 | def wrapped_partial(func, *args, **kwargs): 190 | partial_func = functools.partial(func, *args, **kwargs) 191 | functools.update_wrapper(partial_func, func) 192 | return partial_func 193 | 194 | 195 | # Wrappers for mobilenet v2 with depth-multipliers. Be noticed that 196 | # 'finegrain_classification_mode' is set to True, which means the embedding 197 | # layer will not be shrinked when given a depth-multiplier < 1.0. 
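# A minimal usage sketch (shapes and names here are illustrative only):
#
#   images = tf.placeholder(tf.float32, [None, 224, 224, 3])
#   with slim.arg_scope(training_scope(is_training=True)):
#       logits, endpoints = mobilenet_v2_050(images, num_classes=1001)
#
# The wrappers below simply pre-bind depth_multiplier; only the sub-1.0
# variants enable finegrain_classification_mode, so their final 1280-wide
# embedding layer is not shrunk along with the rest of the network.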
198 | mobilenet_v2_140 = wrapped_partial(mobilenet, depth_multiplier=1.4) 199 | mobilenet_v2_050 = wrapped_partial(mobilenet, depth_multiplier=0.50, 200 | finegrain_classification_mode=True) 201 | mobilenet_v2_035 = wrapped_partial(mobilenet, depth_multiplier=0.35, 202 | finegrain_classification_mode=True) 203 | 204 | 205 | @slim.add_arg_scope 206 | def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): 207 | """Creates base of the mobilenet (no pooling and no logits) .""" 208 | return mobilenet(input_tensor, 209 | depth_multiplier=depth_multiplier, 210 | base_only=True, **kwargs) 211 | 212 | 213 | @slim.add_arg_scope 214 | def mobilenet_base_group_norm(input_tensor, depth_multiplier=1.0, **kwargs): 215 | """Creates base of the mobilenet (no pooling and no logits) .""" 216 | kwargs['conv_defs'] = V2_DEF_GROUP_NORM 217 | kwargs['conv_defs']['defaults'].update({ 218 | (contrib_layers.group_norm,): { 219 | 'groups': kwargs.pop('groups', 8) 220 | } 221 | }) 222 | return mobilenet( 223 | input_tensor, depth_multiplier=depth_multiplier, base_only=True, **kwargs) 224 | 225 | 226 | def training_scope(**kwargs): 227 | """Defines MobilenetV2 training scope. 228 | 229 | Usage: 230 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): 231 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 232 | 233 | with slim. 234 | 235 | Args: 236 | **kwargs: Passed to mobilenet.training_scope. The following parameters 237 | are supported: 238 | weight_decay- The weight decay to use for regularizing the model. 239 | stddev- Standard deviation for initialization, if negative uses xavier. 240 | dropout_keep_prob- dropout keep probability 241 | bn_decay- decay for the batch norm moving averages. 242 | 243 | Returns: 244 | An `arg_scope` to use for the mobilenet v2 model. 245 | """ 246 | return lib.training_scope(**kwargs) 247 | 248 | 249 | __all__ = ['training_scope', 'mobilenet_base', 'mobilenet', 'V2_DEF'] -------------------------------------------------------------------------------- /lib/networks/mobilenet/mobilenet_v3.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Mobilenet V3 conv defs and helper functions.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import copy 22 | import functools 23 | import numpy as np 24 | 25 | import tensorflow as tf 26 | from tensorflow.contrib import slim as contrib_slim 27 | 28 | from lib.networks.mobilenet import conv_blocks as ops 29 | from lib.networks.mobilenet import mobilenet as lib 30 | 31 | slim = contrib_slim 32 | op = lib.op 33 | expand_input = ops.expand_input_by_factor 34 | 35 | # Squeeze Excite with all parameters filled-in, we use hard-sigmoid 36 | # for gating function and relu for inner activation function. 37 | squeeze_excite = functools.partial( 38 | ops.squeeze_excite, squeeze_factor=4, 39 | inner_activation_fn=tf.nn.relu, 40 | gating_fn=lambda x: tf.nn.relu6(x+3)*0.16667) 41 | 42 | # Wrap squeeze excite op as expansion_transform that takes 43 | # both expansion and input tensor. 44 | _se4 = lambda expansion_tensor, input_tensor: squeeze_excite(expansion_tensor) 45 | 46 | 47 | def hard_swish(x): 48 | with tf.name_scope('hard_swish'): 49 | return x * tf.nn.relu6(x + np.float32(3)) * np.float32(1. / 6.) 50 | 51 | 52 | def reduce_to_1x1(input_tensor, default_size=7, **kwargs): 53 | h, w = input_tensor.shape.as_list()[1:3] 54 | if h is not None and w == h: 55 | k = [h, h] 56 | else: 57 | k = [default_size, default_size] 58 | return slim.avg_pool2d(input_tensor, kernel_size=k, **kwargs) 59 | 60 | 61 | def mbv3_op(ef, n, k, s=1, act=tf.nn.relu, se=None, **kwargs): 62 | """Defines a single Mobilenet V3 convolution block. 63 | 64 | Args: 65 | ef: expansion factor 66 | n: number of output channels 67 | k: stride of depthwise 68 | s: stride 69 | act: activation function in inner layers 70 | se: squeeze excite function. 71 | **kwargs: passed to expanded_conv 72 | 73 | Returns: 74 | An object (lib._Op) for inserting in conv_def, representing this operation. 75 | """ 76 | return op( 77 | ops.expanded_conv, 78 | expansion_size=expand_input(ef), 79 | kernel_size=(k, k), 80 | stride=s, 81 | num_outputs=n, 82 | inner_activation_fn=act, 83 | expansion_transform=se, 84 | **kwargs) 85 | 86 | 87 | def mbv3_fused(ef, n, k, s=1, **kwargs): 88 | """Defines a single Mobilenet V3 convolution block. 89 | 90 | Args: 91 | ef: expansion factor 92 | n: number of output channels 93 | k: stride of depthwise 94 | s: stride 95 | **kwargs: will be passed to mbv3_op 96 | 97 | Returns: 98 | An object (lib._Op) for inserting in conv_def, representing this operation. 
99 | """ 100 | expansion_fn = functools.partial(slim.conv2d, kernel_size=k, stride=s) 101 | return mbv3_op( 102 | ef, 103 | n, 104 | k=1, 105 | s=s, 106 | depthwise_location=None, 107 | expansion_fn=expansion_fn, 108 | **kwargs) 109 | 110 | 111 | mbv3_op_se = functools.partial(mbv3_op, se=_se4) 112 | 113 | 114 | DEFAULTS = { 115 | (ops.expanded_conv,): 116 | dict( 117 | normalizer_fn=slim.batch_norm, 118 | residual=True), 119 | (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { 120 | 'normalizer_fn': slim.batch_norm, 121 | 'activation_fn': tf.nn.relu, 122 | }, 123 | (slim.batch_norm,): { 124 | 'center': True, 125 | 'scale': True 126 | }, 127 | } 128 | 129 | # Compatible checkpoint: http://mldash/5511169891790690458#scalars 130 | V3_LARGE = dict( 131 | defaults=dict(DEFAULTS), 132 | spec=([ 133 | # stage 1 134 | op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3), 135 | activation_fn=hard_swish), 136 | mbv3_op(ef=1, n=16, k=3), 137 | mbv3_op(ef=4, n=24, k=3, s=2), 138 | mbv3_op(ef=3, n=24, k=3, s=1), 139 | mbv3_op_se(ef=3, n=40, k=5, s=2), 140 | mbv3_op_se(ef=3, n=40, k=5, s=1), 141 | mbv3_op_se(ef=3, n=40, k=5, s=1), 142 | mbv3_op(ef=6, n=80, k=3, s=2, act=hard_swish), 143 | mbv3_op(ef=2.5, n=80, k=3, s=1, act=hard_swish), 144 | mbv3_op(ef=184/80., n=80, k=3, s=1, act=hard_swish), 145 | mbv3_op(ef=184/80., n=80, k=3, s=1, act=hard_swish), 146 | mbv3_op_se(ef=6, n=112, k=3, s=1, act=hard_swish), 147 | mbv3_op_se(ef=6, n=112, k=3, s=1, act=hard_swish), 148 | mbv3_op_se(ef=6, n=160, k=5, s=2, act=hard_swish), 149 | mbv3_op_se(ef=6, n=160, k=5, s=1, act=hard_swish), 150 | mbv3_op_se(ef=6, n=160, k=5, s=1, act=hard_swish), 151 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=960, 152 | activation_fn=hard_swish), 153 | op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'), 154 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280, 155 | normalizer_fn=None, activation_fn=hard_swish) 156 | ])) 157 | 158 | # 72.2% accuracy. 
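# The "minimalistic" conv defs below mirror V3_LARGE/V3_SMALL but drop
# squeeze-excite, hard-swish and 5x5 depthwise kernels (plain ReLU and 3x3
# depthwise only), trading some ImageNet accuracy for a much simpler op set
# that maps well to a wide range of hardware.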
159 | V3_LARGE_MINIMALISTIC = dict( 160 | defaults=dict(DEFAULTS), 161 | spec=([ 162 | # stage 1 163 | op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3)), 164 | mbv3_op(ef=1, n=16, k=3), 165 | mbv3_op(ef=4, n=24, k=3, s=2), 166 | mbv3_op(ef=3, n=24, k=3, s=1), 167 | mbv3_op(ef=3, n=40, k=3, s=2), 168 | mbv3_op(ef=3, n=40, k=3, s=1), 169 | mbv3_op(ef=3, n=40, k=3, s=1), 170 | mbv3_op(ef=6, n=80, k=3, s=2), 171 | mbv3_op(ef=2.5, n=80, k=3, s=1), 172 | mbv3_op(ef=184 / 80., n=80, k=3, s=1), 173 | mbv3_op(ef=184 / 80., n=80, k=3, s=1), 174 | mbv3_op(ef=6, n=112, k=3, s=1), 175 | mbv3_op(ef=6, n=112, k=3, s=1), 176 | mbv3_op(ef=6, n=160, k=3, s=2), 177 | mbv3_op(ef=6, n=160, k=3, s=1), 178 | mbv3_op(ef=6, n=160, k=3, s=1), 179 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=960), 180 | op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'), 181 | op(slim.conv2d, 182 | stride=1, 183 | kernel_size=[1, 1], 184 | num_outputs=1280, 185 | normalizer_fn=None) 186 | ])) 187 | 188 | # Compatible run: http://mldash/2023283040014348118#scalars 189 | V3_SMALL = dict( 190 | defaults=dict(DEFAULTS), 191 | spec=([ 192 | # stage 1 193 | op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3), 194 | activation_fn=hard_swish), 195 | mbv3_op_se(ef=1, n=16, k=3, s=2), 196 | mbv3_op(ef=72./16, n=24, k=3, s=2), 197 | mbv3_op(ef=(88./24), n=24, k=3, s=1), 198 | mbv3_op_se(ef=4, n=40, k=5, s=2, act=hard_swish), 199 | mbv3_op_se(ef=6, n=40, k=5, s=1, act=hard_swish), 200 | mbv3_op_se(ef=6, n=40, k=5, s=1, act=hard_swish), 201 | mbv3_op_se(ef=3, n=48, k=5, s=1, act=hard_swish), 202 | mbv3_op_se(ef=3, n=48, k=5, s=1, act=hard_swish), 203 | mbv3_op_se(ef=6, n=96, k=5, s=2, act=hard_swish), 204 | mbv3_op_se(ef=6, n=96, k=5, s=1, act=hard_swish), 205 | mbv3_op_se(ef=6, n=96, k=5, s=1, act=hard_swish), 206 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=576, 207 | activation_fn=hard_swish), 208 | op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'), 209 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1024, 210 | normalizer_fn=None, activation_fn=hard_swish) 211 | ])) 212 | 213 | # 62% accuracy. 214 | V3_SMALL_MINIMALISTIC = dict( 215 | defaults=dict(DEFAULTS), 216 | spec=([ 217 | # stage 1 218 | op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3)), 219 | mbv3_op(ef=1, n=16, k=3, s=2), 220 | mbv3_op(ef=72. / 16, n=24, k=3, s=2), 221 | mbv3_op(ef=(88. / 24), n=24, k=3, s=1), 222 | mbv3_op(ef=4, n=40, k=3, s=2), 223 | mbv3_op(ef=6, n=40, k=3, s=1), 224 | mbv3_op(ef=6, n=40, k=3, s=1), 225 | mbv3_op(ef=3, n=48, k=3, s=1), 226 | mbv3_op(ef=3, n=48, k=3, s=1), 227 | mbv3_op(ef=6, n=96, k=3, s=2), 228 | mbv3_op(ef=6, n=96, k=3, s=1), 229 | mbv3_op(ef=6, n=96, k=3, s=1), 230 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=576), 231 | op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'), 232 | op(slim.conv2d, 233 | stride=1, 234 | kernel_size=[1, 1], 235 | num_outputs=1024, 236 | normalizer_fn=None) 237 | ])) 238 | 239 | 240 | # EdgeTPU friendly variant of MobilenetV3 that uses fused convolutions 241 | # instead of depthwise in the early layers. 
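# "Fused" here means the mbv3_fused blocks used below replace the usual
# 1x1 expansion + kxk depthwise pair with a single full kxk expansion
# convolution (depthwise_location=None and a kxk slim.conv2d as expansion_fn),
# which is generally a better fit for EdgeTPU-class accelerators than
# depthwise ops.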
242 | V3_EDGETPU = dict( 243 | defaults=dict(DEFAULTS), 244 | spec=[ 245 | op(slim.conv2d, stride=2, num_outputs=32, kernel_size=(3, 3)), 246 | mbv3_fused(k=3, s=1, ef=1, n=16), 247 | mbv3_fused(k=3, s=2, ef=8, n=32), 248 | mbv3_fused(k=3, s=1, ef=4, n=32), 249 | mbv3_fused(k=3, s=1, ef=4, n=32), 250 | mbv3_fused(k=3, s=1, ef=4, n=32), 251 | mbv3_fused(k=3, s=2, ef=8, n=48), 252 | mbv3_fused(k=3, s=1, ef=4, n=48), 253 | mbv3_fused(k=3, s=1, ef=4, n=48), 254 | mbv3_fused(k=3, s=1, ef=4, n=48), 255 | mbv3_op(k=3, s=2, ef=8, n=96), 256 | mbv3_op(k=3, s=1, ef=4, n=96), 257 | mbv3_op(k=3, s=1, ef=4, n=96), 258 | mbv3_op(k=3, s=1, ef=4, n=96), 259 | mbv3_op(k=3, s=1, ef=8, n=96, residual=False), 260 | mbv3_op(k=3, s=1, ef=4, n=96), 261 | mbv3_op(k=3, s=1, ef=4, n=96), 262 | mbv3_op(k=3, s=1, ef=4, n=96), 263 | mbv3_op(k=5, s=2, ef=8, n=160), 264 | mbv3_op(k=5, s=1, ef=4, n=160), 265 | mbv3_op(k=5, s=1, ef=4, n=160), 266 | mbv3_op(k=5, s=1, ef=4, n=160), 267 | mbv3_op(k=3, s=1, ef=8, n=192), 268 | op(slim.conv2d, stride=1, num_outputs=1280, kernel_size=(1, 1)), 269 | ]) 270 | 271 | 272 | @slim.add_arg_scope 273 | def mobilenet(input_tensor, 274 | num_classes=1001, 275 | depth_multiplier=1.0, 276 | scope='MobilenetV3', 277 | conv_defs=None, 278 | finegrain_classification_mode=False, 279 | **kwargs): 280 | """Creates mobilenet V3 network. 281 | 282 | Inference mode is created by default. To create training use training_scope 283 | below. 284 | 285 | with tf.contrib.slim.arg_scope(mobilenet_v3.training_scope()): 286 | logits, endpoints = mobilenet_v3.mobilenet(input_tensor) 287 | 288 | Args: 289 | input_tensor: The input tensor 290 | num_classes: number of classes 291 | depth_multiplier: The multiplier applied to scale number of 292 | channels in each layer. 293 | scope: Scope of the operator 294 | conv_defs: Which version to create. Could be large/small or 295 | any conv_def (see mobilenet_v3.py for examples). 296 | finegrain_classification_mode: When set to True, the model 297 | will keep the last layer large even for small multipliers. Following 298 | https://arxiv.org/abs/1801.04381 299 | it improves performance for ImageNet-type of problems. 300 | *Note* ignored if final_endpoint makes the builder exit earlier. 301 | **kwargs: passed directly to mobilenet.mobilenet: 302 | prediction_fn- what prediction function to use. 303 | reuse-: whether to reuse variables (if reuse set to true, scope 304 | must be given). 
305 | Returns: 306 | logits/endpoints pair 307 | 308 | Raises: 309 | ValueError: On invalid arguments 310 | """ 311 | if conv_defs is None: 312 | conv_defs = V3_LARGE 313 | if 'multiplier' in kwargs: 314 | raise ValueError('mobilenetv2 doesn\'t support generic ' 315 | 'multiplier parameter use "depth_multiplier" instead.') 316 | if finegrain_classification_mode: 317 | conv_defs = copy.deepcopy(conv_defs) 318 | conv_defs['spec'][-1] = conv_defs['spec'][-1]._replace( 319 | multiplier_func=lambda params, multiplier: params) 320 | depth_args = {} 321 | with slim.arg_scope((lib.depth_multiplier,), **depth_args): 322 | return lib.mobilenet( 323 | input_tensor, 324 | num_classes=num_classes, 325 | conv_defs=conv_defs, 326 | scope=scope, 327 | multiplier=depth_multiplier, 328 | **kwargs) 329 | 330 | mobilenet.default_image_size = 224 331 | training_scope = lib.training_scope 332 | 333 | 334 | @slim.add_arg_scope 335 | def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): 336 | """Creates base of the mobilenet (no pooling and no logits) .""" 337 | return mobilenet( 338 | input_tensor, depth_multiplier=depth_multiplier, base_only=True, **kwargs) 339 | 340 | 341 | def wrapped_partial(func, new_defaults=None, 342 | **kwargs): 343 | """Partial function with new default parameters and updated docstring.""" 344 | if not new_defaults: 345 | new_defaults = {} 346 | def func_wrapper(*f_args, **f_kwargs): 347 | new_kwargs = dict(new_defaults) 348 | new_kwargs.update(f_kwargs) 349 | return func(*f_args, **new_kwargs) 350 | functools.update_wrapper(func_wrapper, func) 351 | partial_func = functools.partial(func_wrapper, **kwargs) 352 | functools.update_wrapper(partial_func, func) 353 | return partial_func 354 | 355 | 356 | large = wrapped_partial(mobilenet, conv_defs=V3_LARGE) 357 | small = wrapped_partial(mobilenet, conv_defs=V3_SMALL) 358 | edge_tpu = wrapped_partial(mobilenet, 359 | new_defaults={'scope': 'MobilenetEdgeTPU'}, 360 | conv_defs=V3_EDGETPU) 361 | 362 | # Minimalistic model that does not have Squeeze Excite blocks, 363 | # Hardswish, or 5x5 depthwise convolution. 364 | # This makes the model very friendly for a wide range of hardware 365 | large_minimalistic = wrapped_partial(mobilenet, conv_defs=V3_LARGE_MINIMALISTIC) 366 | small_minimalistic = wrapped_partial(mobilenet, conv_defs=V3_SMALL_MINIMALISTIC) 367 | 368 | 369 | def _reduce_consecutive_layers(conv_defs, start_id, end_id, multiplier=0.5): 370 | """Reduce the outputs of consecutive layers with multiplier. 371 | 372 | Args: 373 | conv_defs: Mobilenet conv_defs. 374 | start_id: 0-based index of the starting conv_def to be reduced. 375 | end_id: 0-based index of the last conv_def to be reduced. 376 | multiplier: The multiplier by which to reduce the conv_defs. 377 | 378 | Returns: 379 | Mobilenet conv_defs where the output sizes from layers [start_id, end_id], 380 | inclusive, are reduced by multiplier. 381 | 382 | Raises: 383 | ValueError if any layer to be reduced does not have the 'num_outputs' 384 | attribute. 
385 | """ 386 | defs = copy.deepcopy(conv_defs) 387 | for d in defs['spec'][start_id:end_id+1]: 388 | d.params.update({ 389 | 'num_outputs': np.int(np.round(d.params['num_outputs'] * multiplier)) 390 | }) 391 | return defs 392 | 393 | 394 | V3_LARGE_DETECTION = _reduce_consecutive_layers(V3_LARGE, 13, 16) 395 | V3_SMALL_DETECTION = _reduce_consecutive_layers(V3_SMALL, 9, 12) 396 | 397 | 398 | __all__ = ['training_scope', 'mobilenet', 'V3_LARGE', 'V3_SMALL', 'large', 399 | 'small', 'V3_LARGE_DETECTION', 'V3_SMALL_DETECTION'] -------------------------------------------------------------------------------- /lib/networks/model.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import tensorflow as tf 3 | from tensorflow.contrib import slim 4 | from db_config import cfg 5 | 6 | import lib.networks.resnet.resnet_v1 as resnet_v1 7 | import lib.networks.resnet.resnet_v1_tiny as resnet_v1_tiny 8 | 9 | 10 | def unpool(inputs, ratio=2): 11 | return tf.image.resize_bilinear(inputs, size=[tf.shape(inputs)[1] * ratio, tf.shape(inputs)[2] * ratio]) 12 | 13 | 14 | def mean_image_subtraction(images, means=[123.68, 116.78, 103.94]): 15 | ''' 16 | :param images: 17 | :param means: 18 | :return: 19 | ''' 20 | num_channels = images.get_shape().as_list()[-1] 21 | if len(means) != num_channels: 22 | raise ValueError('len(means) must match the number of channels') 23 | channels = tf.split(axis=3, num_or_size_splits=num_channels, value=images) 24 | for i in range(num_channels): 25 | channels[i] -= means[i] 26 | return tf.concat(axis=3, values=channels) 27 | 28 | def backbone(input, weight_decay, is_training, backbone_name=cfg.BACKBONE): 29 | # ['resnet_v1_50', 'resnet_v1_18', 'resnet_v2_50', 'resnet_v2_18', 'mobilenet_v2', 'mobilenet_v3'] 30 | 31 | if backbone_name == 'resnet_v1_50': 32 | with slim.arg_scope(resnet_v1.resnet_arg_scope(weight_decay=weight_decay)): 33 | logits, end_points = resnet_v1.resnet_v1_50(input, is_training=is_training, scope=backbone_name) 34 | return logits, end_points 35 | elif backbone_name == 'resnet_v1_18': 36 | with slim.arg_scope(resnet_v1_tiny.resnet_arg_scope(weight_decay=weight_decay)): 37 | logits, end_points = resnet_v1_tiny.resnet_v1_18(input, is_training=is_training, scope=backbone_name) 38 | return logits, end_points 39 | else: 40 | print('{} is error backbone name, not support!'.format(backbone_name)) 41 | assert 0 42 | 43 | 44 | def model(images, weight_decay=1e-5, is_training=True): 45 | """ 46 | resnet-50 47 | :param images: 48 | :param weight_decay: 49 | :param is_training: 50 | :return: 51 | """ 52 | 53 | images = mean_image_subtraction(images) 54 | 55 | logits, end_points = backbone(images, weight_decay, is_training) 56 | 57 | with tf.variable_scope('feature_fusion', values=[end_points.values]): 58 | batch_norm_params = {'decay': cfg["TRAIN"]["MOVING_AVERAGE_DECAY"], 59 | 'epsilon': 1e-5, 60 | 'scale': True, 61 | 'is_training': is_training} 62 | 63 | with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], 64 | activation_fn=tf.nn.relu, 65 | normalizer_fn=slim.batch_norm, 66 | normalizer_params=batch_norm_params, 67 | weights_regularizer=slim.l2_regularizer(weight_decay)): 68 | 69 | f = [end_points['pool5'], end_points['pool4'], 70 | end_points['pool3'], end_points['pool2']] 71 | 72 | g = [None, None, None, None] 73 | h = [None, None, None, None] 74 | 75 | num_outputs = [None, 128, 64, 32] 76 | 77 | # size = K+(K-1)*(r-1) 78 | if cfg.ASPP_LAYER: 79 | with tf.variable_scope('aspp_layer'): 80 | f_32x = f[0] 81 | 
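# Atrous Spatial Pyramid Pooling on the 1/32 backbone feature map: four
# parallel branches (1x1, 3x3, 3x3 rate=3, 3x3 rate=6) are concatenated and
# fused back to 2048 channels. By the size = K + (K-1)*(r-1) rule above, the
# dilated 3x3 branches have effective kernel sizes of 7x7 (rate=3) and
# 13x13 (rate=6) without adding parameters relative to a plain 3x3 conv.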
f_32x_1 = slim.conv2d(f_32x, 256, 1) 82 | f_32x_2 = slim.conv2d(f_32x, 256, 3) 83 | f_32x_3 = slim.conv2d(f_32x, 256, 3, rate=3) 84 | f_32x_4 = slim.conv2d(f_32x, 256, 3, rate=6) 85 | aspp_32x = tf.concat([f_32x_1, f_32x_2, f_32x_3, f_32x_4], axis=-1) 86 | f[0] = slim.conv2d(aspp_32x, 2048, 1) 87 | 88 | for i in range(len(f)): 89 | if i == 0: 90 | h[i] = f[i] 91 | else: 92 | c1_1 = slim.conv2d(tf.concat([g[i - 1], f[i]], axis=-1), num_outputs[i], 1) 93 | h[i] = slim.conv2d(c1_1, num_outputs[i], 3) 94 | if i <= 2: 95 | g[i] = unpool(h[i]) 96 | else: 97 | g[i] = slim.conv2d(h[i], num_outputs[i], 3) 98 | 99 | with tf.variable_scope('concat_branch'): 100 | features = [g[3], h[2], h[1], h[0]] 101 | 102 | concat_feature = None 103 | 104 | for i, f in enumerate(features): 105 | if i is 0: 106 | conv_f = slim.conv2d(f, 64, 3) 107 | concat_feature = conv_f 108 | else: 109 | up_f = slim.conv2d(f, 64, 3) 110 | up_f = unpool(up_f, 2**i) 111 | concat_feature = tf.concat([concat_feature, up_f], axis=-1) 112 | 113 | final_f = slim.conv2d(concat_feature, 64, 3) 114 | 115 | with tf.variable_scope('binarize_branch'): 116 | b_conv = slim.conv2d(final_f, 64, 3) 117 | b_conv = slim.conv2d_transpose(b_conv, 64, 2, 2) 118 | binarize_map = slim.conv2d_transpose(b_conv, 1, 2, 2, activation_fn=tf.nn.sigmoid) 119 | 120 | with tf.variable_scope('threshold_branch'): 121 | b_conv = slim.conv2d(final_f, 64, 3) 122 | b_conv = slim.conv2d_transpose(b_conv, 256, 2, 2) 123 | threshold_map = slim.conv2d_transpose(b_conv, 1, 2, 2, activation_fn=tf.nn.sigmoid) 124 | 125 | with tf.variable_scope('thresh_binary_branch'): 126 | thresh_binary = tf.reciprocal(1 + tf.exp(-cfg.K * (binarize_map-threshold_map)), name='thresh_binary') 127 | 128 | return binarize_map, threshold_map, thresh_binary 129 | 130 | -------------------------------------------------------------------------------- /lib/networks/resnet/resnet_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains building blocks for various versions of Residual Networks. 16 | Residual networks (ResNets) were proposed in: 17 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 18 | Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015 19 | More variants were introduced in: 20 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 21 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016 22 | We can obtain different ResNet variants by changing the network depth, width, 23 | and form of residual unit. This module implements the infrastructure for 24 | building them. Concrete ResNet units and full ResNet networks are implemented in 25 | the accompanying resnet_v1.py and resnet_v2.py modules. 
26 | Compared to https://github.com/KaimingHe/deep-residual-networks, in the current 27 | implementation we subsample the output activations in the last residual unit of 28 | each block, instead of subsampling the input activations in the first residual 29 | unit of each block. The two implementations give identical results but our 30 | implementation is more memory efficient. 31 | """ 32 | from __future__ import absolute_import 33 | from __future__ import division 34 | from __future__ import print_function 35 | 36 | import collections 37 | import tensorflow as tf 38 | from tensorflow.contrib import slim as contrib_slim 39 | 40 | slim = contrib_slim 41 | 42 | 43 | class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])): 44 | """A named tuple describing a ResNet block. 45 | Its parts are: 46 | scope: The scope of the `Block`. 47 | unit_fn: The ResNet unit function which takes as input a `Tensor` and 48 | returns another `Tensor` with the output of the ResNet unit. 49 | args: A list of length equal to the number of units in the `Block`. The list 50 | contains one (depth, depth_bottleneck, stride) tuple for each unit in the 51 | block to serve as argument to unit_fn. 52 | """ 53 | 54 | 55 | def subsample(inputs, factor, scope=None): 56 | """Subsamples the input along the spatial dimensions. 57 | Args: 58 | inputs: A `Tensor` of size [batch, height_in, width_in, channels]. 59 | factor: The subsampling factor. 60 | scope: Optional variable_scope. 61 | Returns: 62 | output: A `Tensor` of size [batch, height_out, width_out, channels] with the 63 | input, either intact (if factor == 1) or subsampled (if factor > 1). 64 | """ 65 | if factor == 1: 66 | return inputs 67 | else: 68 | return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) 69 | 70 | 71 | def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): 72 | """Strided 2-D convolution with 'SAME' padding. 73 | When stride > 1, then we do explicit zero-padding, followed by conv2d with 74 | 'VALID' padding. 75 | Note that 76 | net = conv2d_same(inputs, num_outputs, 3, stride=stride) 77 | is equivalent to 78 | net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') 79 | net = subsample(net, factor=stride) 80 | whereas 81 | net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') 82 | is different when the input's height or width is even, which is why we add the 83 | current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). 84 | Args: 85 | inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. 86 | num_outputs: An integer, the number of output filters. 87 | kernel_size: An int with the kernel_size of the filters. 88 | stride: An integer, the output stride. 89 | rate: An integer, rate for atrous convolution. 90 | scope: Scope. 91 | Returns: 92 | output: A 4-D tensor of size [batch, height_out, width_out, channels] with 93 | the convolution output. 
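  For example (illustrative), with kernel_size=7, rate=1 and stride=2 the code
  below computes kernel_size_effective = 7, pad_total = 6, pad_beg = 3 and
  pad_end = 3, zero-pads the input by 3 pixels on each spatial border, and then
  applies the stride-2 convolution with 'VALID' padding; this is the path taken
  by the ResNet root convolution conv2d_same(net, 64, 7, stride=2).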
94 | """ 95 | if stride == 1: 96 | return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate, 97 | padding='SAME', scope=scope) 98 | else: 99 | kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) 100 | pad_total = kernel_size_effective - 1 101 | pad_beg = pad_total // 2 102 | pad_end = pad_total - pad_beg 103 | inputs = tf.pad(inputs, 104 | [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) 105 | return slim.conv2d(inputs, num_outputs, kernel_size, stride=stride, 106 | rate=rate, padding='VALID', scope=scope) 107 | 108 | 109 | @slim.add_arg_scope 110 | def stack_blocks_dense(net, blocks, output_stride=None, 111 | store_non_strided_activations=False, 112 | outputs_collections=None): 113 | """Stacks ResNet `Blocks` and controls output feature density. 114 | First, this function creates scopes for the ResNet in the form of 115 | 'block_name/unit_1', 'block_name/unit_2', etc. 116 | Second, this function allows the user to explicitly control the ResNet 117 | output_stride, which is the ratio of the input to output spatial resolution. 118 | This is useful for dense prediction tasks such as semantic segmentation or 119 | object detection. 120 | Most ResNets consist of 4 ResNet blocks and subsample the activations by a 121 | factor of 2 when transitioning between consecutive ResNet blocks. This results 122 | to a nominal ResNet output_stride equal to 8. If we set the output_stride to 123 | half the nominal network stride (e.g., output_stride=4), then we compute 124 | responses twice. 125 | Control of the output feature density is implemented by atrous convolution. 126 | Args: 127 | net: A `Tensor` of size [batch, height, width, channels]. 128 | blocks: A list of length equal to the number of ResNet `Blocks`. Each 129 | element is a ResNet `Block` object describing the units in the `Block`. 130 | output_stride: If `None`, then the output will be computed at the nominal 131 | network stride. If output_stride is not `None`, it specifies the requested 132 | ratio of input to output spatial resolution, which needs to be equal to 133 | the product of unit strides from the start up to some level of the ResNet. 134 | For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1, 135 | then valid values for the output_stride are 1, 2, 6, 24 or None (which 136 | is equivalent to output_stride=24). 137 | store_non_strided_activations: If True, we compute non-strided (undecimated) 138 | activations at the last unit of each block and store them in the 139 | `outputs_collections` before subsampling them. This gives us access to 140 | higher resolution intermediate activations which are useful in some 141 | dense prediction problems but increases 4x the computation and memory cost 142 | at the last unit of each block. 143 | outputs_collections: Collection to add the ResNet block outputs. 144 | Returns: 145 | net: Output tensor with stride equal to the specified output_stride. 146 | Raises: 147 | ValueError: If the target output_stride is not valid. 148 | """ 149 | # The current_stride variable keeps track of the effective stride of the 150 | # activations. This allows us to invoke atrous convolution whenever applying 151 | # the next residual unit would result in the activations having stride larger 152 | # than the target output_stride. 153 | current_stride = 1 154 | 155 | # The atrous convolution rate parameter. 
156 | rate = 1 157 | 158 | for block in blocks: 159 | with tf.variable_scope(block.scope, 'block', [net]) as sc: 160 | block_stride = 1 161 | for i, unit in enumerate(block.args): 162 | if store_non_strided_activations and i == len(block.args) - 1: 163 | # Move stride from the block's last unit to the end of the block. 164 | block_stride = unit.get('stride', 1) 165 | unit = dict(unit, stride=1) 166 | 167 | with tf.variable_scope('unit_%d' % (i + 1), values=[net]): 168 | # If we have reached the target output_stride, then we need to employ 169 | # atrous convolution with stride=1 and multiply the atrous rate by the 170 | # current unit's stride for use in subsequent layers. 171 | if output_stride is not None and current_stride == output_stride: 172 | net = block.unit_fn(net, rate=rate, **dict(unit, stride=1)) 173 | rate *= unit.get('stride', 1) 174 | 175 | else: 176 | net = block.unit_fn(net, rate=1, **unit) 177 | current_stride *= unit.get('stride', 1) 178 | if output_stride is not None and current_stride > output_stride: 179 | raise ValueError('The target output_stride cannot be reached.') 180 | 181 | # Collect activations at the block's end before performing subsampling. 182 | net = slim.utils.collect_named_outputs(outputs_collections, sc.name, net) 183 | 184 | # Subsampling of the block's output activations. 185 | if output_stride is not None and current_stride == output_stride: 186 | rate *= block_stride 187 | else: 188 | net = subsample(net, block_stride) 189 | current_stride *= block_stride 190 | if output_stride is not None and current_stride > output_stride: 191 | raise ValueError('The target output_stride cannot be reached.') 192 | 193 | if output_stride is not None and current_stride != output_stride: 194 | raise ValueError('The target output_stride cannot be reached.') 195 | 196 | return net 197 | 198 | 199 | def resnet_arg_scope(weight_decay=0.0001, 200 | batch_norm_decay=0.997, 201 | batch_norm_epsilon=1e-5, 202 | batch_norm_scale=True, 203 | activation_fn=tf.nn.relu, 204 | use_batch_norm=True, 205 | batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS): 206 | """Defines the default ResNet arg scope. 207 | TODO(gpapan): The batch-normalization related default values above are 208 | appropriate for use in conjunction with the reference ResNet models 209 | released at https://github.com/KaimingHe/deep-residual-networks. When 210 | training ResNets from scratch, they might need to be tuned. 211 | Args: 212 | weight_decay: The weight decay to use for regularizing the model. 213 | batch_norm_decay: The moving average decay when estimating layer activation 214 | statistics in batch normalization. 215 | batch_norm_epsilon: Small constant to prevent division by zero when 216 | normalizing activations by their variance in batch normalization. 217 | batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the 218 | activations in the batch normalization layer. 219 | activation_fn: The activation function which is used in ResNet. 220 | use_batch_norm: Whether or not to use batch normalization. 221 | batch_norm_updates_collections: Collection for the update ops for 222 | batch norm. 223 | Returns: 224 | An `arg_scope` to use for the resnet models. 225 | """ 226 | batch_norm_params = { 227 | 'decay': batch_norm_decay, 228 | 'epsilon': batch_norm_epsilon, 229 | 'scale': batch_norm_scale, 230 | 'updates_collections': batch_norm_updates_collections, 231 | 'fused': None, # Use fused batch norm if possible. 
232 | } 233 | 234 | with slim.arg_scope( 235 | [slim.conv2d], 236 | weights_regularizer=slim.l2_regularizer(weight_decay), 237 | weights_initializer=slim.variance_scaling_initializer(), 238 | activation_fn=activation_fn, 239 | normalizer_fn=slim.batch_norm if use_batch_norm else None, 240 | normalizer_params=batch_norm_params): 241 | with slim.arg_scope([slim.batch_norm], **batch_norm_params): 242 | # The following implies padding='SAME' for pool1, which makes feature 243 | # alignment easier for dense prediction tasks. This is also used in 244 | # https://github.com/facebook/fb.resnet.torch. However the accompanying 245 | # code of 'Deep Residual Learning for Image Recognition' uses 246 | # padding='VALID' for pool1. You can switch to that choice by setting 247 | # slim.arg_scope([slim.max_pool2d], padding='VALID'). 248 | with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc: 249 | return arg_sc -------------------------------------------------------------------------------- /lib/networks/resnet/resnet_v1.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains definitions for the original form of Residual Networks. 16 | The 'v1' residual networks (ResNets) implemented in this module were proposed 17 | by: 18 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 19 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 20 | Other variants were introduced in: 21 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 22 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 23 | The networks defined in this module utilize the bottleneck building block of 24 | [1] with projection shortcuts only for increasing depths. They employ batch 25 | normalization *after* every weight layer. This is the architecture used by 26 | MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and 27 | ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' 28 | architecture and the alternative 'v2' architecture of [2] which uses batch 29 | normalization *before* every weight layer in the so-called full pre-activation 30 | units. 
31 | Typical use: 32 | from tensorflow.contrib.slim.nets import resnet_v1 33 | ResNet-101 for image classification into 1000 classes: 34 | # inputs has shape [batch, 224, 224, 3] 35 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 36 | net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) 37 | ResNet-101 for semantic segmentation into 21 classes: 38 | # inputs has shape [batch, 513, 513, 3] 39 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 40 | net, end_points = resnet_v1.resnet_v1_101(inputs, 41 | 21, 42 | is_training=False, 43 | global_pool=False, 44 | output_stride=16) 45 | """ 46 | from __future__ import absolute_import 47 | from __future__ import division 48 | from __future__ import print_function 49 | 50 | import tensorflow as tf 51 | from tensorflow.contrib import slim as contrib_slim 52 | 53 | from lib.networks.resnet import resnet_utils 54 | 55 | 56 | resnet_arg_scope = resnet_utils.resnet_arg_scope 57 | slim = contrib_slim 58 | 59 | 60 | class NoOpScope(object): 61 | """No-op context manager.""" 62 | 63 | def __enter__(self): 64 | return None 65 | 66 | def __exit__(self, exc_type, exc_value, traceback): 67 | return False 68 | 69 | 70 | @slim.add_arg_scope 71 | def bottleneck(inputs, 72 | depth, 73 | depth_bottleneck, 74 | stride, 75 | rate=1, 76 | outputs_collections=None, 77 | scope=None, 78 | use_bounded_activations=False): 79 | """Bottleneck residual unit variant with BN after convolutions. 80 | This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for 81 | its definition. Note that we use here the bottleneck variant which has an 82 | extra bottleneck layer. 83 | When putting together two consecutive ResNet blocks that use this unit, one 84 | should use stride = 2 in the last unit of the first block. 85 | Args: 86 | inputs: A tensor of size [batch, height, width, channels]. 87 | depth: The depth of the ResNet unit output. 88 | depth_bottleneck: The depth of the bottleneck layers. 89 | stride: The ResNet unit's stride. Determines the amount of downsampling of 90 | the units output compared to its input. 91 | rate: An integer, rate for atrous convolution. 92 | outputs_collections: Collection to add the ResNet unit output. 93 | scope: Optional variable_scope. 94 | use_bounded_activations: Whether or not to use bounded activations. Bounded 95 | activations better lend themselves to quantized inference. 96 | Returns: 97 | The ResNet unit's output. 98 | """ 99 | with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: 100 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) 101 | if depth == depth_in: 102 | shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') 103 | else: 104 | shortcut = slim.conv2d( 105 | inputs, 106 | depth, [1, 1], 107 | stride=stride, 108 | activation_fn=tf.nn.relu6 if use_bounded_activations else None, 109 | scope='shortcut') 110 | 111 | residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1, 112 | scope='conv1') 113 | residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride, 114 | rate=rate, scope='conv2') 115 | residual = slim.conv2d(residual, depth, [1, 1], stride=1, 116 | activation_fn=None, scope='conv3') 117 | 118 | if use_bounded_activations: 119 | # Use clip_by_value to simulate bandpass activation. 
120 | residual = tf.clip_by_value(residual, -6.0, 6.0) 121 | output = tf.nn.relu6(shortcut + residual) 122 | else: 123 | output = tf.nn.relu(shortcut + residual) 124 | 125 | return slim.utils.collect_named_outputs(outputs_collections, 126 | sc.name, 127 | output) 128 | 129 | 130 | def resnet_v1(inputs, 131 | blocks, 132 | num_classes=None, 133 | is_training=True, 134 | global_pool=True, 135 | output_stride=None, 136 | include_root_block=True, 137 | spatial_squeeze=True, 138 | store_non_strided_activations=False, 139 | reuse=None, 140 | scope=None): 141 | """Generator for v1 ResNet models. 142 | This function generates a family of ResNet v1 models. See the resnet_v1_*() 143 | methods for specific model instantiations, obtained by selecting different 144 | block instantiations that produce ResNets of various depths. 145 | Training for image classification on Imagenet is usually done with [224, 224] 146 | inputs, resulting in [7, 7] feature maps at the output of the last ResNet 147 | block for the ResNets defined in [1] that have nominal stride equal to 32. 148 | However, for dense prediction tasks we advise that one uses inputs with 149 | spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In 150 | this case the feature maps at the ResNet output will have spatial shape 151 | [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] 152 | and corners exactly aligned with the input image corners, which greatly 153 | facilitates alignment of the features to the image. Using as input [225, 225] 154 | images results in [8, 8] feature maps at the output of the last ResNet block. 155 | For dense prediction tasks, the ResNet needs to run in fully-convolutional 156 | (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all 157 | have nominal stride equal to 32 and a good choice in FCN mode is to use 158 | output_stride=16 in order to increase the density of the computed features at 159 | small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. 160 | Args: 161 | inputs: A tensor of size [batch, height_in, width_in, channels]. 162 | blocks: A list of length equal to the number of ResNet blocks. Each element 163 | is a resnet_utils.Block object describing the units in the block. 164 | num_classes: Number of predicted classes for classification tasks. 165 | If 0 or None, we return the features before the logit layer. 166 | is_training: whether batch_norm layers are in training mode. If this is set 167 | to None, the callers can specify slim.batch_norm's is_training parameter 168 | from an outer slim.arg_scope. 169 | global_pool: If True, we perform global average pooling before computing the 170 | logits. Set to True for image classification, False for dense prediction. 171 | output_stride: If None, then the output will be computed at the nominal 172 | network stride. If output_stride is not None, it specifies the requested 173 | ratio of input to output spatial resolution. 174 | include_root_block: If True, include the initial convolution followed by 175 | max-pooling, if False excludes it. 176 | spatial_squeeze: if True, logits is of shape [B, C], if false logits is 177 | of shape [B, 1, 1, C], where B is batch_size and C is number of classes. 178 | To use this parameter, the input images must be smaller than 300x300 179 | pixels, in which case the output logit layer does not contain spatial 180 | information and can be removed. 
181 | store_non_strided_activations: If True, we compute non-strided (undecimated) 182 | activations at the last unit of each block and store them in the 183 | `outputs_collections` before subsampling them. This gives us access to 184 | higher resolution intermediate activations which are useful in some 185 | dense prediction problems but increases 4x the computation and memory cost 186 | at the last unit of each block. 187 | reuse: whether or not the network and its variables should be reused. To be 188 | able to reuse 'scope' must be given. 189 | scope: Optional variable_scope. 190 | Returns: 191 | net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. 192 | If global_pool is False, then height_out and width_out are reduced by a 193 | factor of output_stride compared to the respective height_in and width_in, 194 | else both height_out and width_out equal one. If num_classes is 0 or None, 195 | then net is the output of the last ResNet block, potentially after global 196 | average pooling. If num_classes a non-zero integer, net contains the 197 | pre-softmax activations. 198 | end_points: A dictionary from components of the network to the corresponding 199 | activation. 200 | Raises: 201 | ValueError: If the target output_stride is not valid. 202 | """ 203 | with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: 204 | end_points_collection = sc.original_name_scope + '_end_points' 205 | with slim.arg_scope([slim.conv2d, bottleneck, 206 | resnet_utils.stack_blocks_dense], 207 | outputs_collections=end_points_collection): 208 | with (slim.arg_scope([slim.batch_norm], is_training=is_training) 209 | if is_training is not None else NoOpScope()): 210 | net = inputs 211 | if include_root_block: 212 | if output_stride is not None: 213 | if output_stride % 4 != 0: 214 | raise ValueError('The output_stride needs to be a multiple of 4.') 215 | output_stride /= 4 216 | net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1') 217 | net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') 218 | net = slim.utils.collect_named_outputs(end_points_collection, 'pool2', net) 219 | net = resnet_utils.stack_blocks_dense(net, blocks, output_stride, 220 | store_non_strided_activations) 221 | # Convert end_points_collection into a dictionary of end_points. 222 | end_points = slim.utils.convert_collection_to_dict( 223 | end_points_collection) 224 | 225 | try: 226 | end_points['pool3'] = end_points[scope + '/block1'] 227 | end_points['pool4'] = end_points[scope + '/block2'] 228 | except: 229 | end_points['pool3'] = end_points['Detection/' + scope + '/block1'] 230 | end_points['pool4'] = end_points['Detection/' + scope + '/block1'] 231 | end_points['pool5'] = net 232 | 233 | return net, end_points 234 | resnet_v1.default_image_size = 224 235 | 236 | 237 | def resnet_v1_block(scope, base_depth, num_units, stride): 238 | """Helper function for creating a resnet_v1 bottleneck block. 239 | Args: 240 | scope: The scope of the block. 241 | base_depth: The depth of the bottleneck layer for each unit. 242 | num_units: The number of units in the block. 243 | stride: The stride of the block, implemented as a stride in the last unit. 244 | All other units have stride=1. 245 | Returns: 246 | A resnet_v1 bottleneck block. 
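  For example (illustrative), resnet_v1_block('block2', base_depth=128,
  num_units=4, stride=2) returns a Block whose args are three units of
  {'depth': 512, 'depth_bottleneck': 128, 'stride': 1} followed by a single
  unit of {'depth': 512, 'depth_bottleneck': 128, 'stride': 2}, matching the
  resnet_v1_50 definition below.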
247 | """ 248 | return resnet_utils.Block(scope, bottleneck, [{ 249 | 'depth': base_depth * 4, 250 | 'depth_bottleneck': base_depth, 251 | 'stride': 1 252 | }] * (num_units - 1) + [{ 253 | 'depth': base_depth * 4, 254 | 'depth_bottleneck': base_depth, 255 | 'stride': stride 256 | }]) 257 | 258 | 259 | def resnet_v1_50(inputs, 260 | num_classes=None, 261 | is_training=True, 262 | global_pool=True, 263 | output_stride=None, 264 | spatial_squeeze=True, 265 | store_non_strided_activations=False, 266 | min_base_depth=8, 267 | depth_multiplier=1, 268 | reuse=None, 269 | scope='resnet_v1_50'): 270 | """ResNet-50 model of [1]. See resnet_v1() for arg and return description.""" 271 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 272 | blocks = [ 273 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 274 | stride=2), 275 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=4, 276 | stride=2), 277 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=6, 278 | stride=2), 279 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 280 | stride=1), 281 | ] 282 | return resnet_v1(inputs, blocks, num_classes, is_training, 283 | global_pool=global_pool, output_stride=output_stride, 284 | include_root_block=True, spatial_squeeze=spatial_squeeze, 285 | store_non_strided_activations=store_non_strided_activations, 286 | reuse=reuse, scope=scope) 287 | resnet_v1_50.default_image_size = resnet_v1.default_image_size 288 | 289 | 290 | def resnet_v1_101(inputs, 291 | num_classes=None, 292 | is_training=True, 293 | global_pool=True, 294 | output_stride=None, 295 | spatial_squeeze=True, 296 | store_non_strided_activations=False, 297 | min_base_depth=8, 298 | depth_multiplier=1, 299 | reuse=None, 300 | scope='resnet_v1_101'): 301 | """ResNet-101 model of [1]. See resnet_v1() for arg and return description.""" 302 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 303 | blocks = [ 304 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 305 | stride=2), 306 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=4, 307 | stride=2), 308 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=23, 309 | stride=2), 310 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 311 | stride=1), 312 | ] 313 | return resnet_v1(inputs, blocks, num_classes, is_training, 314 | global_pool=global_pool, output_stride=output_stride, 315 | include_root_block=True, spatial_squeeze=spatial_squeeze, 316 | store_non_strided_activations=store_non_strided_activations, 317 | reuse=reuse, scope=scope) 318 | resnet_v1_101.default_image_size = resnet_v1.default_image_size 319 | 320 | 321 | def resnet_v1_152(inputs, 322 | num_classes=None, 323 | is_training=True, 324 | global_pool=True, 325 | output_stride=None, 326 | store_non_strided_activations=False, 327 | spatial_squeeze=True, 328 | min_base_depth=8, 329 | depth_multiplier=1, 330 | reuse=None, 331 | scope='resnet_v1_152'): 332 | """ResNet-152 model of [1]. 
See resnet_v1() for arg and return description.""" 333 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 334 | blocks = [ 335 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 336 | stride=2), 337 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=8, 338 | stride=2), 339 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=36, 340 | stride=2), 341 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 342 | stride=1), 343 | ] 344 | return resnet_v1(inputs, blocks, num_classes, is_training, 345 | global_pool=global_pool, output_stride=output_stride, 346 | include_root_block=True, spatial_squeeze=spatial_squeeze, 347 | store_non_strided_activations=store_non_strided_activations, 348 | reuse=reuse, scope=scope) 349 | resnet_v1_152.default_image_size = resnet_v1.default_image_size 350 | 351 | 352 | def resnet_v1_200(inputs, 353 | num_classes=None, 354 | is_training=True, 355 | global_pool=True, 356 | output_stride=None, 357 | store_non_strided_activations=False, 358 | spatial_squeeze=True, 359 | min_base_depth=8, 360 | depth_multiplier=1, 361 | reuse=None, 362 | scope='resnet_v1_200'): 363 | """ResNet-200 model of [2]. See resnet_v1() for arg and return description.""" 364 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 365 | blocks = [ 366 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 367 | stride=2), 368 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=24, 369 | stride=2), 370 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=36, 371 | stride=2), 372 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 373 | stride=1), 374 | ] 375 | return resnet_v1(inputs, blocks, num_classes, is_training, 376 | global_pool=global_pool, output_stride=output_stride, 377 | include_root_block=True, spatial_squeeze=spatial_squeeze, 378 | store_non_strided_activations=store_non_strided_activations, 379 | reuse=reuse, scope=scope) 380 | resnet_v1_200.default_image_size = resnet_v1.default_image_size -------------------------------------------------------------------------------- /lib/networks/resnet/resnet_v1_tiny.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains definitions for the original form of Residual Networks. 16 | The 'v1' residual networks (ResNets) implemented in this module were proposed 17 | by: 18 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 19 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 20 | Other variants were introduced in: 21 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 22 | Identity Mappings in Deep Residual Networks. 
arXiv: 1603.05027 23 | The networks defined in this module utilize the bottleneck building block of 24 | [1] with projection shortcuts only for increasing depths. They employ batch 25 | normalization *after* every weight layer. This is the architecture used by 26 | MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and 27 | ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' 28 | architecture and the alternative 'v2' architecture of [2] which uses batch 29 | normalization *before* every weight layer in the so-called full pre-activation 30 | units. 31 | Typical use: 32 | from tensorflow.contrib.slim.nets import resnet_v1 33 | ResNet-101 for image classification into 1000 classes: 34 | # inputs has shape [batch, 224, 224, 3] 35 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 36 | net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) 37 | ResNet-101 for semantic segmentation into 21 classes: 38 | # inputs has shape [batch, 513, 513, 3] 39 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 40 | net, end_points = resnet_v1.resnet_v1_101(inputs, 41 | 21, 42 | is_training=False, 43 | global_pool=False, 44 | output_stride=16) 45 | """ 46 | from __future__ import absolute_import 47 | from __future__ import division 48 | from __future__ import print_function 49 | 50 | import tensorflow as tf 51 | from tensorflow.contrib import slim as contrib_slim 52 | 53 | from lib.networks.resnet import resnet_utils 54 | 55 | 56 | resnet_arg_scope = resnet_utils.resnet_arg_scope 57 | slim = contrib_slim 58 | 59 | 60 | class NoOpScope(object): 61 | """No-op context manager.""" 62 | 63 | def __enter__(self): 64 | return None 65 | 66 | def __exit__(self, exc_type, exc_value, traceback): 67 | return False 68 | 69 | 70 | @slim.add_arg_scope 71 | def bottleneck_tiny(inputs, 72 | depth, 73 | depth_bottleneck, 74 | stride, 75 | rate=1, 76 | outputs_collections=None, 77 | scope=None, 78 | use_bounded_activations=False): 79 | """Bottleneck residual unit variant with BN after convolutions. 80 | This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for 81 | its definition. Note that we use here the bottleneck variant which has an 82 | extra bottleneck layer. 83 | When putting together two consecutive ResNet blocks that use this unit, one 84 | should use stride = 2 in the last unit of the first block. 85 | Args: 86 | inputs: A tensor of size [batch, height, width, channels]. 87 | depth: The depth of the ResNet unit output. 88 | depth_bottleneck: The depth of the bottleneck layers. 89 | stride: The ResNet unit's stride. Determines the amount of downsampling of 90 | the units output compared to its input. 91 | rate: An integer, rate for atrous convolution. 92 | outputs_collections: Collection to add the ResNet unit output. 93 | scope: Optional variable_scope. 94 | use_bounded_activations: Whether or not to use bounded activations. Bounded 95 | activations better lend themselves to quantized inference. 96 | Returns: 97 | The ResNet unit's output. 98 | """ 99 | with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: 100 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) 101 | if depth == depth_in: 102 | shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') 103 | else: 104 | shortcut = slim.conv2d( 105 | inputs, 106 | depth, [1, 1], 107 | stride=stride, 108 | activation_fn=tf.nn.relu6 if use_bounded_activations else None, 109 | scope='shortcut') 110 | # zzh: for resnet 18, 24. 
It is diff from 50, 101, 251 etc 111 | residual = resnet_utils.conv2d_same(inputs, depth_bottleneck, 3, 1, 112 | rate=rate, scope='conv1') 113 | residual = resnet_utils.conv2d_same(residual, depth, 3, stride, 114 | rate=rate, scope='conv2') 115 | 116 | # residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1, 117 | # scope='conv1') 118 | # residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride, 119 | # rate=rate, scope='conv2') 120 | # residual = slim.conv2d(residual, depth, [1, 1], stride=1, 121 | # activation_fn=None, scope='conv3') 122 | 123 | 124 | if use_bounded_activations: 125 | # Use clip_by_value to simulate bandpass activation. 126 | residual = tf.clip_by_value(residual, -6.0, 6.0) 127 | output = tf.nn.relu6(shortcut + residual) 128 | else: 129 | output = tf.nn.relu(shortcut + residual) 130 | 131 | return slim.utils.collect_named_outputs(outputs_collections, 132 | sc.name, 133 | output) 134 | 135 | 136 | def resnet_v1_tiny(inputs, 137 | blocks, 138 | num_classes=None, 139 | is_training=True, 140 | global_pool=True, 141 | output_stride=None, 142 | include_root_block=True, 143 | spatial_squeeze=True, 144 | store_non_strided_activations=False, 145 | reuse=None, 146 | scope=None): 147 | """Generator for v1 ResNet models. 148 | This function generates a family of ResNet v1 models. See the resnet_v1_*() 149 | methods for specific model instantiations, obtained by selecting different 150 | block instantiations that produce ResNets of various depths. 151 | Training for image classification on Imagenet is usually done with [224, 224] 152 | inputs, resulting in [7, 7] feature maps at the output of the last ResNet 153 | block for the ResNets defined in [1] that have nominal stride equal to 32. 154 | However, for dense prediction tasks we advise that one uses inputs with 155 | spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In 156 | this case the feature maps at the ResNet output will have spatial shape 157 | [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] 158 | and corners exactly aligned with the input image corners, which greatly 159 | facilitates alignment of the features to the image. Using as input [225, 225] 160 | images results in [8, 8] feature maps at the output of the last ResNet block. 161 | For dense prediction tasks, the ResNet needs to run in fully-convolutional 162 | (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all 163 | have nominal stride equal to 32 and a good choice in FCN mode is to use 164 | output_stride=16 in order to increase the density of the computed features at 165 | small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. 166 | Args: 167 | inputs: A tensor of size [batch, height_in, width_in, channels]. 168 | blocks: A list of length equal to the number of ResNet blocks. Each element 169 | is a resnet_utils.Block object describing the units in the block. 170 | num_classes: Number of predicted classes for classification tasks. 171 | If 0 or None, we return the features before the logit layer. 172 | is_training: whether batch_norm layers are in training mode. If this is set 173 | to None, the callers can specify slim.batch_norm's is_training parameter 174 | from an outer slim.arg_scope. 175 | global_pool: If True, we perform global average pooling before computing the 176 | logits. Set to True for image classification, False for dense prediction. 
177 | output_stride: If None, then the output will be computed at the nominal 178 | network stride. If output_stride is not None, it specifies the requested 179 | ratio of input to output spatial resolution. 180 | include_root_block: If True, include the initial convolution followed by 181 | max-pooling, if False excludes it. 182 | spatial_squeeze: if True, logits is of shape [B, C], if false logits is 183 | of shape [B, 1, 1, C], where B is batch_size and C is number of classes. 184 | To use this parameter, the input images must be smaller than 300x300 185 | pixels, in which case the output logit layer does not contain spatial 186 | information and can be removed. 187 | store_non_strided_activations: If True, we compute non-strided (undecimated) 188 | activations at the last unit of each block and store them in the 189 | `outputs_collections` before subsampling them. This gives us access to 190 | higher resolution intermediate activations which are useful in some 191 | dense prediction problems but increases 4x the computation and memory cost 192 | at the last unit of each block. 193 | reuse: whether or not the network and its variables should be reused. To be 194 | able to reuse 'scope' must be given. 195 | scope: Optional variable_scope. 196 | Returns: 197 | net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. 198 | If global_pool is False, then height_out and width_out are reduced by a 199 | factor of output_stride compared to the respective height_in and width_in, 200 | else both height_out and width_out equal one. If num_classes is 0 or None, 201 | then net is the output of the last ResNet block, potentially after global 202 | average pooling. If num_classes a non-zero integer, net contains the 203 | pre-softmax activations. 204 | end_points: A dictionary from components of the network to the corresponding 205 | activation. 206 | Raises: 207 | ValueError: If the target output_stride is not valid. 208 | """ 209 | with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: 210 | end_points_collection = sc.original_name_scope + '_end_points' 211 | with slim.arg_scope([slim.conv2d, bottleneck_tiny, 212 | resnet_utils.stack_blocks_dense], 213 | outputs_collections=end_points_collection): 214 | with (slim.arg_scope([slim.batch_norm], is_training=is_training) 215 | if is_training is not None else NoOpScope()): 216 | net = inputs 217 | if include_root_block: 218 | if output_stride is not None: 219 | if output_stride % 4 != 0: 220 | raise ValueError('The output_stride needs to be a multiple of 4.') 221 | output_stride /= 4 222 | net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1') 223 | net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') 224 | net = slim.utils.collect_named_outputs(end_points_collection, 'pool2', net) 225 | net = resnet_utils.stack_blocks_dense(net, blocks, output_stride, 226 | store_non_strided_activations) 227 | # Convert end_points_collection into a dictionary of end_points. 
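        # The 'pool2' to 'pool5' keys exposed here are what model.py's feature
        # fusion reads: 'pool2' (collected above after the root max-pool, stride 4),
        # 'pool3' and 'pool4' (the block1 / block2 outputs, strides 8 and 16, set
        # just below) and 'pool5' (the final block output, stride 32).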
228 | end_points = slim.utils.convert_collection_to_dict( 229 | end_points_collection) 230 | 231 | try: 232 | end_points['pool3'] = end_points[scope + '/block1'] 233 | end_points['pool4'] = end_points[scope + '/block2'] 234 | except: 235 | end_points['pool3'] = end_points['Detection/' + scope + '/block1'] 236 | end_points['pool4'] = end_points['Detection/' + scope + '/block1'] 237 | end_points['pool5'] = net 238 | 239 | return net, end_points 240 | resnet_v1_tiny.default_image_size = 224 241 | 242 | 243 | def resnet_v1_block(scope, base_depth, num_units, stride): 244 | """Helper function for creating a resnet_v1 bottleneck block. 245 | Args: 246 | scope: The scope of the block. 247 | base_depth: The depth of the bottleneck layer for each unit. 248 | num_units: The number of units in the block. 249 | stride: The stride of the block, implemented as a stride in the last unit. 250 | All other units have stride=1. 251 | Returns: 252 | A resnet_v1 bottleneck block. 253 | """ 254 | return resnet_utils.Block(scope, bottleneck_tiny, [{ 255 | 'depth': base_depth * 4, 256 | 'depth_bottleneck': base_depth, 257 | 'stride': 1 258 | }] * (num_units - 1) + [{ 259 | 'depth': base_depth * 4, 260 | 'depth_bottleneck': base_depth, 261 | 'stride': stride 262 | }]) 263 | 264 | def resnet_v1_18(inputs, 265 | num_classes=None, 266 | is_training=True, 267 | global_pool=True, 268 | output_stride=None, 269 | spatial_squeeze=True, 270 | store_non_strided_activations=False, 271 | min_base_depth=8, 272 | depth_multiplier=1, 273 | reuse=None, 274 | scope='resnet_v1_18'): 275 | """ResNet-18 model of [1]. See resnet_v1() for arg and return description.""" 276 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 277 | blocks = [ 278 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=2, 279 | stride=2), 280 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=2, 281 | stride=2), 282 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=2, 283 | stride=2), 284 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=2, 285 | stride=1), 286 | ] 287 | return resnet_v1_tiny(inputs, blocks, num_classes, is_training, 288 | global_pool=global_pool, output_stride=output_stride, 289 | include_root_block=True, spatial_squeeze=spatial_squeeze, 290 | store_non_strided_activations=store_non_strided_activations, 291 | reuse=reuse, scope=scope) 292 | resnet_v1_18.default_image_size = resnet_v1_tiny.default_image_size 293 | 294 | def resnet_v1_34(inputs, 295 | num_classes=None, 296 | is_training=True, 297 | global_pool=True, 298 | output_stride=None, 299 | spatial_squeeze=True, 300 | store_non_strided_activations=False, 301 | min_base_depth=8, 302 | depth_multiplier=1, 303 | reuse=None, 304 | scope='resnet_v1_18'): 305 | """ResNet-18 model of [1]. 
See resnet_v1() for arg and return description.""" 306 | depth_func = lambda d: max(int(d * depth_multiplier), min_base_depth) 307 | blocks = [ 308 | resnet_v1_block('block1', base_depth=depth_func(64), num_units=3, 309 | stride=2), 310 | resnet_v1_block('block2', base_depth=depth_func(128), num_units=4, 311 | stride=2), 312 | resnet_v1_block('block3', base_depth=depth_func(256), num_units=6, 313 | stride=2), 314 | resnet_v1_block('block4', base_depth=depth_func(512), num_units=3, 315 | stride=1), 316 | ] 317 | return resnet_v1_tiny(inputs, blocks, num_classes, is_training, 318 | global_pool=global_pool, output_stride=output_stride, 319 | include_root_block=True, spatial_squeeze=spatial_squeeze, 320 | store_non_strided_activations=store_non_strided_activations, 321 | reuse=reuse, scope=scope) 322 | resnet_v1_34.default_image_size = resnet_v1_tiny.default_image_size 323 | -------------------------------------------------------------------------------- /lib/networks/resnet/resnet_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains definitions for the preactivation form of Residual Networks. 16 | Residual networks (ResNets) were originally proposed in: 17 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 18 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 19 | The full preactivation 'v2' ResNet variant implemented in this module was 20 | introduced by: 21 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 22 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 23 | The key difference of the full preactivation 'v2' variant compared to the 24 | 'v1' variant in [1] is the use of batch normalization before every weight layer. 
25 | Typical use: 26 | from tensorflow.contrib.slim.nets import resnet_v2 27 | ResNet-101 for image classification into 1000 classes: 28 | # inputs has shape [batch, 224, 224, 3] 29 | with slim.arg_scope(resnet_v2.resnet_arg_scope()): 30 | net, end_points = resnet_v2.resnet_v2_101(inputs, 1000, is_training=False) 31 | ResNet-101 for semantic segmentation into 21 classes: 32 | # inputs has shape [batch, 513, 513, 3] 33 | with slim.arg_scope(resnet_v2.resnet_arg_scope()): 34 | net, end_points = resnet_v2.resnet_v2_101(inputs, 35 | 21, 36 | is_training=False, 37 | global_pool=False, 38 | output_stride=16) 39 | """ 40 | from __future__ import absolute_import 41 | from __future__ import division 42 | from __future__ import print_function 43 | 44 | import tensorflow as tf 45 | from tensorflow.contrib import slim as contrib_slim 46 | 47 | from lib.networks.resnet import resnet_utils 48 | 49 | slim = contrib_slim 50 | resnet_arg_scope = resnet_utils.resnet_arg_scope 51 | 52 | 53 | @slim.add_arg_scope 54 | def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1, 55 | outputs_collections=None, scope=None): 56 | """Bottleneck residual unit variant with BN before convolutions. 57 | This is the full preactivation residual unit variant proposed in [2]. See 58 | Fig. 1(b) of [2] for its definition. Note that we use here the bottleneck 59 | variant which has an extra bottleneck layer. 60 | When putting together two consecutive ResNet blocks that use this unit, one 61 | should use stride = 2 in the last unit of the first block. 62 | Args: 63 | inputs: A tensor of size [batch, height, width, channels]. 64 | depth: The depth of the ResNet unit output. 65 | depth_bottleneck: The depth of the bottleneck layers. 66 | stride: The ResNet unit's stride. Determines the amount of downsampling of 67 | the units output compared to its input. 68 | rate: An integer, rate for atrous convolution. 69 | outputs_collections: Collection to add the ResNet unit output. 70 | scope: Optional variable_scope. 71 | Returns: 72 | The ResNet unit's output. 73 | """ 74 | with tf.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc: 75 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) 76 | preact = slim.batch_norm(inputs, activation_fn=tf.nn.relu, scope='preact') 77 | if depth == depth_in: 78 | shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') 79 | else: 80 | shortcut = slim.conv2d(preact, depth, [1, 1], stride=stride, 81 | normalizer_fn=None, activation_fn=None, 82 | scope='shortcut') 83 | 84 | residual = slim.conv2d(preact, depth_bottleneck, [1, 1], stride=1, 85 | scope='conv1') 86 | residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride, 87 | rate=rate, scope='conv2') 88 | residual = slim.conv2d(residual, depth, [1, 1], stride=1, 89 | normalizer_fn=None, activation_fn=None, 90 | scope='conv3') 91 | 92 | output = shortcut + residual 93 | 94 | return slim.utils.collect_named_outputs(outputs_collections, 95 | sc.name, 96 | output) 97 | 98 | 99 | def resnet_v2(inputs, 100 | blocks, 101 | num_classes=None, 102 | is_training=True, 103 | global_pool=True, 104 | output_stride=None, 105 | include_root_block=True, 106 | spatial_squeeze=True, 107 | reuse=None, 108 | scope=None): 109 | """Generator for v2 (preactivation) ResNet models. 110 | This function generates a family of ResNet v2 models. See the resnet_v2_*() 111 | methods for specific model instantiations, obtained by selecting different 112 | block instantiations that produce ResNets of various depths. 
113 | Training for image classification on Imagenet is usually done with [224, 224] 114 | inputs, resulting in [7, 7] feature maps at the output of the last ResNet 115 | block for the ResNets defined in [1] that have nominal stride equal to 32. 116 | However, for dense prediction tasks we advise that one uses inputs with 117 | spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In 118 | this case the feature maps at the ResNet output will have spatial shape 119 | [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] 120 | and corners exactly aligned with the input image corners, which greatly 121 | facilitates alignment of the features to the image. Using as input [225, 225] 122 | images results in [8, 8] feature maps at the output of the last ResNet block. 123 | For dense prediction tasks, the ResNet needs to run in fully-convolutional 124 | (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all 125 | have nominal stride equal to 32 and a good choice in FCN mode is to use 126 | output_stride=16 in order to increase the density of the computed features at 127 | small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. 128 | Args: 129 | inputs: A tensor of size [batch, height_in, width_in, channels]. 130 | blocks: A list of length equal to the number of ResNet blocks. Each element 131 | is a resnet_utils.Block object describing the units in the block. 132 | num_classes: Number of predicted classes for classification tasks. 133 | If 0 or None, we return the features before the logit layer. 134 | is_training: whether batch_norm layers are in training mode. 135 | global_pool: If True, we perform global average pooling before computing the 136 | logits. Set to True for image classification, False for dense prediction. 137 | output_stride: If None, then the output will be computed at the nominal 138 | network stride. If output_stride is not None, it specifies the requested 139 | ratio of input to output spatial resolution. 140 | include_root_block: If True, include the initial convolution followed by 141 | max-pooling, if False excludes it. If excluded, `inputs` should be the 142 | results of an activation-less convolution. 143 | spatial_squeeze: if True, logits is of shape [B, C], if false logits is 144 | of shape [B, 1, 1, C], where B is batch_size and C is number of classes. 145 | To use this parameter, the input images must be smaller than 300x300 146 | pixels, in which case the output logit layer does not contain spatial 147 | information and can be removed. 148 | reuse: whether or not the network and its variables should be reused. To be 149 | able to reuse 'scope' must be given. 150 | scope: Optional variable_scope. 151 | Returns: 152 | net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. 153 | If global_pool is False, then height_out and width_out are reduced by a 154 | factor of output_stride compared to the respective height_in and width_in, 155 | else both height_out and width_out equal one. If num_classes is 0 or None, 156 | then net is the output of the last ResNet block, potentially after global 157 | average pooling. If num_classes is a non-zero integer, net contains the 158 | pre-softmax activations. 159 | end_points: A dictionary from components of the network to the corresponding 160 | activation. 161 | Raises: 162 | ValueError: If the target output_stride is not valid. 
163 | """ 164 | with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc: 165 | end_points_collection = sc.original_name_scope + '_end_points' 166 | with slim.arg_scope([slim.conv2d, bottleneck, 167 | resnet_utils.stack_blocks_dense], 168 | outputs_collections=end_points_collection): 169 | with slim.arg_scope([slim.batch_norm], is_training=is_training): 170 | net = inputs 171 | if include_root_block: 172 | if output_stride is not None: 173 | if output_stride % 4 != 0: 174 | raise ValueError('The output_stride needs to be a multiple of 4.') 175 | output_stride /= 4 176 | # We do not include batch normalization or activation functions in 177 | # conv1 because the first ResNet unit will perform these. Cf. 178 | # Appendix of [2]. 179 | with slim.arg_scope([slim.conv2d], 180 | activation_fn=None, normalizer_fn=None): 181 | net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1') 182 | net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') 183 | net = slim.utils.collect_named_outputs(end_points_collection, 'pool2', net) 184 | net = resnet_utils.stack_blocks_dense(net, blocks, output_stride) 185 | # This is needed because the pre-activation variant does not have batch 186 | # normalization or activation functions in the residual unit output. See 187 | # Appendix of [2]. 188 | net = slim.batch_norm(net, activation_fn=tf.nn.relu, scope='postnorm') 189 | # Convert end_points_collection into a dictionary of end_points. 190 | end_points = slim.utils.convert_collection_to_dict( 191 | end_points_collection) 192 | 193 | try: 194 | end_points['pool3'] = end_points[scope + '/block1'] 195 | end_points['pool4'] = end_points[scope + '/block2'] 196 | except: 197 | end_points['pool3'] = end_points['Detection/' + scope + '/block1'] 198 | end_points['pool4'] = end_points['Detection/' + scope + '/block1'] 199 | end_points['pool5'] = net 200 | 201 | # if global_pool: 202 | # # Global average pooling. 203 | # net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True) 204 | # end_points['global_pool'] = net 205 | # if num_classes: 206 | # net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, 207 | # normalizer_fn=None, scope='logits') 208 | # end_points[sc.name + '/logits'] = net 209 | # if spatial_squeeze: 210 | # net = tf.squeeze(net, [1, 2], name='SpatialSqueeze') 211 | # end_points[sc.name + '/spatial_squeeze'] = net 212 | # end_points['predictions'] = slim.softmax(net, scope='predictions') 213 | return net, end_points 214 | resnet_v2.default_image_size = 224 215 | 216 | 217 | def resnet_v2_block(scope, base_depth, num_units, stride): 218 | """Helper function for creating a resnet_v2 bottleneck block. 219 | Args: 220 | scope: The scope of the block. 221 | base_depth: The depth of the bottleneck layer for each unit. 222 | num_units: The number of units in the block. 223 | stride: The stride of the block, implemented as a stride in the last unit. 224 | All other units have stride=1. 225 | Returns: 226 | A resnet_v2 bottleneck block. 
227 | """ 228 | return resnet_utils.Block(scope, bottleneck, [{ 229 | 'depth': base_depth * 4, 230 | 'depth_bottleneck': base_depth, 231 | 'stride': 1 232 | }] * (num_units - 1) + [{ 233 | 'depth': base_depth * 4, 234 | 'depth_bottleneck': base_depth, 235 | 'stride': stride 236 | }]) 237 | resnet_v2.default_image_size = 224 238 | 239 | 240 | def resnet_v2_50(inputs, 241 | num_classes=None, 242 | is_training=True, 243 | global_pool=True, 244 | output_stride=None, 245 | spatial_squeeze=True, 246 | reuse=None, 247 | scope='resnet_v2_50'): 248 | """ResNet-50 model of [1]. See resnet_v2() for arg and return description.""" 249 | blocks = [ 250 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2), 251 | resnet_v2_block('block2', base_depth=128, num_units=4, stride=2), 252 | resnet_v2_block('block3', base_depth=256, num_units=6, stride=2), 253 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1), 254 | ] 255 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training, 256 | global_pool=global_pool, output_stride=output_stride, 257 | include_root_block=True, spatial_squeeze=spatial_squeeze, 258 | reuse=reuse, scope=scope) 259 | resnet_v2_50.default_image_size = resnet_v2.default_image_size 260 | 261 | 262 | def resnet_v2_101(inputs, 263 | num_classes=None, 264 | is_training=True, 265 | global_pool=True, 266 | output_stride=None, 267 | spatial_squeeze=True, 268 | reuse=None, 269 | scope='resnet_v2_101'): 270 | """ResNet-101 model of [1]. See resnet_v2() for arg and return description.""" 271 | blocks = [ 272 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2), 273 | resnet_v2_block('block2', base_depth=128, num_units=4, stride=2), 274 | resnet_v2_block('block3', base_depth=256, num_units=23, stride=2), 275 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1), 276 | ] 277 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training, 278 | global_pool=global_pool, output_stride=output_stride, 279 | include_root_block=True, spatial_squeeze=spatial_squeeze, 280 | reuse=reuse, scope=scope) 281 | resnet_v2_101.default_image_size = resnet_v2.default_image_size 282 | 283 | 284 | def resnet_v2_152(inputs, 285 | num_classes=None, 286 | is_training=True, 287 | global_pool=True, 288 | output_stride=None, 289 | spatial_squeeze=True, 290 | reuse=None, 291 | scope='resnet_v2_152'): 292 | """ResNet-152 model of [1]. See resnet_v2() for arg and return description.""" 293 | blocks = [ 294 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2), 295 | resnet_v2_block('block2', base_depth=128, num_units=8, stride=2), 296 | resnet_v2_block('block3', base_depth=256, num_units=36, stride=2), 297 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1), 298 | ] 299 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training, 300 | global_pool=global_pool, output_stride=output_stride, 301 | include_root_block=True, spatial_squeeze=spatial_squeeze, 302 | reuse=reuse, scope=scope) 303 | resnet_v2_152.default_image_size = resnet_v2.default_image_size 304 | 305 | 306 | def resnet_v2_200(inputs, 307 | num_classes=None, 308 | is_training=True, 309 | global_pool=True, 310 | output_stride=None, 311 | spatial_squeeze=True, 312 | reuse=None, 313 | scope='resnet_v2_200'): 314 | """ResNet-200 model of [2]. 
See resnet_v2() for arg and return description.""" 315 | blocks = [ 316 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2), 317 | resnet_v2_block('block2', base_depth=128, num_units=24, stride=2), 318 | resnet_v2_block('block3', base_depth=256, num_units=36, stride=2), 319 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1), 320 | ] 321 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training, 322 | global_pool=global_pool, output_stride=output_stride, 323 | include_root_block=True, spatial_squeeze=spatial_squeeze, 324 | reuse=reuse, scope=scope) 325 | resnet_v2_200.default_image_size = resnet_v2.default_image_size -------------------------------------------------------------------------------- /lib/postprocess/post_process.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import pyclipper 4 | 5 | from shapely.geometry import Polygon 6 | from db_config import cfg 7 | 8 | class SegDetectorRepresenter(): 9 | def __init__(self, thresh=0.3, box_thresh=0.7, max_candidates=1000): 10 | self.min_size = 1 11 | self.thresh = thresh 12 | self.box_thresh = box_thresh 13 | self.max_candidates = max_candidates 14 | 15 | def __call__(self, input_batch, score_maps, is_output_polygon=False): 16 | segmentation = self._binarize(score_maps) 17 | boxes_batch = [] 18 | scores_batch = [] 19 | for batch_index in range(len(input_batch)): 20 | height, width, _ = input_batch[batch_index].shape 21 | if is_output_polygon: 22 | boxes, scores = self._polygons_from_bitmap(score_maps[batch_index], segmentation[batch_index], width, height) 23 | else: 24 | boxes, scores = self._boxes_from_bitmap(score_maps[batch_index], segmentation[batch_index], width, height) 25 | boxes_batch.append(boxes) 26 | scores_batch.append(scores) 27 | return boxes_batch, scores_batch 28 | 29 | def _binarize(self, pred): 30 | return pred > self.thresh 31 | 32 | def _polygons_from_bitmap(self, pred, bitmap, dest_width, dest_height): 33 | ''' 34 | _bitmap: single map with shape (H, W), 35 | whose values are binarized as {0, 1} 36 | ''' 37 | 38 | assert len(bitmap.shape) == 3 39 | height, width, _ = bitmap.shape 40 | boxes = [] 41 | scores = [] 42 | 43 | contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) 44 | for contour in contours[:self.max_candidates]: 45 | epsilon = cfg.EPSILON_RATIO * cv2.arcLength(contour, True) 46 | approx = cv2.approxPolyDP(contour, epsilon, True) 47 | points = approx.reshape((-1, 2)) 48 | if points.shape[0] < 4: 49 | continue 50 | # print('poly contour shape', contour.shape) 51 | contour = contour.reshape([-1, 2]) 52 | score = self._box_score_fast(pred, contour) 53 | if self.box_thresh > score: 54 | continue 55 | # print('points', points) 56 | if points.shape[0] > 2: 57 | box = self._unclip(points, unclip_ratio=2.0) 58 | # print('bbox', box) 59 | if len(box) != 1: 60 | continue 61 | else: 62 | continue 63 | # print('box', box.shape) 64 | box = box.reshape(-1, 2) 65 | # print('re', box.shape) 66 | _, sside = self._get_mini_boxes(box.reshape((-1, 1, 2))) 67 | if sside < self.min_size + 2: 68 | continue 69 | 70 | if not isinstance(dest_width, int): 71 | dest_width = dest_width.item() 72 | dest_height = dest_height.item() 73 | 74 | box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) 75 | box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) 76 | boxes.append(box.astype(np.float)) 77 | scores.append(score) 
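        # Each kept polygon has been rescaled from bitmap coordinates to the original image size (dest_width, dest_height); its score is the mean of the probability map inside the detected contour (see _box_score_fast).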
78 | return np.array(boxes, ), scores 79 | 80 | def _boxes_from_bitmap(self, pred, bitmap, dest_width, dest_height): 81 | ''' 82 | _bitmap: single map with shape (H, W), 83 | whose values are binarized as {0, 1} 84 | ''' 85 | 86 | assert len(bitmap.shape) == 3 87 | height, width, _ = bitmap.shape 88 | contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) 89 | num_contours = min(len(contours), self.max_candidates) 90 | boxes = np.zeros((num_contours, 4, 2), dtype=np.float32) 91 | scores = np.zeros((num_contours,), dtype=np.float32) 92 | 93 | for index in range(num_contours): 94 | contour = contours[index].squeeze(1) 95 | points, sside = self._get_mini_boxes(contour) 96 | if sside < self.min_size: 97 | continue 98 | points = np.array(points) 99 | # print('bbox contour shape', contour.shape) 100 | score = self._box_score_fast(pred, contour) 101 | if self.box_thresh > score: 102 | continue 103 | 104 | box = self._unclip(points).reshape(-1, 1, 2) 105 | box, sside = self._get_mini_boxes(box) 106 | if sside < self.min_size + 2: 107 | continue 108 | box = np.array(box) 109 | if not isinstance(dest_width, int): 110 | dest_width = dest_width.item() 111 | dest_height = dest_height.item() 112 | 113 | box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) 114 | box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) 115 | boxes[index, :, :] = box.astype(np.int16) 116 | scores[index] = score 117 | return boxes, scores 118 | 119 | def _unclip(self, box, unclip_ratio=1.5): 120 | poly = Polygon(box) 121 | distance = poly.area * unclip_ratio / poly.length 122 | offset = pyclipper.PyclipperOffset() 123 | offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) 124 | expanded = np.array(offset.Execute(distance)) 125 | return expanded 126 | 127 | def _get_mini_boxes(self, contour): 128 | # print(contour.shape) 129 | bounding_box = cv2.minAreaRect(contour) 130 | points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) 131 | 132 | index_1, index_2, index_3, index_4 = 0, 1, 2, 3 133 | if points[1][1] > points[0][1]: 134 | index_1 = 0 135 | index_4 = 1 136 | else: 137 | index_1 = 1 138 | index_4 = 0 139 | if points[3][1] > points[2][1]: 140 | index_2 = 2 141 | index_3 = 3 142 | else: 143 | index_2 = 3 144 | index_3 = 2 145 | 146 | box = [points[index_1], points[index_2], points[index_3], points[index_4]] 147 | return box, min(bounding_box[1]) 148 | 149 | def _box_score_fast(self, bitmap, _box): 150 | h, w = bitmap.shape[:2] 151 | box = _box.copy() 152 | xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1) 153 | xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1) 154 | ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1) 155 | ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1) 156 | 157 | mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) 158 | box[:, 0] = box[:, 0] - xmin 159 | box[:, 1] = box[:, 1] - ymin 160 | cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) 161 | return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] 162 | -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import cv2 3 | import os 4 | import shutil 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from shapely.geometry import Polygon, MultiPoint 9 | 10 | import 
lib.networks.model as model 11 | 12 | def quad_iou(_gt_bbox, _pre_bbox): 13 | 14 | gt_poly = Polygon(_gt_bbox).convex_hull 15 | pre_poly = Polygon(_pre_bbox).convex_hull 16 | 17 | union_poly = np.concatenate((_gt_bbox, _pre_bbox)) 18 | 19 | if not gt_poly.intersects(pre_poly): 20 | iou = 0 21 | return iou 22 | else: 23 | inter_area = gt_poly.intersection(pre_poly).area 24 | union_area = MultiPoint(union_poly).convex_hull.area 25 | 26 | if union_area == 0: 27 | iou = 0 28 | else: 29 | iou = float(inter_area) / union_area 30 | 31 | return iou 32 | 33 | def polygon_riou(pred_box, gt_box): 34 | """ 35 | :param pred_box: list [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] 36 | :param gt_box: list [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] 37 | :return: 38 | """ 39 | pred_polygon_points = np.array(pred_box).reshape(-1, 2) 40 | pred_poly = Polygon(pred_polygon_points).convex_hull 41 | 42 | gt_polygon_points = np.array(gt_box).reshape(-1, 2) 43 | 44 | gt_poly = Polygon(gt_polygon_points).convex_hull 45 | if not pred_poly.intersects(gt_poly): 46 | iou = 0 47 | else: 48 | inter_area = pred_poly.intersection(gt_poly).area 49 | union_area = gt_poly.area 50 | if union_area == 0: 51 | iou = 0 52 | else: 53 | iou = float(inter_area) / union_area 54 | return iou 55 | 56 | def compute_f1_score(precision, recall): 57 | if precision == 0 or recall == 0: 58 | return 0.0 59 | else: 60 | return 2.0 * (precision * recall) / (precision + recall) 61 | 62 | def load_ctw1500_labels(path): 63 | """ 64 | load pts 65 | :param path: 66 | :return: polys shape [N, 14, 2] 67 | """ 68 | assert os.path.exists(path), '{} is not exits'.format(path) 69 | polys = [] 70 | tags = [] 71 | with open(path, 'r') as f: 72 | lines = f.readlines() 73 | for line in lines: 74 | parts = line.strip().split(',') 75 | x = float(parts[0]) 76 | y = float(parts[1]) 77 | pts = [float(i) for i in parts[4:32]] 78 | poly = np.array(pts) + [x, y] * 14 79 | polys.append(poly.reshape([-1, 2])) 80 | tags.append(False) 81 | return np.array(polys, np.float), tags 82 | 83 | def load_icdar_labels(path): 84 | pass 85 | 86 | def make_dir(dir): 87 | if os.path.exists(dir): 88 | shutil.rmtree(dir) 89 | os.makedirs(dir) 90 | 91 | 92 | def resize_img(img, max_size=736): 93 | h, w, _ = img.shape 94 | 95 | if max(h, w) > max_size: 96 | ratio = float(max_size) / h if h > w else float(max_size) / w 97 | else: 98 | ratio = 1. 
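    # The detector's backbone downsamples by a factor of 32, so the target height and width are forced to multiples of 32 before resizing.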
99 | 100 | resize_h = int(ratio * h) 101 | resize_w = int(ratio * w) 102 | 103 | resize_h = resize_h if resize_h % 32 == 0 else abs(resize_h // 32 - 1) * 32 104 | resize_w = resize_w if resize_w % 32 == 0 else abs(resize_w // 32 - 1) * 32 105 | resized_img = cv2.resize(img, (int(resize_w), int(resize_h))) 106 | 107 | ratio_h = resize_h / float(h) 108 | ratio_w = resize_w / float(w) 109 | 110 | return resized_img, (ratio_h, ratio_w) 111 | 112 | def ckpt2pb(ckptpath): 113 | 114 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 115 | tf.reset_default_graph() 116 | input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images') 117 | global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) 118 | 119 | binarize_map, threshold_map, thresh_binary = model.model(input_images, is_training=False) 120 | 121 | variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step) 122 | saver = tf.train.Saver(variable_averages.variables_to_restore()) 123 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) 124 | gpu_config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options, allow_soft_placement=True) 125 | sess = tf.Session(config=gpu_config) 126 | saver.restore(sess, ckptpath) 127 | 128 | from tensorflow.python.framework import graph_util 129 | constant_graph = graph_util.convert_variables_to_constants( 130 | sess, 131 | sess.graph_def, 132 | ['feature_fusion/binarize_branch/Conv2d_transpose_1/Sigmoid']) 133 | 134 | with tf.gfile.FastGFile('db.pb', mode='wb') as f: 135 | f.write(constant_graph.SerializeToString()) 136 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | imgaug 3 | pyclipper 4 | easydict 5 | Shapely 6 | tqdm 7 | tensorflow 8 | imageio 9 | opencv_python 10 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import time 3 | import numpy as np 4 | import logging 5 | import os 6 | import tensorflow as tf 7 | from tensorflow.contrib import slim 8 | 9 | from db_config import cfg 10 | 11 | import lib.networks.model as model 12 | from lib.networks.losses import compute_loss, compute_acc 13 | from lib.dataset.dataloader import get_batch 14 | 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | def make_dir(dir): 19 | if not os.path.exists(dir): 20 | os.makedirs(dir) 21 | 22 | def tower_loss(images, gt_score_maps, gt_threshold_map, gt_score_mask, 23 | gt_thresh_mask, reuse_variables): 24 | 25 | with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables): 26 | binarize_map, threshold_map, thresh_binary = model.model(images, is_training=True) 27 | 28 | model_loss = compute_loss(binarize_map, threshold_map, thresh_binary, 29 | gt_score_maps, gt_threshold_map, gt_score_mask, gt_thresh_mask) 30 | 31 | total_loss = tf.add_n([model_loss] + tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 32 | 33 | # add summary 34 | if reuse_variables is None: 35 | tf.summary.image('gt/input_imgs', images) 36 | tf.summary.image('gt/score_map', gt_score_maps) 37 | tf.summary.image('gt/threshold_map', gt_threshold_map * 255) 38 | tf.summary.image('gt/score_mask', gt_score_mask) 39 | tf.summary.image('gt/thresh_mask', gt_thresh_mask) 40 | 41 | tf.summary.image('pred/binarize_map', binarize_map) 42 | 
tf.summary.image('pred/threshold_map', threshold_map * 255) 43 | tf.summary.image('pred/thresh_binary', thresh_binary) 44 | 45 | tf.summary.scalar('model_loss', model_loss) 46 | tf.summary.scalar('total_loss', total_loss) 47 | 48 | return total_loss, model_loss, binarize_map, threshold_map, thresh_binary 49 | 50 | 51 | def average_gradients(tower_grads): 52 | average_grads = [] 53 | for grad_and_vars in zip(*tower_grads): 54 | grads = [] 55 | for g, _ in grad_and_vars: 56 | expanded_g = tf.expand_dims(g, 0) 57 | grads.append(expanded_g) 58 | 59 | grad = tf.concat(grads, 0) 60 | grad = tf.reduce_mean(grad, 0) 61 | 62 | v = grad_and_vars[0][1] 63 | grad_and_var = (grad, v) 64 | average_grads.append(grad_and_var) 65 | 66 | return average_grads 67 | 68 | 69 | def _train_logger_init(): 70 | """ 71 | 初始化log日志 72 | :return: 73 | """ 74 | train_logger = logging.getLogger('train') 75 | train_logger.setLevel(logging.DEBUG) 76 | 77 | # 添加文件输出 78 | log_file = os.path.join(cfg["TRAIN"]["TRAIN_LOGS"], time.strftime('%Y%m%d%H%M', time.localtime(time.time())) + '.logs') 79 | file_handler = logging.FileHandler(log_file, mode='w') 80 | file_handler.setLevel(logging.DEBUG) 81 | file_formatter = logging.Formatter('%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s') 82 | file_handler.setFormatter(file_formatter) 83 | train_logger.addHandler(file_handler) 84 | 85 | # 添加控制台输出 86 | consol_handler = logging.StreamHandler() 87 | consol_handler.setLevel(logging.DEBUG) 88 | consol_formatter = logging.Formatter('%(message)s') 89 | consol_handler.setFormatter(consol_formatter) 90 | train_logger.addHandler(consol_handler) 91 | return train_logger 92 | 93 | 94 | def main(): 95 | import os 96 | os.environ['CUDA_VISIBLE_DEVICES'] = cfg.TRAIN.VIS_GPU 97 | if not tf.gfile.Exists(cfg["TRAIN"]["CHECKPOINTS_OUTPUT_DIR"]): 98 | tf.gfile.MkDir(cfg["TRAIN"]["CHECKPOINTS_OUTPUT_DIR"]) 99 | 100 | train_logger = _train_logger_init() 101 | 102 | input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images') 103 | input_score_maps = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_score_maps') 104 | input_threshold_maps = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_threshold_maps') 105 | 106 | input_score_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_score_masks') 107 | input_threshold_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_threshold_masks') 108 | 109 | global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) 110 | 111 | learning_rate = tf.train.exponential_decay(cfg["TRAIN"]["LEARNING_RATE"], global_step, decay_steps=10000, 112 | decay_rate=0.94, staircase=True) 113 | 114 | if cfg.TRAIN.OPT == 'adam': 115 | # learning_rate = tf.constant(cfg["TRAIN"]["LEARNING_RATE"], tf.float32) 116 | opt = tf.train.AdamOptimizer(learning_rate) 117 | elif cfg.TRAIN.OPT == 'momentum': 118 | opt = tf.train.MomentumOptimizer(learning_rate, 0.9) 119 | else: 120 | assert 0, 'error optimzer' 121 | print('use ', cfg.TRAIN.OPT) 122 | 123 | # add summary 124 | tf.summary.scalar('learning_rate', learning_rate) 125 | 126 | gpus = [str(i) for i in range(len(cfg.TRAIN.VIS_GPU.split(',')))] 127 | input_images_split = tf.split(input_images, len(gpus)) 128 | input_score_maps_split = tf.split(input_score_maps, len(gpus)) 129 | input_threshold_maps_split = tf.split(input_threshold_maps, len(gpus)) 130 | input_score_masks_split = tf.split(input_score_masks, 
len(gpus)) 131 | input_threshold_masks_split = tf.split(input_threshold_masks, len(gpus)) 132 | 133 | 134 | tower_grads = [] 135 | reuse_variables = None 136 | total_binarize_acc = 0 137 | total_thresh_binary_acc = 0 138 | for i, gpu_id in enumerate(gpus): 139 | print('gpu_id', gpu_id) 140 | with tf.device('/gpu:' + gpu_id): 141 | with tf.name_scope('model_' + gpu_id) as scope: 142 | gt_imgs = input_images_split[i] 143 | gt_scores = input_score_maps_split[i] 144 | gt_thresholds = input_threshold_maps_split[i] 145 | gt_score_masks = input_score_masks_split[i] 146 | gt_threshold_masks = input_threshold_masks_split[i] 147 | total_loss, model_loss, binarize_map, threshold_map, thresh_binary = \ 148 | tower_loss(gt_imgs, gt_scores, gt_thresholds, gt_score_masks, gt_threshold_masks, reuse_variables) 149 | binarize_acc, thresh_binary_acc = compute_acc(binarize_map, threshold_map, thresh_binary, 150 | gt_scores, gt_thresholds, gt_score_masks, gt_threshold_masks) 151 | total_binarize_acc += binarize_acc 152 | total_thresh_binary_acc += thresh_binary_acc 153 | reuse_variables = True 154 | 155 | batch_norm_updates_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)) 156 | 157 | grads = opt.compute_gradients(total_loss) 158 | tower_grads.append(grads) 159 | 160 | grads = average_gradients(tower_grads) 161 | apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) 162 | 163 | avg_binarize_acc = total_binarize_acc / len(gpus) 164 | avg_thresh_binary_acc = total_thresh_binary_acc / len(gpus) 165 | 166 | summary_op = tf.summary.merge_all() 167 | 168 | variable_averages = tf.train.ExponentialMovingAverage(cfg["TRAIN"]["MOVING_AVERAGE_DECAY"], global_step) 169 | 170 | variables_averages_op = variable_averages.apply(tf.trainable_variables()) 171 | 172 | with tf.control_dependencies([variables_averages_op, apply_gradient_op, batch_norm_updates_op]): 173 | train_op = tf.no_op(name='train_op') 174 | 175 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=cfg.TRAIN.SAVE_MAX) 176 | 177 | 178 | train_logs_dir = os.path.join(cfg.TRAIN.TRAIN_LOGS, 'train') 179 | val_logs_dir = os.path.join(cfg.TRAIN.TRAIN_LOGS, 'val') 180 | 181 | make_dir(train_logs_dir) 182 | make_dir(val_logs_dir) 183 | 184 | train_summary_writer = tf.summary.FileWriter(train_logs_dir, tf.get_default_graph()) 185 | val_summary_writer = tf.summary.FileWriter(val_logs_dir, tf.get_default_graph()) 186 | 187 | 188 | init = tf.global_variables_initializer() 189 | 190 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 191 | try: 192 | 193 | if cfg["TRAIN"]["RESTORE"]: 194 | train_logger.info('continue training from previous checkpoint') 195 | ckpt = tf.train.get_checkpoint_state(cfg["TRAIN"]["RESTORE_CKPT_PATH"]) 196 | train_logger.info('restore model path: %s', ckpt.model_checkpoint_path) 197 | saver.restore(sess, ckpt.model_checkpoint_path) 198 | train_logger.info("done") 199 | elif cfg["TRAIN"]["PRETRAINED_MODEL_PATH"] is not None: 200 | sess.run(init) 201 | print(cfg["TRAIN"]["PRETRAINED_MODEL_PATH"]) 202 | train_logger.info('load pretrain model: %s', cfg["TRAIN"]["PRETRAINED_MODEL_PATH"]) 203 | variable_restore_op = slim.assign_from_checkpoint_fn(cfg["TRAIN"]["PRETRAINED_MODEL_PATH"], 204 | slim.get_trainable_variables(), 205 | ignore_missing_vars=True) 206 | variable_restore_op(sess) 207 | train_logger.info("done") 208 | 209 | else: 210 | sess.run(init) 211 | except Exception as e: 212 | raise RuntimeError('failed to load model weights: {}'.format(e)) 213 | 214 | train_data_generator = get_batch(num_workers=cfg.TRAIN.NUM_READERS, 215 | 
img_dir=cfg.TRAIN.IMG_DIR, 216 | label_dir=cfg.TRAIN.LABEL_DIR, 217 | batchsize=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(gpus)) 218 | 219 | val_data_generator = get_batch(num_workers=10, 220 | img_dir=cfg.EVAL.IMG_DIR, 221 | label_dir=cfg.EVAL.LABEL_DIR, 222 | batchsize=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(gpus)) 223 | 224 | test_data_generator = get_batch(num_workers=1, 225 | img_dir=cfg.EVAL.IMG_DIR, 226 | label_dir=cfg.EVAL.LABEL_DIR, 227 | batchsize=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(gpus), 228 | is_eval=True) 229 | 230 | test_epoch = 0 231 | 232 | start = time.time() 233 | for step in range(cfg["TRAIN"]["MAX_STEPS"]): 234 | train_data = next(train_data_generator) 235 | 236 | train_feed_dict = {input_images: train_data[0], 237 | input_score_maps: train_data[1], 238 | input_threshold_maps: train_data[3], 239 | input_score_masks: train_data[2], 240 | input_threshold_masks: train_data[4]} 241 | 242 | ml, tl, _ = sess.run([model_loss, total_loss, train_op], feed_dict=train_feed_dict) 243 | if np.isnan(tl): 244 | train_logger.info('Loss diverged, stop training') 245 | break 246 | 247 | if step % 10 == 0: 248 | avg_time_per_step = (time.time() - start) / 10 249 | avg_examples_per_second = (10 * cfg["TRAIN"]["BATCH_SIZE_PER_GPU"] * len(gpus)) / (time.time() - start) 250 | start = time.time() 251 | train_logger.info( 252 | '{}->Step {:06d}, model loss {:.4f}, total loss {:.4f}, {:.2f} seconds/step, {:.2f} examples/second'.format( 253 | cfg.TRAIN.VERSION, step, ml, tl, avg_time_per_step, avg_examples_per_second)) 254 | 255 | if step % cfg["TRAIN"]["SAVE_CHECKPOINT_STEPS"] == 0: 256 | saver.save(sess, os.path.join(cfg["TRAIN"]["CHECKPOINTS_OUTPUT_DIR"], 257 | 'DB_' + cfg.BACKBONE + '_' + cfg.TRAIN.VERSION + '_model.ckpt'), 258 | global_step=global_step) 259 | 260 | if step % cfg["TRAIN"]["SAVE_SUMMARY_STEPS"] == 0: 261 | _, tl, train_summary_str = sess.run([train_op, total_loss, summary_op], feed_dict=train_feed_dict) 262 | train_summary_writer.add_summary(train_summary_str, global_step=step) 263 | 264 | val_data = next(val_data_generator) 265 | val_feed_dict = {input_images: val_data[0], 266 | input_score_maps: val_data[1], 267 | input_threshold_maps: val_data[3], 268 | input_score_masks: val_data[2], 269 | input_threshold_masks: val_data[4]} 270 | eval_summary_str = sess.run(summary_op, feed_dict=val_feed_dict) 271 | 272 | val_summary_writer.add_summary(eval_summary_str, global_step=step) 273 | 274 | if step % cfg.EVAL.TEST_STEP == 0 and step != 0: 275 | temp_epoch = test_epoch 276 | train_logger.info('~~~~~~~~~~~~~~~~~~start to test~~~~~~~~~~~~~~~~~~~~~') 277 | avg_bc = [] 278 | avg_tbc = [] 279 | while temp_epoch==test_epoch: 280 | test_data = next(test_data_generator) 281 | test_feed_dict = {input_images: test_data[0], 282 | input_score_maps: test_data[1], 283 | input_threshold_maps: test_data[3], 284 | input_score_masks: test_data[2], 285 | input_threshold_masks: test_data[4]} 286 | test_epoch = test_data[5] 287 | bc, tbc = sess.run([avg_binarize_acc, avg_thresh_binary_acc], 288 | feed_dict=test_feed_dict) 289 | 290 | avg_bc.append(bc) 291 | avg_tbc.append(tbc) 292 | 293 | train_logger.info('avg binarize acc is :{}'.format(sum(avg_bc)/len(avg_bc))) 294 | train_logger.info('avg thresh binary acc is :{}'.format(sum(avg_tbc)/len(avg_tbc))) 295 | 296 | 297 | if __name__ == '__main__': 298 | 299 | main() 300 | 301 | --------------------------------------------------------------------------------