├── .gitignore
├── README.md
├── config.py
├── dataset
│   └── reader.py
├── dfb_opt.py
├── dfb_train.py
├── dfb_utils.py
├── framwork.png
├── models
│   ├── dfb.py
│   └── nets
│       ├── resnet.py
│       └── resnet_utils.py
└── requirements.txt
/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/config.cpython-36.pyc 2 | __pycache__/config.cpython-36.pyc 3 | __pycache__/dfb_opt.cpython-36.pyc 4 | __pycache__/dfb_utils.cpython-36.pyc 5 | dataset/__pycache__/reader.cpython-36.pyc 6 | models/__pycache__/dfb.cpython-36.pyc 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DFL-CNN-tensorflow 2 | 3 | Implementation of the CVPR 2018 paper [Learning a Discriminative Filter Bank within a CNN for Fine-grained Recognition](https://arxiv.org/abs/1611.09932) in TensorFlow. 4 | 5 | ## Abstract 6 | 7 | Compared to earlier multistage frameworks using CNN features, recent end-to-end deep approaches for fine-grained recognition essentially enhance the mid-level learning capability of CNNs. Previous approaches achieve this by introducing an auxiliary network to infuse localization information into the main classification network, or a sophisticated feature encoding method to capture higher order feature statistics. We show that mid-level representation learning can be enhanced within the CNN framework, by learning a bank of convolutional filters that capture class-specific discriminative patches without extra part or bounding box annotations. Such a filter bank is well structured, properly initialized and discriminatively learned through a novel asymmetric multi-stream architecture with convolutional filter supervision and a non-random layer initialization. Experimental results show that our approach achieves state-of-the-art on three publicly available fine-grained recognition datasets (CUB-200-2011, Stanford Cars and FGVC-Aircraft). Ablation studies and visualizations are provided to understand our approach.
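As a concrete anchor for the description above, the three streams built in `models/dfb.py` (object-level logits `fc_obj`, part-level logits `fc_part` from the discriminative filter bank, and the cross-channel-pooling branch `fc_ccp`) are fused for both the prediction and the loss in `dfb_train.py`:

```python
# Three-stream fusion used by dfb_train.py
predictions = (tf.nn.softmax(fc_part) + 0.1 * tf.nn.softmax(fc_ccp) + tf.nn.softmax(fc_obj)) / 3.

obj_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=fc_obj))
part_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=fc_part))
ccp_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=fc_ccp))
loss = obj_loss + part_loss + 0.1 * ccp_loss
```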
8 | 9 | ## Framework 10 | 11 | ![framework](framwork.png) 12 | 13 | ## Requirements 14 | 15 | First install TensorFlow, then install the other Python packages: 16 | >pip install -r requirements.txt 17 | 18 | ## Usage 19 | 20 | >python dfb_train.py 21 | 22 | ## Reference 23 | 24 | - [Learning a Discriminative Filter Bank within a CNN for Fine-grained Recognition](https://arxiv.org/abs/1611.09932) 25 | 26 | 27 | ## TODO 28 | 29 | - [ ] Non-random layer initialization 30 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import tensorflow as tf 3 | """ 4 | " Log Configuration 5 | """ 6 | tf.app.flags.DEFINE_string(name="data_dir", default=r"E:\plant", help="The root directory of the dataset.") 7 | 8 | tf.app.flags.DEFINE_string(name="train_dir", default="pdr2018_trainingset_20181023", help="The sub-directory of the training set.") 9 | 10 | tf.app.flags.DEFINE_string(name="test_dir", default="pdr2018_testa_20181023", help="The sub-directory of the test set.") 11 | 12 | tf.app.flags.DEFINE_string(name="valid_dir", default="pdr2018_validationset_20181023", help="The sub-directory of the validation set.") 13 | 14 | tf.app.flags.DEFINE_string(name="train_json", default="AgriculturalDisease_train_annotations.json", help="The filename of the training annotation json.") 15 | 16 | tf.app.flags.DEFINE_string(name="valid_json", default="AgriculturalDisease_validation_annotations.json", help="The filename of the validation annotation json.") 17 | 18 | tf.app.flags.DEFINE_string(name="logs_dir", default="", help="The directory for logs and checkpoints.") 19 | 20 | tf.app.flags.DEFINE_integer(name="batch_size", default=55, help="The number of samples in each batch.") 21 | 22 | tf.app.flags.DEFINE_integer(name="num_class", default=61, help="The number of classes.") 23 | 24 | tf.app.flags.DEFINE_integer(name="num_train", default=31000, help="The number of training samples.") 25 | 26 | tf.app.flags.DEFINE_integer(name="num_valid", default=4000, help="The number of validation samples.") 27 | 28 | 29 | tf.app.flags.DEFINE_integer(name="epoches", default=1000, help="The maximum number of training epochs.") 30 | 31 | tf.app.flags.DEFINE_integer(name="verbose", default=8, help="The number of training steps between printing the loss and accuracy.") 32 | 33 | tf.app.flags.DEFINE_integer(name="patience", default=2, help="The number of epochs without improvement before the learning rate is decayed.") 34 | 35 | tf.app.flags.DEFINE_boolean(name="debug", default=True, help="Debug mode or not.") 36 | 37 | tf.app.flags.DEFINE_string(name="mode", default="train", help="Mode: train / test / visualize") 38 | FLAGS = tf.app.flags.FLAGS -------------------------------------------------------------------------------- /dataset/reader.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import json 3 | import os 4 | import tensorflow as tf 5 | import random 6 | import cv2 7 | import numpy as np 8 | import math 9 | import time 10 | from skimage import transform 11 | from PIL import Image, ImageEnhance, ImageFilter 12 | from config import * 13 | 14 | def cv_imread(filePath): 15 | cv_img = cv2.imdecode(np.fromfile(filePath, dtype=np.uint8), -1) 16 | return cv_img 17 | def get_aug_data(split_dir, image_dir, py_dict, norm=True): 18 | split_path = os.path.join(FLAGS.data_dir, split_dir) 19 | image_path = os.path.join(split_path, image_dir) 20 | disease_class = py_dict['disease_class'] 21 | image_id = py_dict['image_id'] 22 | zero_array = np.zeros(FLAGS.num_class) 23 | zero_array[disease_class] = 1 24 | label = zero_array 25 | img = 
cv_imread(os.path.join(image_path, image_id)) 26 | # random_crop 27 | # img = _random_crop(img) 28 | # flip 29 | img = _flip(img) 30 | # resized 31 | # img = _resize(img) 32 | # img = _pad(img) 33 | # rotation 34 | img = _rotation(img) 35 | img = np.asarray(img) 36 | # color aug 37 | # img = _color_augment(img) 38 | if norm: 39 | img = img / 255. 40 | return img, label 41 | 42 | def aux_generator(split_dir, json_file, norm=True): 43 | # path 44 | split_path = os.path.join(FLAGS.data_dir, split_dir) 45 | image_path = os.path.join(split_path, 'pad_images') 46 | with open(os.path.join(split_path, json_file), 'r', encoding='utf-8') as f: 47 | py_list = json.load(f) 48 | images = [] 49 | labels = [] 50 | set_size = len(py_list) 51 | while True: 52 | images = [] 53 | labels = [] 54 | random.shuffle(py_list) # shuffle once per batch so the batch contains distinct samples 55 | i = 0 56 | while i < FLAGS.batch_size: 57 | img, label = get_aug_data(split_dir, 'pad_images', py_list[i], norm) 58 | images.append(img) 59 | labels.append(label) 60 | i += 1 61 | images = np.asarray(images) 62 | labels = np.asarray(labels) 63 | yield images, labels 64 | 65 | def generator(split_dir, json_file, norm=True): 66 | return aux_generator(split_dir, json_file, norm) 67 | 68 | def _color_augment(image): 69 | image.flags.writeable = True # make the array writable 70 | image = Image.fromarray(np.uint8(image)) 71 | # image.show() 72 | # brightness enhancement 73 | if random.choice([0, 1]): 74 | enh_bri = ImageEnhance.Brightness(image) 75 | brightness = random.choice([0.6, 0.8, 1.2, 1.4]) 76 | image = enh_bri.enhance(brightness) 77 | # image.show() 78 | # color (saturation) enhancement 79 | if random.choice([0, 1]): 80 | enh_col = ImageEnhance.Color(image) 81 | color = random.choice([0.6, 0.8, 1.2, 1.4]) 82 | image = enh_col.enhance(color) 83 | # image.show() 84 | # contrast enhancement 85 | if random.choice([0, 1]): 86 | enh_con = ImageEnhance.Contrast(image) 87 | contrast = random.choice([0.6, 0.8, 1.2, 1.4]) 88 | image = enh_con.enhance(contrast) 89 | # image.show() 90 | # sharpness enhancement 91 | if random.choice([0, 1]): 92 | enh_sha = ImageEnhance.Sharpness(image) 93 | sharpness = random.choice([0.6, 0.8, 1.2, 1.4]) 94 | image = enh_sha.enhance(sharpness) 95 | # image.show() 96 | # blur 97 | if random.choice([0, 1]): 98 | image = image.filter(ImageFilter.BLUR) 99 | image = np.asarray(image) 100 | return image 101 | 102 | def _flip(image): 103 | if random.choice([0, 1]): 104 | direct = random.choice([-1, 0, 1]) 105 | image = cv2.flip(image, direct) 106 | return image 107 | 108 | def _resize(image): 109 | if random.choice([0, 1]): 110 | h, w = image.shape[:2] 111 | ratio = random.choice([0.7, 0.8, 0.9, 1.1, 1.2, 1.3]) 112 | resized_h = math.ceil(h * ratio) 113 | resized_w = math.ceil(w * ratio) 114 | image = cv2.resize(image, (resized_w, resized_h)) 115 | return image 116 | 117 | 118 | def _shift(image): 119 | if random.choice([0, 1]): 120 | h, w = image.shape[:2] 121 | shift_h = random.choice([-h/4, h/4, -h/3, h/3]) 122 | shift_w = random.choice([-w/4, w/4, -w/3, w/3]) 123 | # The first row of the matrix is [1, 0, x]: x is the horizontal shift (positive moves right, negative moves left). 124 | # The second row is [0, 1, y]: y is the vertical shift (positive moves down, negative moves up). 125 | shift_mat = np.float32([[1, 0, shift_w],[0, 1, shift_h]]) 126 | image = cv2.warpAffine(image, shift_mat, (w, h)) 127 | return image 128 | 129 | def _random_crop(image): 130 | if random.choice([0, 1]): 131 | size = image.shape 132 | h = size[0] 133 | w = size[1] 134 | ratio = random.uniform(0.7, 0.8) 135 | h_beg = math.floor(random.uniform(0, h * (1 - ratio))) 136 | w_beg = math.floor(random.uniform(0, w * (1 - ratio))) 137 | dh = math.floor(h * 
ratio) 138 | dw = math.floor(w * ratio) 139 | 140 | img_crop = image[h_beg:h_beg + dh, w_beg:w_beg + dw, :] 141 | image = cv2.resize(img_crop, (w, h)) 142 | return image 143 | 144 | def _rotation(image): 145 | if random.choice([0, 1]): 146 | r_angle = random.randint(-90, 90) 147 | h, w = image.shape[:2] 148 | # first argument: rotation center (x, y), second: rotation angle, third: scale factor 149 | M = cv2.getRotationMatrix2D((w / 2, h / 2), r_angle, 1.) 150 | # third argument of warpAffine: size of the output image 151 | # w_rot = math.ceil(h*math.sin(r_angle/180*math.pi)+w*math.cos(r_angle/180*math.pi))+2 152 | # h_rot = math.ceil(h*math.cos(r_angle/180*math.pi)+w*math.sin(r_angle/180*math.pi))+2 153 | image = cv2.warpAffine(image, M, (w, h)) 154 | return image 155 | def _pad(image): 156 | # padding 157 | img_h = image.shape[0] 158 | img_w = image.shape[1] 159 | if max(img_h, img_w) >= 448: 160 | ratio = max(img_h, img_w) / 447 161 | image = cv2.resize(image, (math.ceil(img_w / ratio), math.ceil(img_h / ratio))) 162 | img_h = image.shape[0] 163 | img_w = image.shape[1] 164 | img_pad = np.zeros((448, 448, 3), dtype=np.float32) 165 | h_pad_beg = (448 - img_h) // 2 166 | w_pad_beg = (448 - img_w) // 2 167 | h_pad_end = h_pad_beg + img_h 168 | w_pad_end = w_pad_beg + img_w 169 | img_pad[h_pad_beg:h_pad_end, w_pad_beg:w_pad_end, :] = image[:, :, :] 170 | return img_pad 171 | # if __name__=='__main__': 172 | 173 | -------------------------------------------------------------------------------- /dfb_opt.py: -------------------------------------------------------------------------------- 1 | #coding : utf-8 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib import layers 9 | from tensorflow.contrib.framework.python.ops import arg_scope 10 | from tensorflow.contrib.layers.python.layers import layers as layers_lib 11 | from tensorflow.contrib.layers.python.layers import regularizers 12 | from tensorflow.contrib.layers.python.layers import utils 13 | from tensorflow.python.ops import array_ops 14 | from tensorflow.python.ops import init_ops 15 | from tensorflow.python.ops import nn_ops 16 | from tensorflow.python.ops import variable_scope 17 | 18 | from tensorflow.contrib import slim 19 | import os 20 | import sys 21 | import numpy as np 22 | 23 | def train(loss_val, base_var_list, var_list, lr, clip_value): # backbone (base) variables use lr*0.01, the new DFL layers use lr 24 | opt = tf.train.AdamOptimizer 25 | fc_optimizer = opt(learning_rate=lr) 26 | net_optimizer = opt(learning_rate=lr*0.01) 27 | grads = tf.gradients(loss_val, var_list) 28 | net_grads = grads[:len(base_var_list)] 29 | fc_grads = grads[len(base_var_list):] 30 | clipped_net_grads = [(tf.clip_by_value(grad, -clip_value, clip_value), var) for grad, var in zip(net_grads, var_list[:len(base_var_list)]) if grad is not None] 31 | clipped_fc_grads = [(tf.clip_by_value(grad, -clip_value, clip_value), var) for grad, var in zip(fc_grads, var_list[len(base_var_list):]) if grad is not None] 32 | for grad, var in clipped_fc_grads: 33 | tf.summary.histogram(var.op.name + "/gradient", grad) 34 | for grad, var in clipped_net_grads: 35 | tf.summary.histogram(var.op.name + "/gradient", grad) 36 | train_fc = fc_optimizer.apply_gradients(clipped_fc_grads) 37 | train_net = net_optimizer.apply_gradients(clipped_net_grads) 38 | train_op = tf.group(train_fc, train_net) 39 | return train_op 40 | -------------------------------------------------------------------------------- /dfb_train.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding:utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import sys 7 | import tensorflow as tf 8 | from tensorflow.contrib import slim 9 | from models.nets.resnet import resnet_v2 10 | from models.nets import resnet_utils 11 | import os 12 | import time 13 | import numpy as np 14 | from time import time 15 | from dataset import reader 16 | from models.dfb import dfb 17 | from dfb_opt import train 18 | import dfb_utils 19 | from PIL import Image 20 | from config import * 21 | 22 | 23 | M = FLAGS.num_class 24 | k = 10 25 | 26 | def main(argv=None): 27 | is_training = True 28 | input_images = tf.placeholder(dtype=tf.float32, 29 | shape=[FLAGS.batch_size, 448, 448, 3], 30 | name="input_images") 31 | y_true = tf.placeholder(dtype=tf.float32, 32 | shape=[FLAGS.batch_size, FLAGS.num_class], 33 | name="y_true") 34 | keep_prob = tf.placeholder(dtype=tf.float32, 35 | name="dropout") 36 | learning_rate = tf.placeholder(dtype=tf.float64, 37 | name="learning_rate") 38 | clip_value = tf.placeholder(dtype=tf.float32, 39 | name="clip_value") 40 | 41 | """ 42 | ""inference 43 | """ 44 | fc_obj, fc_part, fc_ccp, base_var_list, t_var_list = dfb(input_images, keep_prob, is_training=is_training) 45 | # Predictions 46 | t_predictions = (tf.nn.softmax(fc_part) + 0.1 * tf.nn.softmax(fc_ccp) + tf.nn.softmax(fc_obj)) / 3. 47 | 48 | """ 49 | ""Loss 50 | """ 51 | obj_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=fc_obj)) 52 | part_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=fc_part)) 53 | ccp_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=fc_ccp)) 54 | loss = 0.1 * ccp_loss + part_loss + obj_loss 55 | acc_top1 = dfb_utils.accuracy_top1(y_true, t_predictions) 56 | acc_top5 = dfb_utils.accuracy_top5(y_true, t_predictions) 57 | """ 58 | ""Summary 59 | """ 60 | tf.summary.scalar("t_obj_loss", obj_loss) 61 | tf.summary.scalar("t_part_loss", part_loss) 62 | tf.summary.scalar("t_ccp_loss", ccp_loss) 63 | tf.summary.scalar("t_loss", loss) 64 | tf.summary.scalar("acc_top1", acc_top1) 65 | tf.summary.scalar("acc_top5", acc_top5) 66 | 67 | train_op = train(loss, base_var_list, t_var_list, learning_rate, clip_value) 68 | print("Setting up summary op...") 69 | summary_op = tf.summary.merge_all() 70 | """ 71 | " Loading Data 72 | """ 73 | print("Loading Data......") 74 | if FLAGS.mode == 'train': 75 | generator_train = reader.generator(FLAGS.train_dir, FLAGS.train_json) 76 | print("\tLoaded Train Data......") 77 | 78 | generator_valid = reader.generator(FLAGS.valid_dir, FLAGS.valid_json) 79 | print("\tLoaded Validation Data......") 80 | 81 | """ 82 | " Setting up Saver 83 | """ 84 | 85 | print("Setting up Saver...") 86 | sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) 87 | saver = tf.train.Saver(t_var_list, max_to_keep=3) 88 | train_writer = tf.summary.FileWriter(os.path.join(FLAGS.logs_dir, 'train'), 89 | sess.graph) 90 | 91 | valid_writer = tf.summary.FileWriter(os.path.join(FLAGS.logs_dir, 'valid'), 92 | sess.graph) 93 | 94 | 95 | print("Initialize global variables") 96 | sess.run(tf.global_variables_initializer()) 97 | 98 | """ 99 | " Resume 100 | """ 101 | ckpt = tf.train.get_checkpoint_state(FLAGS.logs_dir) 102 | if ckpt and ckpt.model_checkpoint_path: 103 | saver.restore(sess, ckpt.model_checkpoint_path) 104 | print("Model restored...") 105 | 106 | """ 107 | " Training... 
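" Note (added): the learning rate warms up linearly to 1e-3 over the first 10 epochs and is then multiplied by 0.8 (together with the gradient-clip value) whenever validation accuracy has not improved for FLAGS.patience consecutive epochs.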
108 | """ 109 | if FLAGS.mode == 'train': 110 | train_batch = int(FLAGS.num_train / FLAGS.batch_size) 111 | valid_batch = int(FLAGS.num_valid / FLAGS.batch_size) 112 | last_loss = 10000. 113 | patience = 0 114 | best_acc = 0.0 115 | clipvalue = 1e-4 116 | current = 1e-4 117 | 118 | for epoch in range(1, FLAGS.epoches if FLAGS.debug else 1): 119 | print("Epoch %i ----> Starting......" % epoch) 120 | start_time = time() 121 | """ 122 | " Build learning rate 123 | """ 124 | if epoch <= 10: 125 | lr = 1e-3 / 10.0 * epoch 126 | current = lr 127 | elif epoch > 10: 128 | lr = current 129 | for step in range(train_batch): 130 | batch_x, batch_y = next(generator_train) 131 | summary, _ , loss_t, loss_ct, loss_ot, loss_pt, acc_1t, acc_5t = \ 132 | sess.run([summary_op, train_op, loss, ccp_loss, obj_loss, part_loss, acc_top1, acc_top5], 133 | feed_dict={input_images: batch_x, 134 | y_true: batch_y, 135 | keep_prob: 0.3, 136 | learning_rate: lr, 137 | clip_value: clipvalue}) 138 | print("Step %i, Loss %0.4f, ccp_loss %0.4f, obj_loss %0.4f, part_loss %0.4f, acc_1 %0.4f, acc_5 %0.4f, lr %.7f, clip %.7f" % 139 | ((epoch-1) * train_batch + step, loss_t, loss_ct, loss_ot, loss_pt, acc_1t, acc_5t, lr, clipvalue)) 140 | 141 | train_writer.add_summary(summary, step + train_batch * (epoch-1)) 142 | acc1_reg = [] 143 | acc5_reg = [] 144 | loss_reg = [] 145 | for step in range(valid_batch): 146 | batch_x, batch_y = next(generator_valid) 147 | loss_v, acc_1v, acc_5v, summary = sess.run([loss, acc_top1, acc_top5, summary_op], 148 | feed_dict={input_images: batch_x, 149 | y_true: batch_y, 150 | keep_prob: 1., 151 | learning_rate: lr, 152 | clip_value: clipvalue}) 153 | valid_writer.add_summary(summary, step + valid_batch * (epoch-1)) 154 | acc1_reg.append(acc_1v) 155 | acc5_reg.append(acc_5v) 156 | loss_reg.append(loss_v) 157 | avg_acc1 = np.mean(np.array(acc1_reg)) 158 | avg_acc5 = np.mean(np.array(acc5_reg)) 159 | avg_loss = np.mean(np.array(loss_reg)) 160 | print("Valid_loss ----> %0.4f Valid_acc ----> %0.4f, %0.4f" % (avg_loss, avg_acc1, avg_acc5)) 161 | """ 162 | " Save the best model 163 | """ 164 | 165 | if avg_acc1 > best_acc: 166 | best_acc = avg_acc1 167 | saver.save(sess=sess, 168 | save_path=os.path.join(FLAGS.logs_dir, 'teacher'), 169 | global_step=epoch) 170 | 171 | print("Saved the best model with val_acc %0.4f" % best_acc) 172 | else: 173 | print("Val_acc did not improve; best is still %0.4f" % best_acc) 174 | patience += 1 175 | if patience >= FLAGS.patience: 176 | patience = 0 177 | last_loss = 10000 178 | current = current * 0.8 179 | clipvalue = clipvalue * 0.8 180 | print("Lr decayed, new learning rate = %0.7f" % current) 181 | end_time = time() 182 | print("Epoch %i ----> Ended in %0.4f s" % (epoch, end_time - start_time)) 183 | train_writer.close() 184 | valid_writer.close() 185 | print("......Ended") 186 | 187 | print("Ending......") 188 | 189 | if __name__ == "__main__": 190 | tf.app.run() 191 | -------------------------------------------------------------------------------- /dfb_utils.py: -------------------------------------------------------------------------------- 1 | #coding : utf-8 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import tensorflow as tf 7 | from tensorflow.contrib import slim 8 | import os 9 | import sys 10 | import numpy as np 11 | 12 | def accuracy_top1(y_true, predictions): 13 | acc_top1 = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_true, axis=-1), tf.argmax(predictions, axis=-1)), tf.float32), axis=-1) 
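# Note (added): y_true is one-hot, so comparing the argmax of y_true with the argmax of predictions gives a 0/1 value per sample, and the mean over the batch is the top-1 accuracy.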
14 | return acc_top1 15 | 16 | def accuracy_top5(y_true, predictions): 17 | acc_top5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(predictions, tf.argmax(y_true, axis=-1), k=5), tf.float32), axis=-1) 18 | return acc_top5 19 | 20 | def accuracy_top3(y_true, predictions): 21 | acc_top3 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(predictions, tf.argmax(y_true, axis=-1), k=3), tf.float32), axis=-1) 22 | return acc_top3 23 | 24 | def focal_loss(targets, logits): 25 | one_vector = tf.ones(logits.get_shape().as_list(), logits.dtype.base_dtype) 26 | _epsilon = tf.convert_to_tensor(1e-12, logits.dtype.base_dtype) 27 | logits = tf.clip_by_value(logits, _epsilon, 1. - _epsilon) 28 | return tf.reduce_mean(-tf.reduce_sum((one_vector - logits) ** 2 * targets * tf.log(logits), axis=-1), axis=0) 29 | 30 | def smooth_l1_loss(targets, logits): 31 | one_vector = tf.ones(logits.get_shape().as_list(), logits.dtype.base_dtype) 32 | smoothl1_loss = 0.5*tf.reduce_mean(tf.cast(tf.less(tf.abs(logits-targets), one_vector),tf.float32)*tf.square(logits-targets)) \ 33 | + tf.reduce_mean((one_vector-tf.cast(tf.less(tf.abs(logits-targets), one_vector),tf.float32))*(tf.abs(logits-targets)-0.5*one_vector)) 34 | return smoothl1_loss 35 | 36 | def ohkm(loss, batch_size): 37 | ohkm_loss = 0. 38 | for i in range(batch_size): 39 | sub_loss = loss[i] 40 | topk_val, topk_idx = tf.nn.top_k(sub_loss, 41 | k=8, 42 | sorted=False, name='ohkm{}'.format(i)) 43 | tmp_loss = tf.gather(sub_loss, topk_idx, name='ohkm_loss{}'.format(i)) # can be ignore ??? 44 | ohkm_loss += tf.reduce_sum(tmp_loss) / 8 45 | ohkm_loss /= batch_size 46 | return ohkm_loss -------------------------------------------------------------------------------- /framwork.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mengrang/DFL-CNN-tensorflow/305f35a4701b1f91ac988b0a03f0149d1f8426c0/framwork.png -------------------------------------------------------------------------------- /models/dfb.py: -------------------------------------------------------------------------------- 1 | # -*- coding : utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib import layers 9 | from tensorflow.contrib.framework.python.ops import arg_scope 10 | from tensorflow.contrib.layers.python.layers import layers as layers_lib 11 | from tensorflow.contrib.layers.python.layers import regularizers 12 | from tensorflow.contrib.layers.python.layers import utils 13 | from tensorflow.python.ops import array_ops 14 | from tensorflow.python.ops import init_ops 15 | from tensorflow.python.ops import nn_ops 16 | from tensorflow.python.ops import variable_scope 17 | 18 | from tensorflow.contrib import slim 19 | from models.nets.resnet import resnet_v2 20 | import os 21 | import time 22 | import sys 23 | import numpy as np 24 | from time import time 25 | 26 | M = 61 27 | k = 10 28 | def dfb(input_images, 29 | keep_prob, 30 | is_training=True, 31 | weight_decay=5e-5, 32 | batch_norm_decay=0.99, 33 | batch_norm_epsilon=0.001): 34 | with tf.variable_scope("Teacher_model"): 35 | net, endpoints = resnet_v2(inputs=input_images, 36 | num_classes=M, 37 | is_training=True, 38 | scope='resnet_v2') 39 | 40 | base_var_list = slim.get_model_variables('Teacher_model/resnet_v2') 41 | 42 | part_feature = endpoints["InvertedResidual_{}_{}".format(1024, 3)] 43 | object_feature = endpoints["InvertedResidual_{}_{}".format(1024, 
5)] 44 | 45 | object_feature_h = object_feature.get_shape().as_list()[1] 46 | object_feature_w = object_feature.get_shape().as_list()[2] 47 | fc_obj = slim.max_pool2d(object_feature, (object_feature_h, object_feature_w), scope="GMP1") 48 | batch_norm_params = { 49 | 'center': True, 50 | 'scale': True, 51 | 'decay': batch_norm_decay, 52 | 'epsilon': batch_norm_epsilon, 53 | } 54 | 55 | fc_obj = slim.conv2d(fc_obj, 56 | M, 57 | [1, 1], 58 | activation_fn=None, 59 | weights_regularizer=tf.contrib.layers.l2_regularizer(weight_decay), 60 | biases_regularizer=tf.contrib.layers.l2_regularizer(weight_decay), 61 | scope='fc_obj') 62 | fc_obj = tf.nn.dropout(fc_obj, keep_prob=keep_prob) 63 | fc_obj = slim.flatten(fc_obj) 64 | fc_part = slim.conv2d(part_feature, 65 | M * k, # number of convolution kernels (M classes x k filters per class) 66 | [1, 1], # kernel height and width 67 | activation_fn=tf.nn.relu, 68 | normalizer_fn=slim.batch_norm, # use batch norm as the normalizer 69 | normalizer_params=batch_norm_params, 70 | weights_regularizer=tf.contrib.layers.l2_regularizer(weight_decay), 71 | biases_regularizer=tf.contrib.layers.l2_regularizer(weight_decay) 72 | ) 73 | fc_part_h = fc_part.get_shape().as_list()[1] 74 | fc_part_w = fc_part.get_shape().as_list()[2] 75 | fc_part = slim.max_pool2d(fc_part, (fc_part_h, fc_part_w), scope="GMP2") 76 | ft_list = tf.split(fc_part, 77 | num_or_size_splits=M, 78 | axis=-1) # split along the last (channel) dimension 79 | cls_list = [] 80 | for i in range(M): 81 | ft = tf.transpose(ft_list[i], [0, 1, 3, 2]) 82 | cls = layers_lib.pool(ft, 83 | [1, k], 84 | "AVG") 85 | cls = layers.flatten(cls) 86 | cls_list.append(cls) 87 | fc_ccp = tf.concat(cls_list, axis=-1) # cross_channel_pooling (N, M) 88 | 89 | fc_part = slim.conv2d(fc_part, 90 | M, 91 | [1, 1], 92 | activation_fn=None, 93 | weights_regularizer=tf.contrib.layers.l2_regularizer(weight_decay), 94 | biases_regularizer=tf.contrib.layers.l2_regularizer(weight_decay), 95 | scope="fc_part") 96 | fc_part = tf.nn.dropout(fc_part, keep_prob=keep_prob) 97 | fc_part = slim.flatten(fc_part) 98 | t_var_list = slim.get_model_variables() 99 | return fc_obj, fc_part, fc_ccp, base_var_list, t_var_list 100 | -------------------------------------------------------------------------------- /models/nets/resnet.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from tensorflow.contrib.layers.python.layers import layers 6 | from tensorflow.contrib import layers as layers_lib 7 | from tensorflow.contrib.framework.python.ops import add_arg_scope 8 | from tensorflow.contrib.framework.python.ops import arg_scope 9 | from tensorflow.contrib.layers.python.layers import initializers 10 | from tensorflow.contrib.layers.python.layers import regularizers 11 | from tensorflow.contrib.layers.python.layers import utils 12 | from tensorflow.python.framework import ops 13 | from tensorflow.python.ops import array_ops 14 | from tensorflow.python.ops import nn_ops 15 | from tensorflow.python.ops import variable_scope 16 | 17 | from collections import namedtuple 18 | import functools 19 | 20 | import tensorflow as tf 21 | 22 | slim = tf.contrib.slim 23 | 24 | # _CONV_DEFS specifies the backbone body (a stack of ResNet-v2-style bottleneck blocks) 25 | Conv = namedtuple('Conv', ['kernel', 'stride', 'depth']) 26 | InvertedResidual = namedtuple('InvertedResidual', ['kernel', 'stride', 'depth', 'num', 'bottle_depth']) 27 | _CONV_DEFS = [ 28 | Conv(kernel=[7, 7], stride=2, depth=64), 29 | InvertedResidual(kernel=[3, 3], stride=2, depth=128, num=1, 
bottle_depth=64), 30 | InvertedResidual(kernel=[3, 3], stride=2, depth=256, num=3, bottle_depth=64), 31 | InvertedResidual(kernel=[3, 3], stride=2, depth=512, num=4, bottle_depth=128), 32 | InvertedResidual(kernel=[3, 3], stride=2, depth=1024, num=6, bottle_depth=256), 33 | 34 | ] 35 | 36 | def subsample(inputs, factor, scope=None): 37 | if factor == 1: 38 | return inputs 39 | else: 40 | return layers.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) 41 | 42 | def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): 43 | if stride == 1: 44 | return layers_lib.conv2d( 45 | inputs, 46 | num_outputs, 47 | kernel_size, 48 | stride=1, 49 | rate=rate, 50 | padding='SAME', 51 | scope=scope) 52 | else: 53 | kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) 54 | pad_total = kernel_size_effective - 1 55 | pad_beg = pad_total // 2 56 | pad_end = pad_total - pad_beg 57 | inputs = array_ops.pad( 58 | inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) 59 | return layers_lib.conv2d( 60 | inputs, 61 | num_outputs, 62 | kernel_size, 63 | stride=stride, 64 | rate=rate, 65 | padding='VALID', 66 | scope=scope) 67 | 68 | 69 | @slim.add_arg_scope 70 | def _inverted_residual_bottleneck(inputs, depth, stride, bottleneck_depth, scope=None): 71 | with tf.variable_scope(scope, 'InvertedResidual', [inputs]) as sc: 72 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) 73 | preact = slim.batch_norm(inputs, activation_fn=tf.nn.relu, scope='preact') 74 | if depth == depth_in: 75 | shortcut = subsample(inputs, stride, 'shortcut') 76 | else: 77 | shortcut = slim.conv2d(preact, depth, 1, stride=stride, 78 | activation_fn=None, normalizer_fn=None, scope='shortcut') 79 | output = slim.conv2d(preact, bottleneck_depth, 1, stride=1, 80 | activation_fn=None, normalizer_fn=None, scope='conv1') 81 | """ 82 | slim.conv2d(inputs,num_outputs,kernel_size,stride=1, padding='SAME',data_format=None,rate=1,activation_fn=nn.relu,normalizer_fn=None, 83 | normalizer_params=None,weights_initializer=initializers.xavier_initializer(),weights_regularizer=None, 84 | biases_initializer=init_ops.zeros_initializer(),biases_regularizer=None, 85 | reuse=None,variables_collections=None,outputs_collections=None,trainable=True,scope=None) 86 | """ 87 | # output = slim.conv2d(output, bottleneck_depth, 3, stride=stride, 88 | # activation_fn=None, normalizer_fn=None, scope='conv2') 89 | output = conv2d_same( 90 | output, bottleneck_depth, 3, stride, rate=1, scope='conv2') 91 | 92 | output = slim.conv2d(output, depth, 1, stride=1, 93 | activation_fn=None, normalizer_fn=None, scope='conv3') 94 | 95 | output = shortcut + output 96 | 97 | return output 98 | 99 | 100 | 101 | def resnet_v2_base(inputs, 102 | final_endpoint='InvertedResidual_{}_{}'.format(1024, 5), 103 | min_depth=8, 104 | depth_multiplier=1.0, 105 | conv_defs=None, 106 | output_stride=None, 107 | scope=None): 108 | 109 | depth = lambda d: max(int(d * depth_multiplier), min_depth) 110 | end_points = {} 111 | 112 | # Used to find thinned depths for each layer. 
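# Example (added, illustrative values): with depth_multiplier=0.5 and min_depth=8 the `depth` lambda above gives depth(64) -> 32 and depth(128) -> 64, while depth(12) -> 8 because of the min_depth clamp.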
113 | if depth_multiplier <= 0: 114 | raise ValueError('depth_multiplier is not greater than zero.') 115 | 116 | if conv_defs is None: 117 | conv_defs = _CONV_DEFS 118 | 119 | if output_stride is not None and output_stride not in [8, 16, 32]: 120 | raise ValueError('Only allowed output_stride values are 8, 16, 32.') 121 | 122 | with tf.variable_scope(scope, 'ResNetV2', [inputs]): 123 | with slim.arg_scope([slim.conv2d], padding='SAME'): 124 | current_stride = 1 125 | # The atrous convolution rate parameter. 126 | rate = 1 127 | net = inputs 128 | for i, conv_def in enumerate(conv_defs): 129 | if output_stride is not None and current_stride == output_stride: 130 | # If we have reached the target output_stride, then we need to employ 131 | # atrous convolution with stride=1 and multiply the atrous rate by the 132 | # current unit's stride for use in subsequent layers. 133 | layer_stride = 1 134 | layer_rate = rate 135 | rate *= conv_def.stride 136 | else: 137 | layer_stride = conv_def.stride 138 | layer_rate = 1 139 | current_stride *= conv_def.stride 140 | 141 | if isinstance(conv_def, Conv): 142 | end_point = 'Conv2d_%d' % i 143 | net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel, 144 | stride=conv_def.stride, 145 | normalizer_fn=slim.batch_norm, 146 | # biases_initializer=None, 147 | scope=end_point) 148 | end_points[end_point] = net 149 | if end_point == final_endpoint: 150 | return net, end_points 151 | 152 | elif isinstance(conv_def, InvertedResidual): 153 | for n in range(conv_def.num): 154 | end_point = 'InvertedResidual_{}_{}'.format(conv_def.depth, n) 155 | stride = conv_def.stride if n == 0 else 1 156 | net = _inverted_residual_bottleneck(net, depth(conv_def.depth), stride, conv_def.bottle_depth, scope=end_point) 157 | end_points[end_point] = net 158 | 159 | if end_point == final_endpoint: 160 | return net, end_points 161 | else: 162 | raise ValueError('Unknown convolution type %s for layer %d' 163 | % (conv_def.ltype, i)) 164 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 165 | 166 | 167 | def resnet_v2(inputs, 168 | num_classes=1000, 169 | dropout_keep_prob=0.997, 170 | is_training=True, 171 | min_depth=8, 172 | depth_multiplier=1.0, 173 | conv_defs=None, 174 | reuse=None, 175 | scope='ResNetV2'): 176 | 177 | input_shape = inputs.get_shape().as_list() 178 | if len(input_shape) != 4: 179 | raise ValueError('Invalid input tensor rank, expected 4, was: %d' % 180 | len(input_shape)) 181 | 182 | with tf.variable_scope(scope, 'ResNetV2', [inputs], reuse=reuse) as scope: 183 | with slim.arg_scope([slim.batch_norm, slim.dropout], 184 | is_training=is_training): 185 | net, end_points = resnet_v2_base(inputs, scope=scope, 186 | min_depth=min_depth, 187 | depth_multiplier=depth_multiplier, 188 | conv_defs=conv_defs) 189 | 190 | return net, end_points 191 | 192 | 193 | def wrapped_partial(func, *args, **kwargs): 194 | partial_func = functools.partial(func, *args, **kwargs) 195 | functools.update_wrapper(partial_func, func) 196 | return partial_func 197 | 198 | 199 | def _reduced_kernel_size_for_small_input(input_tensor, kernel_size): 200 | """Define kernel size which is automatically reduced for small input. 201 | 202 | If the shape of the input images is unknown at graph construction time this 203 | function assumes that the input images are large enough. 204 | 205 | Args: 206 | input_tensor: input tensor of size [batch_size, height, width, channels]. 
207 | kernel_size: desired kernel size of length 2: [kernel_height, kernel_width] 208 | 209 | Returns: 210 | a tensor with the kernel size. 211 | """ 212 | shape = input_tensor.get_shape().as_list() 213 | if shape[1] is None or shape[2] is None: 214 | kernel_size_out = kernel_size 215 | else: 216 | kernel_size_out = [min(shape[1], kernel_size[0]), 217 | min(shape[2], kernel_size[1])] 218 | return kernel_size_out 219 | 220 | 221 | def mobilenet_v2_arg_scope(is_training=True, 222 | weight_decay=0.00004, 223 | stddev=0.09, 224 | regularize_depthwise=False): 225 | """Defines the default MobilenetV2 arg scope. 226 | 227 | Args: 228 | is_training: Whether or not we're training the model. 229 | weight_decay: The weight decay to use for regularizing the model. 230 | stddev: The standard deviation of the trunctated normal weight initializer. 231 | regularize_depthwise: Whether or not apply regularization on depthwise. 232 | 233 | Returns: 234 | An `arg_scope` to use for the mobilenet v2 model. 235 | """ 236 | batch_norm_params = { 237 | 'is_training': is_training, 238 | 'center': True, 239 | 'scale': True, 240 | 'decay': 0.997, 241 | 'epsilon': 0.001, 242 | } 243 | 244 | # Set weight_decay for weights in Conv and DepthSepConv layers. 245 | weights_init = tf.truncated_normal_initializer(stddev=stddev) 246 | regularizer = tf.contrib.layers.l2_regularizer(weight_decay) 247 | if regularize_depthwise: 248 | depthwise_regularizer = regularizer 249 | else: 250 | depthwise_regularizer = None 251 | with slim.arg_scope([slim.conv2d, slim.separable_conv2d], 252 | weights_initializer=weights_init, 253 | activation_fn=tf.nn.relu, normalizer_fn=slim.batch_norm): 254 | with slim.arg_scope([slim.batch_norm], **batch_norm_params): 255 | with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer): 256 | with slim.arg_scope([slim.separable_conv2d], 257 | weights_regularizer=depthwise_regularizer) as sc: 258 | return sc -------------------------------------------------------------------------------- /models/nets/resnet_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains building blocks for various versions of Residual Networks. 16 | Residual networks (ResNets) were proposed in: 17 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 18 | Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015 19 | More variants were introduced in: 20 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 21 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016 22 | We can obtain different ResNet variants by changing the network depth, width, 23 | and form of residual unit. This module implements the infrastructure for 24 | building them. 
Concrete ResNet units and full ResNet networks are implemented in 25 | the accompanying resnet_v1.py and resnet_v2.py modules. 26 | Compared to https://github.com/KaimingHe/deep-residual-networks, in the current 27 | implementation we subsample the output activations in the last residual unit of 28 | each block, instead of subsampling the input activations in the first residual 29 | unit of each block. The two implementations give identical results but our 30 | implementation is more memory efficient. 31 | """ 32 | from __future__ import absolute_import 33 | from __future__ import division 34 | from __future__ import print_function 35 | 36 | import collections 37 | import tensorflow as tf 38 | 39 | slim = tf.contrib.slim 40 | 41 | 42 | class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])): 43 | """A named tuple describing a ResNet block. 44 | Its parts are: 45 | scope: The scope of the `Block`. 46 | unit_fn: The ResNet unit function which takes as input a `Tensor` and 47 | returns another `Tensor` with the output of the ResNet unit. 48 | args: A list of length equal to the number of units in the `Block`. The list 49 | contains one (depth, depth_bottleneck, stride) tuple for each unit in the 50 | block to serve as argument to unit_fn. 51 | """ 52 | 53 | 54 | def subsample(inputs, factor, scope=None): 55 | """Subsamples the input along the spatial dimensions. 56 | Args: 57 | inputs: A `Tensor` of size [batch, height_in, width_in, channels]. 58 | factor: The subsampling factor. 59 | scope: Optional variable_scope. 60 | Returns: 61 | output: A `Tensor` of size [batch, height_out, width_out, channels] with the 62 | input, either intact (if factor == 1) or subsampled (if factor > 1). 63 | """ 64 | if factor == 1: 65 | return inputs 66 | else: 67 | return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) 68 | 69 | 70 | def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): 71 | """Strided 2-D convolution with 'SAME' padding. 72 | When stride > 1, then we do explicit zero-padding, followed by conv2d with 73 | 'VALID' padding. 74 | Note that 75 | net = conv2d_same(inputs, num_outputs, 3, stride=stride) 76 | is equivalent to 77 | net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') 78 | net = subsample(net, factor=stride) 79 | whereas 80 | net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') 81 | is different when the input's height or width is even, which is why we add the 82 | current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). 83 | Args: 84 | inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. 85 | num_outputs: An integer, the number of output filters. 86 | kernel_size: An int with the kernel_size of the filters. 87 | stride: An integer, the output stride. 88 | rate: An integer, rate for atrous convolution. 89 | scope: Scope. 90 | Returns: 91 | output: A 4-D tensor of size [batch, height_out, width_out, channels] with 92 | the convolution output. 
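For example (note added for clarity): with kernel_size=3, stride=2 and rate=1 we get pad_total=2, so the input is zero-padded by one pixel on each side and then convolved with padding='VALID'; this matches the conv-then-subsample behaviour described above regardless of whether the input height or width is even or odd.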
93 | """ 94 | if stride == 1: 95 | return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate, biases_initializer=None, 96 | padding='SAME', scope=scope) 97 | else: 98 | kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) 99 | pad_total = kernel_size_effective - 1 100 | pad_beg = pad_total // 2 101 | pad_end = pad_total - pad_beg 102 | inputs = tf.pad(inputs, 103 | [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) 104 | return slim.conv2d(inputs, num_outputs, kernel_size, stride=stride, biases_initializer=None, 105 | rate=rate, padding='VALID', scope=scope) 106 | 107 | 108 | @slim.add_arg_scope 109 | def stack_blocks_dense(net, blocks, output_stride=None, 110 | store_non_strided_activations=False, 111 | outputs_collections=None): 112 | """Stacks ResNet `Blocks` and controls output feature density. 113 | First, this function creates scopes for the ResNet in the form of 114 | 'block_name/unit_1', 'block_name/unit_2', etc. 115 | Second, this function allows the user to explicitly control the ResNet 116 | output_stride, which is the ratio of the input to output spatial resolution. 117 | This is useful for dense prediction tasks such as semantic segmentation or 118 | object detection. 119 | Most ResNets consist of 4 ResNet blocks and subsample the activations by a 120 | factor of 2 when transitioning between consecutive ResNet blocks. This results 121 | to a nominal ResNet output_stride equal to 8. If we set the output_stride to 122 | half the nominal network stride (e.g., output_stride=4), then we compute 123 | responses twice. 124 | Control of the output feature density is implemented by atrous convolution. 125 | Args: 126 | net: A `Tensor` of size [batch, height, width, channels]. 127 | blocks: A list of length equal to the number of ResNet `Blocks`. Each 128 | element is a ResNet `Block` object describing the units in the `Block`. 129 | output_stride: If `None`, then the output will be computed at the nominal 130 | network stride. If output_stride is not `None`, it specifies the requested 131 | ratio of input to output spatial resolution, which needs to be equal to 132 | the product of unit strides from the start up to some level of the ResNet. 133 | For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1, 134 | then valid values for the output_stride are 1, 2, 6, 24 or None (which 135 | is equivalent to output_stride=24). 136 | store_non_strided_activations: If True, we compute non-strided (undecimated) 137 | activations at the last unit of each block and store them in the 138 | `outputs_collections` before subsampling them. This gives us access to 139 | higher resolution intermediate activations which are useful in some 140 | dense prediction problems but increases 4x the computation and memory cost 141 | at the last unit of each block. 142 | outputs_collections: Collection to add the ResNet block outputs. 143 | Returns: 144 | net: Output tensor with stride equal to the specified output_stride. 145 | Raises: 146 | ValueError: If the target output_stride is not valid. 147 | """ 148 | # The current_stride variable keeps track of the effective stride of the 149 | # activations. This allows us to invoke atrous convolution whenever applying 150 | # the next residual unit would result in the activations having stride larger 151 | # than the target output_stride. 152 | current_stride = 1 153 | 154 | # The atrous convolution rate parameter. 
155 | rate = 1 156 | 157 | for block in blocks: 158 | with tf.variable_scope(block.scope, 'block', [net]) as sc: 159 | block_stride = 1 160 | for i, unit in enumerate(block.args): 161 | if store_non_strided_activations and i == len(block.args) - 1: 162 | # Move stride from the block's last unit to the end of the block. 163 | block_stride = unit.get('stride', 1) 164 | unit = dict(unit, stride=1) 165 | 166 | with tf.variable_scope('unit_%d' % (i + 1), values=[net]): 167 | # If we have reached the target output_stride, then we need to employ 168 | # atrous convolution with stride=1 and multiply the atrous rate by the 169 | # current unit's stride for use in subsequent layers. 170 | if output_stride is not None and current_stride == output_stride: 171 | net = block.unit_fn(net, rate=rate, **dict(unit, stride=1)) 172 | rate *= unit.get('stride', 1) 173 | 174 | else: 175 | net = block.unit_fn(net, rate=1, **unit) 176 | current_stride *= unit.get('stride', 1) 177 | if output_stride is not None and current_stride > output_stride: 178 | raise ValueError('The target output_stride cannot be reached.') 179 | 180 | # Collect activations at the block's end before performing subsampling. 181 | net = slim.utils.collect_named_outputs(outputs_collections, sc.name, net) 182 | 183 | # Subsampling of the block's output activations. 184 | if output_stride is not None and current_stride == output_stride: 185 | rate *= block_stride 186 | else: 187 | net = subsample(net, block_stride) 188 | current_stride *= block_stride 189 | if output_stride is not None and current_stride > output_stride: 190 | raise ValueError('The target output_stride cannot be reached.') 191 | 192 | if output_stride is not None and current_stride != output_stride: 193 | raise ValueError('The target output_stride cannot be reached.') 194 | 195 | return net 196 | 197 | 198 | def resnet_arg_scope(weight_decay=0.0001, 199 | batch_norm_decay=0.997, 200 | batch_norm_epsilon=1e-5, 201 | batch_norm_scale=True, 202 | activation_fn=tf.nn.relu, 203 | use_batch_norm=True, 204 | batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS): 205 | """Defines the default ResNet arg scope. 206 | TODO(gpapan): The batch-normalization related default values above are 207 | appropriate for use in conjunction with the reference ResNet models 208 | released at https://github.com/KaimingHe/deep-residual-networks. When 209 | training ResNets from scratch, they might need to be tuned. 210 | Args: 211 | weight_decay: The weight decay to use for regularizing the model. 212 | batch_norm_decay: The moving average decay when estimating layer activation 213 | statistics in batch normalization. 214 | batch_norm_epsilon: Small constant to prevent division by zero when 215 | normalizing activations by their variance in batch normalization. 216 | batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the 217 | activations in the batch normalization layer. 218 | activation_fn: The activation function which is used in ResNet. 219 | use_batch_norm: Whether or not to use batch normalization. 220 | batch_norm_updates_collections: Collection for the update ops for 221 | batch norm. 222 | Returns: 223 | An `arg_scope` to use for the resnet models. 224 | """ 225 | batch_norm_params = { 226 | 'decay': batch_norm_decay, 227 | 'epsilon': batch_norm_epsilon, 228 | 'scale': batch_norm_scale, 229 | 'updates_collections': batch_norm_updates_collections, 230 | 'fused': None, # Use fused batch norm if possible. 
231 | } 232 | 233 | with slim.arg_scope( 234 | [slim.conv2d], 235 | weights_regularizer=slim.l2_regularizer(weight_decay), 236 | weights_initializer=slim.variance_scaling_initializer(), 237 | activation_fn=activation_fn, 238 | normalizer_fn=slim.batch_norm if use_batch_norm else None, 239 | normalizer_params=batch_norm_params): 240 | with slim.arg_scope([slim.batch_norm], **batch_norm_params): 241 | # The following implies padding='SAME' for pool1, which makes feature 242 | # alignment easier for dense prediction tasks. This is also used in 243 | # https://github.com/facebook/fb.resnet.torch. However the accompanying 244 | # code of 'Deep Residual Learning for Image Recognition' uses 245 | # padding='VALID' for pool1. You can switch to that choice by setting 246 | # slim.arg_scope([slim.max_pool2d], padding='VALID'). 247 | with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc: 248 | return arg_sc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | opencv-python 3 | Pillow 4 | scikit-image 5 | --------------------------------------------------------------------------------
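The repository does not include an inference script; the following is a minimal sketch (the file name and the `predict` helper are hypothetical, and it assumes a checkpoint written by `dfb_train.py` exists under `logs_dir`) of how the trained model could be used for single-image prediction, reusing `dfb()` and the preprocessing from `dataset/reader.py`:

```python
# inference_sketch.py -- illustrative only, not part of this repository.
import numpy as np
import tensorflow as tf

from models.dfb import dfb
from dataset.reader import cv_imread, _pad


def predict(image_path, logs_dir):
    """Return the predicted class index for a single image."""
    images = tf.placeholder(tf.float32, [1, 448, 448, 3], name="input_images")
    keep_prob = tf.placeholder(tf.float32, name="dropout")
    fc_obj, fc_part, fc_ccp, _, var_list = dfb(images, keep_prob, is_training=False)
    # Same three-stream fusion as in dfb_train.py.
    preds = (tf.nn.softmax(fc_part) + 0.1 * tf.nn.softmax(fc_ccp) + tf.nn.softmax(fc_obj)) / 3.

    # Same preprocessing as dataset/reader.py: pad to 448x448 and scale to [0, 1].
    img = _pad(cv_imread(image_path)) / 255.

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(logs_dir)
        tf.train.Saver(var_list).restore(sess, ckpt.model_checkpoint_path)
        p = sess.run(preds, feed_dict={images: img[None, ...], keep_prob: 1.0})
    return int(np.argmax(p, axis=-1)[0])
```

Since `dfb()` builds all of its variables under the `Teacher_model` scope and returns them as its last result, handing that list to `tf.train.Saver` matches how `dfb_train.py` saves its checkpoints.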