├── network ├── __init__.py ├── denet.py └── network.py ├── figs ├── Fig1.png └── demo.png ├── fusion ├── include │ ├── file_io.h │ ├── utility.h │ ├── pro_and_fusion.h │ ├── constant.h │ └── frame.h ├── CMakeLists.txt └── src │ ├── main.cpp │ ├── utility.cpp │ ├── file_io.cpp │ └── pro_and_fusion.cpp ├── README.md ├── data └── data_aug.py ├── predict.py └── train.py /network/__init__.py: -------------------------------------------------------------------------------- 1 | from .denet import DeNet -------------------------------------------------------------------------------- /figs/Fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luohongcheng/BayesianDeNet/HEAD/figs/Fig1.png -------------------------------------------------------------------------------- /figs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luohongcheng/BayesianDeNet/HEAD/figs/demo.png -------------------------------------------------------------------------------- /fusion/include/file_io.h: -------------------------------------------------------------------------------- 1 | #ifndef FILE_IO_H 2 | #define FILE_IO_H 3 | 4 | #include "frame.h" 5 | 6 | void process_frame_paths(std::vector<FramePath>& frame_paths); 7 | 8 | void load_frame(const FramePath& frame_path, Frame& frame, int cols, int rows, float ushort_factor); 9 | 10 | #endif -------------------------------------------------------------------------------- /fusion/include/utility.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILITY_H 2 | #define UTILITY_H 3 | 4 | #include "frame.h" 5 | 6 | cv::Mat resize_depth(const cv::Mat &src, int cols, int rows); 7 | 8 | cv::Mat confidence_to_uncertainty(const cv::Mat &confidence); 9 | 10 | cv::Mat ushort_to_float(const cv::Mat &ushort_map, float ushort_factor); 11 | 12 | #endif -------------------------------------------------------------------------------- /fusion/include/pro_and_fusion.h: -------------------------------------------------------------------------------- 1 | #ifndef PROANDFUSION_H 2 | #define PROANDFUSION_H 3 | 4 | #include "frame.h" 5 | 6 | void propogate_depth(const FramePath &frame_path_pre, const FramePath &frame_path_curr, const cv::Mat& Ki, Frame &frame_pre, Frame &frame_curr, 7 | bool do_post_process); 8 | 9 | 10 | void fuse_depth(Frame &frame, float white_noise); 11 | 12 | #endif -------------------------------------------------------------------------------- /fusion/include/constant.h: -------------------------------------------------------------------------------- 1 | #ifndef CONSTANT_H 2 | #define CONSTANT_H 3 | #include <opencv2/opencv.hpp> 4 | 5 | const float ushort_factor = 5000.0; 6 | const float image_scale = 2.0; 7 | 8 | const float camera_fx = 500 / image_scale; 9 | const float camera_fy = 500 / image_scale; 10 | const float camera_cx = 320 / image_scale; 11 | const float camera_cy = 240 / image_scale; 12 | 13 | const int image_cols = 640 / image_scale; 14 | const int image_rows = 480 / image_scale; 15 | 16 | #endif -------------------------------------------------------------------------------- /fusion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project(fusion) 3 | 4 | set( CMAKE_CXX_FLAGS "--std=gnu++11" ${CMAKE_CXX_FLAGS} ) 5 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall") 6 | set(CMAKE_BUILD_TYPE "Release") 7
| 8 | 9 | find_package(OpenCV REQUIRED) 10 | include_directories(include) 11 | include_directories(/home/lhc/library/eigen-eigen-5a0156e40feb/) 12 | 13 | add_executable(fusion src/main.cpp src/file_io.cpp src/pro_and_fusion.cpp src/utility.cpp) 14 | 15 | 16 | target_link_libraries(fusion ${OpenCV_LIBS}) -------------------------------------------------------------------------------- /fusion/include/frame.h: -------------------------------------------------------------------------------- 1 | #ifndef FRAME_H 2 | #define FRAME_H 3 | 4 | #include <opencv2/opencv.hpp> 5 | 6 | struct FramePath { 7 | FramePath() {} 8 | 9 | std::string rgb_path; 10 | std::string gt_depth_path; 11 | std::string pred_depth_path; 12 | std::string pred_confi_path; 13 | 14 | std::string pose_path; 15 | }; 16 | 17 | 18 | struct Frame { 19 | Frame() {} 20 | 21 | cv::Mat rgb_image; 22 | 23 | //resized gt depth 24 | cv::Mat gt_depth; 25 | //raw depth is used for evaluation 26 | cv::Mat gt_depth_raw; 27 | 28 | cv::Mat observed_depth; 29 | cv::Mat propogated_depth; 30 | cv::Mat fused_depth; 31 | 32 | cv::Mat observed_uncertainty; 33 | cv::Mat propogated_uncertainty; 34 | cv::Mat fused_uncertainty; 35 | 36 | std::vector<float> t; //t_x t_y t_z 37 | std::vector<float> q; //q_x q_y q_z q_w 38 | }; 39 | 40 | #endif -------------------------------------------------------------------------------- /fusion/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "frame.h" 2 | #include "file_io.h" 3 | #include "constant.h" 4 | #include "pro_and_fusion.h" 5 | 6 | int main() { 7 | 8 | cv::Mat K = cv::Mat::eye(3, 3, CV_32F); 9 | K.at<float>(0, 0) = camera_fx; 10 | K.at<float>(1, 1) = camera_fy; 11 | K.at<float>(0, 2) = camera_cx; 12 | K.at<float>(1, 2) = camera_cy; 13 | 14 | std::vector<FramePath> frame_paths; 15 | //process frames 16 | 17 | int fusion_interval = 10; 18 | 19 | if (frame_paths.size() < (size_t)fusion_interval) 20 | return -1; 21 | 22 | Frame frame_pre, frame_current; 23 | // load the first frame 24 | load_frame(frame_paths[0], frame_pre, image_cols, image_rows, ushort_factor); 25 | 26 | for (size_t i = 0; i < frame_paths.size() - fusion_interval; i += fusion_interval) { 27 | 28 | load_frame(frame_paths[i + fusion_interval], frame_current, image_cols, image_rows, ushort_factor); 29 | 30 | propogate_depth(frame_paths[i], frame_paths[i + fusion_interval], K, frame_pre, frame_current, true); 31 | 32 | fuse_depth(frame_current, 0.0); 33 | 34 | frame_pre.fused_depth = frame_current.fused_depth.clone(); 35 | frame_pre.fused_uncertainty = frame_current.fused_uncertainty.clone(); 36 | frame_pre.t = frame_current.t; 37 | frame_pre.q = frame_current.q; 38 | } 39 | return 0; 40 | } -------------------------------------------------------------------------------- /fusion/src/utility.cpp: -------------------------------------------------------------------------------- 1 | #include "utility.h" 2 | 3 | cv::Mat resize_depth(const cv::Mat &src, int cols, int rows) { 4 | 5 | cv::Mat dst = cv::Mat(cv::Size(cols, rows), CV_32FC1); 6 | float scale = src.cols / (float)cols; 7 | for (int i = 0; i < rows; ++i) { 8 | for (int j = 0; j < cols; ++j) { 9 | dst.at<float>(i, j) = src.at<float>((int)(scale * i), (int)(scale * j)); 10 | } 11 | } 12 | return dst; 13 | } 14 | 15 | cv::Mat confidence_to_uncertainty(const cv::Mat &confidence) { 16 | 17 | cv::Mat uncertainty = cv::Mat::zeros(confidence.size(), CV_32FC1); 18 | 19 | for (int i = 0; i < confidence.rows; ++i) { 20 | for (int j = 0; j < confidence.cols; ++j) { 21 | 22 | float value = confidence.at<float>(i, j); 23 | if (value >= 1) 24 | value = 0.99999999; 25 | if (value
<= 0 ) 26 | value = 1e-6; 27 | uncertainty.at<float>(i, j) = pow(-log(value), 2); 28 | } 29 | } 30 | return uncertainty; 31 | } 32 | 33 | cv::Mat ushort_to_float(const cv::Mat &ushort_map, float ushort_factor) { 34 | 35 | cv::Mat float_map = cv::Mat::zeros(ushort_map.size(), CV_32FC1); 36 | 37 | for (int i = 0; i < ushort_map.rows; ++i) { 38 | for (int j = 0; j < ushort_map.cols; ++j) { 39 | float_map.at<float>(i, j) = (ushort_map.at<ushort>(i, j) / ushort_factor); 40 | } 41 | } 42 | return float_map; 43 | } -------------------------------------------------------------------------------- /fusion/src/file_io.cpp: -------------------------------------------------------------------------------- 1 | #include <fstream> 2 | #include <cstdlib> 3 | #include "file_io.h" 4 | #include "utility.h" 5 | 6 | void load_frame(const FramePath& frame_path, Frame& frame, int cols, int rows, float ushort_factor) { 7 | 8 | cv::Mat gt_depth = cv::imread(frame_path.gt_depth_path, -1); 9 | 10 | cv::Mat gt_depth_float = ushort_to_float(gt_depth, ushort_factor); 11 | frame.gt_depth_raw = gt_depth_float.clone(); 12 | 13 | gt_depth_float = resize_depth(gt_depth_float, cols, rows); 14 | frame.gt_depth = gt_depth_float.clone(); 15 | 16 | //load predicted depth 17 | cv::Mat obs_depth = cv::imread(frame_path.pred_depth_path, -1); 18 | obs_depth = ushort_to_float(obs_depth, ushort_factor); 19 | cv::resize(obs_depth, obs_depth, cv::Size(cols, rows)); 20 | frame.observed_depth = obs_depth.clone(); 21 | frame.fused_depth = obs_depth.clone(); 22 | 23 | //load predicted confidence and convert to uncertainty 24 | cv::Mat obs_confidence = cv::imread(frame_path.pred_confi_path, -1); 25 | obs_confidence = ushort_to_float(obs_confidence, ushort_factor); 26 | cv::resize(obs_confidence, obs_confidence, cv::Size(cols, rows)); 27 | cv::Mat obs_uncertainty = confidence_to_uncertainty(obs_confidence); 28 | frame.observed_uncertainty = obs_uncertainty.clone(); 29 | frame.fused_uncertainty = obs_uncertainty.clone(); 30 | 31 | //load camera poses 32 | std::ifstream ifs(frame_path.pose_path); 33 | std::string str; 34 | std::vector<float> q_and_t; 35 | while (ifs >> str) { 36 | q_and_t.push_back(std::atof(str.c_str())); 37 | } 38 | ifs.close(); 39 | for (int i = 0; i < 3; ++i) { 40 | frame.t.push_back(q_and_t[i]); 41 | } 42 | for (int i = 3; i < 7; ++i) { 43 | frame.q.push_back(q_and_t[i]); 44 | } 45 | 46 | } 47 | 48 | void process_frame_paths(std::vector<FramePath>& frame_paths) { 49 | //TODO 50 | return; 51 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## BayesianDeNet 2 | 3 | This is the implementation of our paper "Bayesian DeNet: Monocular Depth Prediction and Frame-wise Fusion with Synchronized Uncertainty". 4 | 5 | 6 | 7 | 8 | Demo Video: 9 | 10 | [![IMAGE ALT TEXT](figs/demo.png)](https://www.youtube.com/watch?v=DwH3RtjfQB8) 11 | 12 | ### Dependencies 13 | * TensorFlow 14 | * OpenCV 15 | * NumPy 16 | * Eigen 17 | 18 | ### Filelist.txt 19 | > rgb/00000000.png, depth/00000000.png 20 | > rgb/00000001.png, depth/00000001.png 21 | > rgb/00000002.png, depth/00000002.png 22 | > ...
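Each line of `filelist.txt` pairs an RGB image path with the corresponding ground-truth depth path, separated by a comma; `train.py` keeps only the RGB column and derives the depth path by replacing "rgb" with "depth" in it. Below is a minimal parsing sketch (illustrative only, not part of the repository; the helper name `parse_filelist` is made up here):

    # Illustrative sketch: turn filelist.txt into (rgb_path, depth_path) pairs,
    # mirroring read_rgb_file_names() and the rgb->depth substitution in train.py.
    def parse_filelist(path):
        pairs = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                rgb_path = line.split(',')[0].strip()
                depth_path = rgb_path.replace('rgb', 'depth')
                pairs.append((rgb_path, depth_path))
        return pairs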
23 | 24 | ### Test 25 | Run 26 | 27 | python predict.py --rgb_path='rgb.png' --model_path='model' --depth_path='depth.png' --confidence_path='confidence.png' 28 | 29 | ### Train 30 | To train the model, run 31 | 32 | python train.py --filelist_path='filelist.txt' --pretrain_model_path='resnet50.npy' --output_models_dir='/home/xx/saved_models' 33 | 34 | ### Data Augmentation 35 | Run 36 | 37 | python data_aug.py 38 | 39 | ### Multi-frame Fusion 40 | cd fusion 41 | mkdir build && cd build 42 | cmake .. && make -j4 43 | ./fusion 44 | ### Citation 45 | If you find this code useful, please cite: 46 | > @article{BayesianDeNet, 47 | >         title={Bayesian DeNet: Monocular Depth Prediction and Frame-wise Fusion with Synchronized Uncertainty}, 48 | >         author={X. Yang and Y. Gao and H. Luo and C. Liao and K. Cheng}, 49 | >         journal={IEEE Transactions on Multimedia}, 50 | >         year = {2019}, 51 | > } 52 | 53 | ### Acknowledgement 54 | We thank [FCRN ](https://github.com/iro-cp/FCRN-DepthPrediction) for their released code. -------------------------------------------------------------------------------- /data/data_aug.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import random 4 | 5 | 6 | def scale_transform(image, scale_ratio): 7 | new_width = int(scale_ratio * 640) 8 | new_height = int(scale_ratio * 480) 9 | image_result = cv2.resize(image, (new_width, new_height)) 10 | image_result = image_result[new_height / 2 - 240:new_height / 2 + 240, new_width / 2 - 320:new_width / 2 + 320] 11 | return image_result 12 | 13 | 14 | def color_transform(image, scale_ratio): 15 | image_result = image * scale_ratio 16 | image_result[image_result > 255] = 255 17 | image_result[image_result < 0] = 0 18 | return image_result 19 | 20 | 21 | def flip_transfrom(image, id): 22 | if id == 2: 23 | return image 24 | image_result = cv2.flip(image, id) 25 | return image_result 26 | 27 | 28 | def listdir(path): 29 | file_list = [] 30 | for file in os.listdir(path): 31 | file_list.append(file) 32 | return file_list 33 | 34 | 35 | if __name__ == '__main__': 36 | input_dir = "/home/dataset/nyu_dataset/" 37 | output_dir = "/home/dataset/nyu_dataset_augmented/" 38 | file_list = listdir(input_dir + "rgb/") 39 | 40 | image_count = 0 41 | 42 | for file_name in file_list: 43 | rgb_image = cv2.imread(input_dir + "rgb/" + file_name, 1) 44 | depth_image = cv2.imread(input_dir + "depth/" + file_name, -1) 45 | print input_dir + "rgb/" + file_name 46 | scale_ratio = random.uniform(1, 1.5) 47 | color_ratio = random.uniform(0.6, 1.4) 48 | 49 | # 2^3=8 50 | for flip in [1, 2]: 51 | flipped_rgb_image = flip_transfrom(rgb_image, flip) 52 | flipped_depth_image = flip_transfrom(depth_image, flip) 53 | 54 | for color in [1, color_ratio]: 55 | colored_rgb_image = color_transform(flipped_rgb_image, color) # .astype('uint8') 56 | colored_depth_image = flipped_depth_image 57 | 58 | for scale in [1, scale_ratio]: 59 | scaled_rgb_image = scale_transform(colored_rgb_image, scale); 60 | scaled_depth_image = scale_transform(colored_depth_image, scale); 61 | scaled_depth_image = (scaled_depth_image / scale).astype('uint16') 62 | cv2.imwrite(output_dir + "rgb/%08d.png" % image_count, scaled_rgb_image) 63 | cv2.imwrite(output_dir + "depth/%08d.png" % image_count, scaled_depth_image) 64 | image_count += 1 65 | -------------------------------------------------------------------------------- /predict.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import time 5 | import cv2 6 | import argparse 7 | 8 | import network 9 | 10 | 11 | def predict(rgb_path, model_path, depth_path, confidence_path): 12 | height = 228 13 | width = 304 14 | channels = 3 15 | batch_size = 1 16 | 17 | float_to_int_scale = 5000.0 18 | 19 | input_node = tf.placeholder(tf.float32, shape=(None, height, width, channels)) 20 | net = network.DeNet({'data': input_node}, batch_size, 1, False) 21 | 22 | with tf.Session() as sess: 23 | saver = tf.train.Saver() 24 | saver.restore(sess, model_path) 25 | 26 | # read rgb image 27 | rgb_img = cv2.imread(rgb_path) 28 | src_height, src_width, src_channel = rgb_img.shape 29 | rgb_img = cv2.resize(rgb_img, (width, height)) 30 | 31 | # network forward 32 | rgb_images = np.ndarray([batch_size, height, width, channels]) 33 | rgb_images[0] = rgb_img 34 | pred = sess.run(net.get_output(), feed_dict={input_node: rgb_images}) 35 | 36 | # save depth map 37 | pred_depth = pred[0, :, :, 0] 38 | pred_depth[np.where(pred_depth < 0)] = 0 39 | pred_depth = cv2.resize(pred_depth, (src_width, src_height)) 40 | pred_depth = np.array(pred_depth * float_to_int_scale).astype('uint16') 41 | cv2.imwrite(depth_path, pred_depth) 42 | 43 | # save confidence map 44 | pred_conf = pred[0, :, :, 1] 45 | pred_conf[np.where(pred_conf < 0)] = 0 46 | pred_conf = cv2.resize(pred_conf, (src_width, src_height)) 47 | pred_conf = np.array(pred_conf * float_to_int_scale).astype('uint16') 48 | cv2.imwrite(confidence_path, pred_conf) 49 | 50 | 51 | if __name__ == '__main__': 52 | # parse command line 53 | parser = argparse.ArgumentParser(description=''' 54 | predict the depth and confidence map of an RGB image 55 | ''') 56 | parser.add_argument('--rgb_path', help='input rgb path', default='') 57 | parser.add_argument('--model_path', help='input model path(.ckpt)', default='') 58 | parser.add_argument('--depth_path', help='saved depth map path', default='') 59 | parser.add_argument('--confidence_path', help='saved confidence map path', default='') 60 | args = parser.parse_args() 61 | 62 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 63 | predict(args.rgb_path, args.model_path, args.depth_path, args.confidence_path) 64 | os._exit(0) 65 | -------------------------------------------------------------------------------- /fusion/src/pro_and_fusion.cpp: -------------------------------------------------------------------------------- 1 | #include <Eigen/Core> 2 | #include <Eigen/Geometry> 3 | 4 | #include "pro_and_fusion.h" 5 | 6 | void post_process(cv::Mat& propogated_depth, cv::Mat &propogated_uncertainty) { 7 | 8 | cv::Mat propogate_depth_temp = propogated_depth.clone(); 9 | cv::Mat propogated_uncertainty_temp = propogated_uncertainty.clone(); 10 | int rows = propogated_depth.rows; 11 | int cols = propogated_depth.cols; 12 | for (int i = 0; i < rows; ++i) { 13 | for (int j = 0; j < cols; ++j) { 14 | if (propogated_depth.at<float>(i, j) > 1e-9) 15 | continue; 16 | else { 17 | 18 | int count = 0; 19 | float total = 0; 20 | float total_uncer = 0; 21 | int window_size = 3; 22 | 23 | for (int ii = i - window_size <= 0 ? 0 : i - window_size; ii <= (i + window_size >= rows - 1 ? rows - 1 : i + window_size); ++ii) { 24 | for (int jj = j - window_size <= 0 ? 0 : j - window_size; jj <= (j + window_size >=
cols - 1 : j + window_size); ++jj) { 25 | 26 | if (propogated_depth.at<float>(ii, jj) > 1e-6) { 27 | total += propogated_depth.at<float>(ii, jj); 28 | total_uncer += propogated_uncertainty.at<float>(ii, jj); 29 | 30 | count++; 31 | } 32 | } 33 | } 34 | if (count != 0) { 35 | propogate_depth_temp.at<float>(i, j) = total / (count); 36 | propogated_uncertainty_temp.at<float>(i, j) = total_uncer / (count); 37 | } 38 | 39 | } 40 | 41 | } 42 | } 43 | propogated_depth = propogate_depth_temp.clone(); 44 | propogated_uncertainty = propogated_uncertainty_temp.clone(); 45 | } 46 | 47 | void propogate_depth(const FramePath &frame_path_pre, const FramePath &frame_path_curr, const cv::Mat& Ki, Frame &frame_pre, Frame &frame_curr, 48 | bool do_post_process) { 49 | 50 | 51 | float camera_fx = Ki.at<float>(0, 0); 52 | float camera_fy = Ki.at<float>(1, 1); 53 | float camera_cx = Ki.at<float>(0, 2); 54 | float camera_cy = Ki.at<float>(1, 2); 55 | 56 | Eigen::Matrix3d K; 57 | K << camera_fx, 0, camera_cx, 0, camera_fy, camera_cy, 0, 0, 1; 58 | 59 | Eigen::Quaterniond q1 = Eigen::Quaterniond(frame_pre.q[3], frame_pre.q[0], frame_pre.q[1], frame_pre.q[2]); 60 | Eigen::Quaterniond q2 = Eigen::Quaterniond(frame_curr.q[3], frame_curr.q[0], frame_curr.q[1], frame_curr.q[2]); 61 | Eigen::Matrix3d r1 = Eigen::Matrix3d::Identity(); 62 | r1 = q1.toRotationMatrix(); 63 | Eigen::Matrix3d r2 = Eigen::Matrix3d::Identity(); 64 | r2 = q2.toRotationMatrix(); 65 | Eigen::Matrix3d r = r2.inverse() * r1; 66 | cv::Mat rr = (cv::Mat_<double>(3, 3) << 67 | r(0, 0), r(0, 1), r(0, 2), 68 | r(1, 0), r(1, 1), r(1, 2), 69 | r(2, 0), r(2, 1), r(2, 2)); 70 | Eigen::Vector3d t1; 71 | t1 << frame_pre.t[0], frame_pre.t[1], frame_pre.t[2]; 72 | Eigen::Vector3d t2; 73 | t2 << frame_curr.t[0], frame_curr.t[1], frame_curr.t[2]; 74 | Eigen::Vector3d t = r2.inverse() * (t1 - t2); 75 | 76 | float absolute_scale = 1.0; 77 | cv::Mat propogated_depth = cv::Mat::zeros(frame_pre.fused_depth.size(), CV_32FC1); 78 | cv::Mat propogated_uncertainty = cv::Mat::zeros(frame_pre.fused_depth.size(), CV_32FC1); 79 | int cols = propogated_uncertainty.cols; 80 | int rows = propogated_depth.rows; 81 | 82 | for (int i = 0; i < propogated_depth.rows; ++i) { 83 | for (int j = 0; j < propogated_depth.cols; ++j) { 84 | float d = frame_pre.fused_depth.at<float>(i, j); 85 | if (d < 1e-9) 86 | continue; 87 | 88 | Eigen::Vector3d points_in_3d; 89 | double z = d; 90 | double x = (j - camera_cx) * z / camera_fx; 91 | double y = (i - camera_cy) * z / camera_fy; 92 | points_in_3d << x, y, z; 93 | 94 | 95 | 96 | Eigen::Vector3d normalized = K * (r * points_in_3d + t * absolute_scale); 97 | 98 | if (normalized[2] <= 0) 99 | continue; 100 | 101 | float u = normalized[0] / normalized[2]; 102 | float v = normalized[1] / normalized[2]; 103 | 104 | if (u < cols && u >= 0 && v < rows && v >= 0) { 105 | //warp_image.at(v, u) = rgb1.at(i, j); 106 | propogated_depth.at<float>((int)v, (int)u) = normalized[2]; 107 | propogated_uncertainty.at<float>((int)v, (int)u) = frame_pre.fused_uncertainty.at<float>(i, j); 108 | } 109 | } 110 | } 111 | if (do_post_process) 112 | post_process(propogated_depth, propogated_uncertainty); 113 | frame_curr.propogated_depth = propogated_depth.clone(); 114 | frame_curr.propogated_uncertainty = propogated_uncertainty.clone(); 115 | 116 | } 117 | 118 | void fuse_depth(Frame &frame, float white_noise) { 119 | 120 | cv::Mat fused_depth = cv::Mat::zeros(frame.observed_depth.size(), CV_32FC1); 121 | cv::Mat fused_uncertainty = cv::Mat::zeros(frame.observed_uncertainty.size(), CV_32FC1); 122 | int rows = frame.observed_depth.rows; 123 | int cols =
frame.observed_depth.cols; 124 | 125 | for (int i = 0; i < rows; ++i) { 126 | for (int j = 0; j < cols; ++j) { 127 | 128 | float pro_depth_value = frame.propogated_depth.at<float>(i, j); 129 | float obs_depth_value = frame.observed_depth.at<float>(i, j); 130 | float pro_uncertainty_value = frame.propogated_uncertainty.at<float>(i, j); 131 | float obs_uncertainty_value = frame.observed_uncertainty.at<float>(i, j); 132 | 133 | float fused_depth_value; 134 | float fused_uncertainty_value; 135 | 136 | if (pro_depth_value < 1e-5 || pro_uncertainty_value < 1e-9) { 137 | fused_depth_value = obs_depth_value; 138 | fused_uncertainty_value = obs_uncertainty_value; 139 | } 140 | else { 141 | pro_uncertainty_value += white_noise; 142 | fused_depth_value 143 | = (pro_depth_value * obs_uncertainty_value + obs_depth_value * pro_uncertainty_value) / (pro_uncertainty_value + obs_uncertainty_value); 144 | fused_uncertainty_value 145 | = (obs_uncertainty_value * pro_uncertainty_value) / (pro_uncertainty_value + obs_uncertainty_value); 146 | } 147 | 148 | fused_depth.at<float>(i, j) = fused_depth_value; 149 | fused_uncertainty.at<float>(i, j) = fused_uncertainty_value; 150 | } 151 | } 152 | 153 | frame.fused_depth = fused_depth.clone(); 154 | frame.fused_uncertainty = fused_uncertainty.clone(); 155 | } 156 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import sys 4 | import time 5 | import random 6 | import argparse 7 | import numpy as np 8 | import tensorflow as tf 9 | from datetime import datetime 10 | from matplotlib import pyplot as plt 11 | from PIL import Image 12 | from scipy import misc 13 | 14 | import network 15 | 16 | 17 | def read_rgb_file_names(path): 18 | file_names = [] 19 | file = open(path) 20 | while 1: 21 | line = file.readline() 22 | if not line: 23 | break 24 | else: 25 | if line[-1] == '\n': 26 | line = line[:-1] 27 | file_names.append(line.split(',')[0]) 28 | file.close() 29 | return file_names 30 | 31 | 32 | def loss_function(pred, gt_depth): 33 | gt_depth = gt_depth[:, :, :, 0] 34 | # ignore invalid pixel 35 | mask = tf.cast(tf.not_equal(gt_depth, tf.constant(0, dtype=tf.float32)), dtype=tf.float32) 36 | 37 | L1_distance = tf.abs(pred[:, :, :, 0] * mask - gt_depth * mask) 38 | L_depth = tf.reduce_mean(L1_distance) 39 | gt_confidence = tf.exp(-tf.abs(L1_distance)) 40 | L_confidence = tf.reduce_mean(tf.abs(pred[:, :, :, 1] * mask - gt_confidence * mask)) 41 | L_regular = tf.reduce_mean(tf.abs(pred[:, :, :, 1] * mask)) 42 | 43 | # Loss = 1.0*L_d + 0.5*(L_c + 0.5*L_r) 44 | L_final = tf.constant(1.0, dtype=tf.float32) * L_depth + tf.constant(0.5, dtype=tf.float32) * ( 45 | L_confidence + tf.constant(0.5, dtype=tf.float32) * L_regular) 46 | return L_final 47 | 48 | 49 | def train(filelist_path, pretrain_model_path, output_models_dir): 50 | # hyper-params 51 | initial_lr = 0.01 # initial learning rate 52 | lr_decay = 0.25 # learning rate decay 53 | lr_dacay_epoch = 10 # learning rate drops every 10 epochs 54 | momentum = 0.9 # momentum 55 | max_iters = 250000 # max iterations 56 | batch_size = 16 # batch size 57 | display_step = 20 # display step 58 | model_save_step = 5000 # model saving step 59 | 60 | # input and output size 61 | height = 228 62 | width = 304 63 | depth_height = 228 64 | depth_width = 304 65 | channels_rgb = 3 66 | channels_depth = 1 67 | float_to_int_scale = 5000.0 68 | 69 | rgb_filelist = read_rgb_file_names(filelist_path) 70 | depth_filelist = [] 71
| iter_per_epoch = len(rgb_filelist) / batch_size 72 | lr_dacay_iter = lr_dacay_epoch * iter_per_epoch 73 | 74 | # Create a placeholder for the input image and label 75 | input_node_rgb = tf.placeholder(tf.float32, [batch_size, height, width, channels_rgb]) 76 | input_node_depth = tf.placeholder(tf.float32, [batch_size, depth_height, depth_width, channels_depth]) 77 | 78 | # Construct the network 79 | net = network.DeNet({'data': input_node_rgb}, batch_size, 1.0, 1) 80 | pred = net.get_output() 81 | 82 | # Loss, Learning rate, optimizer 83 | loss = loss_function(pred, input_node_depth) 84 | global_step = tf.Variable(0, trainable=False) 85 | learning_rate = tf.train.exponential_decay(initial_lr, global_step, lr_dacay_iter, lr_decay, staircase=True) 86 | optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_locking=False, 87 | name='Momentum', use_nesterov=False).minimize(loss, global_step=global_step) 88 | 89 | init = tf.global_variables_initializer() 90 | 91 | with tf.Session() as sess: 92 | sess.run(init) 93 | 94 | # load the pre-train model 95 | if os.path.exists(pretrain_model_path): 96 | net.load(pretrain_model_path, sess, True) 97 | 98 | step = 0 99 | filelist_size = len(rgb_filelist) 100 | train_ptr = 0 101 | batch_of_rgb_path = [] 102 | batch_of_depth_path = [] 103 | labels = [] 104 | 105 | loss_batch_average = 0 106 | 107 | while step < max_iters: 108 | 109 | # shuffle the filelist when one epoch is finished 110 | if step % iter_per_epoch == 0: 111 | random.shuffle(rgb_filelist) 112 | depth_filelist = [] 113 | for rgb_path in rgb_filelist: 114 | depth_filelist.append(rgb_path.replace("rgb", "depth")) 115 | 116 | # Get next batch of image and depth labels 117 | if (train_ptr + batch_size) < filelist_size: 118 | batch_of_rgb_path = rgb_filelist[train_ptr:(train_ptr + batch_size)] 119 | batch_of_depth_path = depth_filelist[train_ptr:(train_ptr + batch_size)] 120 | train_ptr += batch_size 121 | else: 122 | new_ptr = (train_ptr + batch_size) % filelist_size 123 | batch_of_rgb_path = rgb_filelist[train_ptr:] + rgb_filelist[:new_ptr] 124 | batch_of_depth_path = depth_filelist[train_ptr:] + depth_filelist[:new_ptr] 125 | train_ptr = new_ptr 126 | 127 | # Container for input rgb and depth label 128 | rgb_images = np.ndarray([batch_size, height, width, channels_rgb]) 129 | depth_images = np.ndarray([batch_size, depth_height, depth_width, channels_depth]) 130 | 131 | # read rgb image, resize and crop 132 | for i, rgb_path in enumerate(batch_of_rgb_path): 133 | # print rgb_path 134 | rgb_img = cv2.imread(rgb_path) 135 | rgb_img = cv2.resize(rgb_img, (320, 240)) 136 | rgb_img = rgb_img[6:234, 8:312] 137 | rgb_images[i] = rgb_img 138 | 139 | # read depth image, resize and crop 140 | for j, depth_path in enumerate(batch_of_depth_path): 141 | # print depth_path 142 | depth_image = cv2.imread(depth_path, -1) 143 | depth_image = np.array(depth_image).astype('float32') 144 | depth_image = depth_image / float_to_int_scale 145 | depth_image = cv2.resize(depth_image, (320, 240)) 146 | depth_image = depth_image[6:234, 8:312] 147 | depth_image = np.expand_dims(depth_image, axis=2) 148 | depth_images[j] = depth_image 149 | 150 | batch_loss, _, _ = sess.run([loss, learning_rate, optimizer], 151 | feed_dict={input_node_rgb: rgb_images, input_node_depth: depth_images}) 152 | loss_batch_average += batch_loss 153 | 154 | # print loss 155 | if (step + 1) % display_step == 0: 156 | print >> sys.stderr, "{} Iter {}: Training Loss = {:.4f} ".format(datetime.now(), step, 157 | loss_batch_average
/ display_step) 158 | loss_batch_average = 0 159 | 160 | # save the model 161 | if step % model_save_step == 0: 162 | saver = tf.train.Saver() 163 | saver.save(sess, output_models_dir + '/model%08d' % step) 164 | 165 | step += 1 166 | 167 | print "Training denet finished!" 168 | 169 | 170 | if __name__ == '__main__': 171 | parser = argparse.ArgumentParser(description=''' 172 | Train the network. 173 | ''') 174 | parser.add_argument('--filelist_path', help='file list path(.txt)', default='') 175 | parser.add_argument('--pretrain_model_path', help='pretrain model path(.npy). If there is not need, ignore it.', 176 | default='') 177 | parser.add_argument('--output_models_dir', help='directory for saving models', default='') 178 | args = parser.parse_args() 179 | 180 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 181 | train(args.filelist_path, args.pretrain_model_path, args.output_models_dir) 182 | os._exit(0) 183 | -------------------------------------------------------------------------------- /network/denet.py: -------------------------------------------------------------------------------- 1 | from .network import Network 2 | 3 | 4 | class DeNet(Network): 5 | def setup(self): 6 | (self.feed('data') 7 | .conv(7, 7, 64, 2, 2, relu=False, name='conv1') 8 | .batch_normalization(relu=True, name='bn_conv1') 9 | .max_pool(3, 3, 2, 2, name='pool1') 10 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res2a_branch1') 11 | .batch_normalization(name='bn2a_branch1')) 12 | 13 | (self.feed('pool1') 14 | .conv(1, 1, 64, 1, 1, biased=False, relu=False, name='res2a_branch2a') 15 | .batch_normalization(relu=True, name='bn2a_branch2a') 16 | .conv(3, 3, 64, 1, 1, biased=False, relu=False, name='res2a_branch2b') 17 | .batch_normalization(relu=True, name='bn2a_branch2b') 18 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res2a_branch2c') 19 | .batch_normalization(name='bn2a_branch2c')) 20 | 21 | (self.feed('bn2a_branch1', 22 | 'bn2a_branch2c') 23 | .add(name='res2a') 24 | .relu(name='res2a_relu') 25 | .conv(1, 1, 64, 1, 1, biased=False, relu=False, name='res2b_branch2a') 26 | .batch_normalization(relu=True, name='bn2b_branch2a') 27 | .conv(3, 3, 64, 1, 1, biased=False, relu=False, name='res2b_branch2b') 28 | .batch_normalization(relu=True, name='bn2b_branch2b') 29 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res2b_branch2c') 30 | .batch_normalization(name='bn2b_branch2c')) 31 | 32 | (self.feed('res2a_relu', 33 | 'bn2b_branch2c') 34 | .add(name='res2b') 35 | .relu(name='res2b_relu') 36 | .conv(1, 1, 64, 1, 1, biased=False, relu=False, name='res2c_branch2a') 37 | .batch_normalization(relu=True, name='bn2c_branch2a') 38 | .conv(3, 3, 64, 1, 1, biased=False, relu=False, name='res2c_branch2b') 39 | .batch_normalization(relu=True, name='bn2c_branch2b') 40 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res2c_branch2c') 41 | .batch_normalization(name='bn2c_branch2c')) 42 | 43 | (self.feed('res2b_relu', 44 | 'bn2c_branch2c') 45 | .add(name='res2c') 46 | .relu(name='res2c_relu') 47 | .conv(1, 1, 512, 2, 2, biased=False, relu=False, name='res3a_branch1') 48 | .batch_normalization(name='bn3a_branch1')) 49 | 50 | (self.feed('res2c_relu') 51 | .conv(1, 1, 128, 2, 2, biased=False, relu=False, name='res3a_branch2a') 52 | .batch_normalization(relu=True, name='bn3a_branch2a') 53 | .conv(3, 3, 128, 1, 1, biased=False, relu=False, name='res3a_branch2b') 54 | .batch_normalization(relu=True, name='bn3a_branch2b') 55 | .conv(1, 1, 512, 1, 1, biased=False, relu=False, name='res3a_branch2c') 56 | 
.batch_normalization(name='bn3a_branch2c')) 57 | 58 | (self.feed('bn3a_branch1', 59 | 'bn3a_branch2c') 60 | .add(name='res3a') 61 | .relu(name='res3a_relu') 62 | .conv(1, 1, 128, 1, 1, biased=False, relu=False, name='res3b_branch2a') 63 | .batch_normalization(relu=True, name='bn3b_branch2a') 64 | .conv(3, 3, 128, 1, 1, biased=False, relu=False, name='res3b_branch2b') 65 | .batch_normalization(relu=True, name='bn3b_branch2b') 66 | .conv(1, 1, 512, 1, 1, biased=False, relu=False, name='res3b_branch2c') 67 | .batch_normalization(name='bn3b_branch2c')) 68 | 69 | (self.feed('res3a_relu', 70 | 'bn3b_branch2c') 71 | .add(name='res3b') 72 | .relu(name='res3b_relu') 73 | .conv(1, 1, 128, 1, 1, biased=False, relu=False, name='res3c_branch2a') 74 | .batch_normalization(relu=True, name='bn3c_branch2a') 75 | .conv(3, 3, 128, 1, 1, biased=False, relu=False, name='res3c_branch2b') 76 | .batch_normalization(relu=True, name='bn3c_branch2b') 77 | .conv(1, 1, 512, 1, 1, biased=False, relu=False, name='res3c_branch2c') 78 | .batch_normalization(name='bn3c_branch2c')) 79 | 80 | (self.feed('res3b_relu', 81 | 'bn3c_branch2c') 82 | .add(name='res3c') 83 | .relu(name='res3c_relu') 84 | .conv(1, 1, 128, 1, 1, biased=False, relu=False, name='res3d_branch2a') 85 | .batch_normalization(relu=True, name='bn3d_branch2a') 86 | .conv(3, 3, 128, 1, 1, biased=False, relu=False, name='res3d_branch2b') 87 | .batch_normalization(relu=True, name='bn3d_branch2b') 88 | .conv(1, 1, 512, 1, 1, biased=False, relu=False, name='res3d_branch2c') 89 | .batch_normalization(name='bn3d_branch2c')) 90 | 91 | (self.feed('res3c_relu', 92 | 'bn3d_branch2c') 93 | .add(name='res3d') 94 | .relu(name='res3d_relu') 95 | .conv(1, 1, 1024, 2, 2, biased=False, relu=False, name='res4a_branch1') 96 | .batch_normalization(name='bn4a_branch1')) 97 | 98 | (self.feed('res3d_relu') 99 | .conv(1, 1, 256, 2, 2, biased=False, relu=False, name='res4a_branch2a') 100 | .batch_normalization(relu=True, name='bn4a_branch2a') 101 | .conv(3, 3, 256, 1, 1, biased=False, relu=False, name='res4a_branch2b') 102 | .batch_normalization(relu=True, name='bn4a_branch2b') 103 | .conv(1, 1, 1024, 1, 1, biased=False, relu=False, name='res4a_branch2c') 104 | .batch_normalization(name='bn4a_branch2c')) 105 | 106 | (self.feed('bn4a_branch1', 107 | 'bn4a_branch2c') 108 | .add(name='res4a') 109 | .relu(name='res4a_relu') 110 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res4b_branch2a') 111 | .batch_normalization(relu=True, name='bn4b_branch2a') 112 | .conv(3, 3, 256, 1, 1, biased=False, relu=False, name='res4b_branch2b') 113 | .batch_normalization(relu=True, name='bn4b_branch2b') 114 | .conv(1, 1, 1024, 1, 1, biased=False, relu=False, name='res4b_branch2c') 115 | .batch_normalization(name='bn4b_branch2c')) 116 | 117 | (self.feed('res4a_relu', 118 | 'bn4b_branch2c') 119 | .add(name='res4b') 120 | .relu(name='res4b_relu') 121 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res4c_branch2a') 122 | .batch_normalization(relu=True, name='bn4c_branch2a') 123 | .conv(3, 3, 256, 1, 1, biased=False, relu=False, name='res4c_branch2b') 124 | .batch_normalization(relu=True, name='bn4c_branch2b') 125 | .conv(1, 1, 1024, 1, 1, biased=False, relu=False, name='res4c_branch2c') 126 | .batch_normalization(name='bn4c_branch2c')) 127 | 128 | (self.feed('res4b_relu', 129 | 'bn4c_branch2c') 130 | .add(name='res4c') 131 | .relu(name='res4c_relu') 132 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res4d_branch2a') 133 | .batch_normalization(relu=True, name='bn4d_branch2a') 134 | 
.conv(3, 3, 256, 1, 1, biased=False, relu=False, name='res4d_branch2b') 135 | .batch_normalization(relu=True, name='bn4d_branch2b') 136 | .conv(1, 1, 1024, 1, 1, biased=False, relu=False, name='res4d_branch2c') 137 | .batch_normalization(name='bn4d_branch2c')) 138 | 139 | (self.feed('res4c_relu', 140 | 'bn4d_branch2c') 141 | .add(name='res4d') 142 | .relu(name='res4d_relu') 143 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res4e_branch2a') 144 | .batch_normalization(relu=True, name='bn4e_branch2a') 145 | .conv(3, 3, 256, 1, 1, biased=False, relu=False, name='res4e_branch2b') 146 | .batch_normalization(relu=True, name='bn4e_branch2b') 147 | .conv(1, 1, 1024, 1, 1, biased=False, relu=False, name='res4e_branch2c') 148 | .batch_normalization(name='bn4e_branch2c')) 149 | 150 | (self.feed('res4d_relu', 151 | 'bn4e_branch2c') 152 | .add(name='res4e') 153 | .relu(name='res4e_relu') 154 | .conv(1, 1, 256, 1, 1, biased=False, relu=False, name='res4f_branch2a') 155 | .batch_normalization(relu=True, name='bn4f_branch2a') 156 | .conv(3, 3, 256, 1, 1, biased=False, relu=False, name='res4f_branch2b') 157 | .batch_normalization(relu=True, name='bn4f_branch2b') 158 | .conv(1, 1, 1024, 1, 1, biased=False, relu=False, name='res4f_branch2c') 159 | .batch_normalization(name='bn4f_branch2c')) 160 | 161 | (self.feed('res4e_relu', 162 | 'bn4f_branch2c') 163 | .add(name='res4f') 164 | .relu(name='res4f_relu') 165 | .conv(1, 1, 2048, 2, 2, biased=False, relu=False, name='res5a_branch1') 166 | .batch_normalization(name='bn5a_branch1')) 167 | 168 | (self.feed('res4f_relu') 169 | .conv(1, 1, 512, 2, 2, biased=False, relu=False, name='res5a_branch2a') 170 | .batch_normalization(relu=True, name='bn5a_branch2a') 171 | .conv(3, 3, 512, 1, 1, biased=False, relu=False, name='res5a_branch2b') 172 | .batch_normalization(relu=True, name='bn5a_branch2b') 173 | .conv(1, 1, 2048, 1, 1, biased=False, relu=False, name='res5a_branch2c') 174 | .batch_normalization(name='bn5a_branch2c')) 175 | 176 | (self.feed('bn5a_branch1', 177 | 'bn5a_branch2c') 178 | .add(name='res5a') 179 | .relu(name='res5a_relu') 180 | .conv(1, 1, 512, 1, 1, biased=False, relu=False, name='res5b_branch2a') 181 | .batch_normalization(relu=True, name='bn5b_branch2a') 182 | .conv(3, 3, 512, 1, 1, biased=False, relu=False, name='res5b_branch2b') 183 | .batch_normalization(relu=True, name='bn5b_branch2b') 184 | .conv(1, 1, 2048, 1, 1, biased=False, relu=False, name='res5b_branch2c') 185 | .batch_normalization(name='bn5b_branch2c')) 186 | 187 | (self.feed('res5a_relu', 188 | 'bn5b_branch2c') 189 | .add(name='res5b') 190 | .relu(name='res5b_relu') 191 | .conv(1, 1, 512, 1, 1, biased=False, relu=False, name='res5c_branch2a') 192 | .batch_normalization(relu=True, name='bn5c_branch2a') 193 | .conv(3, 3, 512, 1, 1, biased=False, relu=False, name='res5c_branch2b') 194 | .batch_normalization(relu=True, name='bn5c_branch2b') 195 | .conv(1, 1, 2048, 1, 1, biased=False, relu=False, name='res5c_branch2c') 196 | .batch_normalization(name='bn5c_branch2c')) 197 | 198 | (self.feed('res5b_relu', 199 | 'bn5c_branch2c') 200 | .add(name='res5c') 201 | .relu(name='res5c_relu') 202 | .conv(1, 1, 1024, 1, 1, biased=True, relu=False, name='layer1') 203 | .batch_normalization(relu=False, name='layer1_BN') 204 | .up_project([3, 3, 1024, 512], id='2x', stride=1, BN=True) 205 | .up_project([3, 3, 512, 256], id='4x', stride=1, BN=True) 206 | .up_project([3, 3, 256, 128], id='8x', stride=1, BN=True) 207 | .up_project([3, 3, 128, 64], id='16x', stride=1, BN=True) 208 | 
.dropout(name='drop', keep_prob=1.0) 209 | .conv(3, 3, 2, 1, 1, name='ConvPred', relu=False) 210 | .upsample_bilinear([228, 304], name='bilinear') 211 | 212 | ) 213 | -------------------------------------------------------------------------------- /network/network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | DEFAULT_PADDING = 'SAME' 5 | 6 | 7 | def get_incoming_shape(incoming): 8 | """ Returns the incoming data shape """ 9 | if isinstance(incoming, tf.Tensor): 10 | return incoming.get_shape().as_list() 11 | elif type(incoming) in [np.array, list, tuple]: 12 | return np.shape(incoming) 13 | else: 14 | raise Exception("Invalid incoming layer.") 15 | 16 | 17 | def interleave(tensors, axis): 18 | old_shape = get_incoming_shape(tensors[0])[1:] 19 | new_shape = [-1] + old_shape 20 | new_shape[axis] *= len(tensors) 21 | return tf.reshape(tf.stack(tensors, axis + 1), new_shape) 22 | 23 | 24 | def layer(op): 25 | '''Decorator for composable network layers.''' 26 | 27 | def layer_decorated(self, *args, **kwargs): 28 | # Automatically set a name if not provided. 29 | name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) 30 | 31 | # Figure out the layer inputs. 32 | if len(self.terminals) == 0: 33 | raise RuntimeError('No input variables found for layer %s.' % name) 34 | elif len(self.terminals) == 1: 35 | layer_input = self.terminals[0] 36 | else: 37 | layer_input = list(self.terminals) 38 | # Perform the operation and get the output. 39 | layer_output = op(self, layer_input, *args, **kwargs) 40 | # Add to layer LUT. 41 | self.layers[name] = layer_output 42 | # This output is now the input for the next layer. 43 | self.feed(layer_output) 44 | # Return self for chained calls. 45 | return self 46 | 47 | return layer_decorated 48 | 49 | 50 | class Network(object): 51 | 52 | def __init__(self, inputs, batch, keep_prob, is_training, trainable=True): 53 | # The input nodes for this network 54 | self.inputs = inputs 55 | # The current list of terminal nodes 56 | self.terminals = [] 57 | # Mapping from layer names to layers 58 | self.layers = dict(inputs) 59 | # If true, the resulting variables are set as trainable 60 | self.trainable = trainable 61 | self.batch_size = batch 62 | self.keep_prob = keep_prob 63 | self.is_training = is_training 64 | self.setup() 65 | 66 | def setup(self): 67 | '''Construct the network. ''' 68 | raise NotImplementedError('Must be implemented by the subclass.') 69 | 70 | def load(self, data_path, session, ignore_missing=False): 71 | '''Load network weights. 72 | data_path: The path to the numpy-serialized network weights 73 | session: The current TensorFlow session 74 | ignore_missing: If true, serialized weights for missing layers are ignored. 75 | ''' 76 | data_dict = np.load(data_path, encoding='latin1').item() 77 | for op_name in data_dict: 78 | with tf.variable_scope(op_name, reuse=True): 79 | for param_name, data in iter(data_dict[op_name].items()): 80 | try: 81 | var = tf.get_variable(param_name) 82 | session.run(var.assign(data)) 83 | 84 | except ValueError: 85 | if not ignore_missing: 86 | raise 87 | 88 | def feed(self, *args): 89 | '''Set the input(s) for the next operation by replacing the terminal nodes. 90 | The arguments can be either layer names or the actual layers. 
91 | ''' 92 | assert len(args) != 0 93 | self.terminals = [] 94 | for fed_layer in args: 95 | if isinstance(fed_layer, str): 96 | try: 97 | fed_layer = self.layers[fed_layer] 98 | except KeyError: 99 | raise KeyError('Unknown layer name fed: %s' % fed_layer) 100 | self.terminals.append(fed_layer) 101 | return self 102 | 103 | def get_output(self): 104 | '''Returns the current network output.''' 105 | return self.terminals[-1] 106 | 107 | def get_layer_output(self, name): 108 | return self.layers[name] 109 | 110 | def get_unique_name(self, prefix): 111 | '''Returns an index-suffixed unique name for the given prefix. 112 | This is used for auto-generating layer names based on the type-prefix. 113 | ''' 114 | ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1 115 | return '%s_%d' % (prefix, ident) 116 | 117 | def make_var(self, name, shape, regularizer=None): 118 | '''Creates a new TensorFlow variable.''' 119 | # weight decay 120 | return tf.get_variable(name, shape, dtype='float32', trainable=self.trainable, regularizer=regularizer) 121 | 122 | def validate_padding(self, padding): 123 | '''Verifies that the padding is one of the supported ones.''' 124 | assert padding in ('SAME', 'VALID') 125 | 126 | @layer # bilinear_resize layer 127 | def upsample_bilinear(self, input_data, input_size, name): 128 | return tf.image.resize_bilinear(input_data, size=input_size, name=name) 129 | 130 | @layer 131 | def conv(self, 132 | input_data, 133 | k_h, 134 | k_w, 135 | c_o, 136 | s_h, 137 | s_w, 138 | name, 139 | relu=True, 140 | padding=DEFAULT_PADDING, 141 | group=1, 142 | biased=True): 143 | 144 | # Verify that the padding is acceptable 145 | self.validate_padding(padding) 146 | # Get the number of channels in the input 147 | c_i = input_data.get_shape()[-1] 148 | 149 | if (padding == 'SAME'): 150 | input_data = tf.pad(input_data, 151 | [[0, 0], [(k_h - 1) // 2, (k_h - 1) // 2], [(k_w - 1) // 2, (k_w - 1) // 2], [0, 0]], 152 | "CONSTANT") 153 | 154 | # Verify that the grouping parameter is valid 155 | assert c_i % group == 0 156 | assert c_o % group == 0 157 | # Convolution for a given input and kernel 158 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding='VALID') 159 | 160 | with tf.variable_scope(name) as scope: 161 | # weight decay 162 | kernel = self.make_var('weights', shape=[k_h, k_w, c_i // group, c_o], 163 | regularizer=tf.contrib.layers.l2_regularizer(tf.constant(0.0005, dtype=tf.float32))) 164 | 165 | if group == 1: 166 | # This is the common-case. Convolve the input without any further complications. 
167 | output = convolve(input_data, kernel) 168 | else: 169 | # Split the input into groups and then convolve each of them independently 170 | 171 | input_groups = tf.split(3, group, input_data) 172 | kernel_groups = tf.split(3, group, kernel) 173 | output_groups = [convolve(i, k) for i, k in zip(input_groups, kernel_groups)] 174 | # Concatenate the groups 175 | output = tf.concat(3, output_groups) 176 | 177 | # Add the biases 178 | if biased: 179 | # weight decay 180 | biases = self.make_var('biases', [c_o], regularizer=tf.contrib.layers.l2_regularizer( 181 | tf.constant(0.0005, dtype=tf.float32))) 182 | output = tf.nn.bias_add(output, biases) 183 | if relu: 184 | # ReLU non-linearity 185 | output = tf.nn.relu(output, name=scope.name) 186 | 187 | return output 188 | 189 | @layer 190 | def relu(self, input_data, name): 191 | return tf.nn.relu(input_data, name=name) 192 | 193 | @layer 194 | def max_pool(self, input_data, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING): 195 | self.validate_padding(padding) 196 | return tf.nn.max_pool(input_data, 197 | ksize=[1, k_h, k_w, 1], 198 | strides=[1, s_h, s_w, 1], 199 | padding=padding, 200 | name=name) 201 | 202 | @layer 203 | def avg_pool(self, input_data, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING): 204 | self.validate_padding(padding) 205 | return tf.nn.avg_pool(input_data, 206 | ksize=[1, k_h, k_w, 1], 207 | strides=[1, s_h, s_w, 1], 208 | padding=padding, 209 | name=name) 210 | 211 | @layer 212 | def lrn(self, input_data, radius, alpha, beta, name, bias=1.0): 213 | return tf.nn.local_response_normalization(input_data, 214 | depth_radius=radius, 215 | alpha=alpha, 216 | beta=beta, 217 | bias=bias, 218 | name=name) 219 | 220 | @layer 221 | def concat(self, inputs, axis, name): 222 | return tf.concat(concat_dim=axis, values=inputs, name=name) 223 | 224 | @layer 225 | def add(self, inputs, name): 226 | return tf.add_n(inputs, name=name) 227 | 228 | @layer 229 | def fc(self, input_data, num_out, name, relu=True): 230 | with tf.variable_scope(name) as scope: 231 | input_shape = input_data.get_shape() 232 | if input_shape.ndims == 4: 233 | # The input is spatial. Vectorize it first. 234 | dim = 1 235 | for d in input_shape[1:].as_list(): 236 | dim *= d 237 | feed_in = tf.reshape(input_data, [-1, dim]) 238 | else: 239 | feed_in, dim = (input_data, input_shape[-1].value) 240 | # weight decay 241 | weights = self.make_var('weights', shape=[dim, num_out], 242 | regularizer=tf.contrib.layers.l2_regularizer(tf.constant(0.0005, dtype=tf.float32))) 243 | # weight decay 244 | biases = self.make_var('biases', [num_out], 245 | regularizer=tf.contrib.layers.l2_regularizer(tf.constant(0.0005, dtype=tf.float32))) 246 | op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b 247 | fc = op(feed_in, weights, biases, name=scope.name) 248 | return fc 249 | 250 | @layer 251 | def softmax(self, input_data, name): 252 | input_shape = map(lambda v: v.value, input_data.get_shape()) 253 | if len(input_shape) > 2: 254 | # For certain models (like NiN), the singleton spatial dimensions 255 | # need to be explicitly squeezed, since they're not broadcast-able 256 | # in TensorFlow's NHWC ordering (unlike Caffe's NCHW). 
257 | if input_shape[1] == 1 and input_shape[2] == 1: 258 | input_data = tf.squeeze(input_data, squeeze_dims=[1, 2]) 259 | else: 260 | raise ValueError('Rank 2 tensor input expected for softmax!') 261 | return tf.nn.softmax(input_data, name) 262 | 263 | @layer 264 | def batch_normalization(self, input_data, name, scale_offset=True, relu=False): 265 | 266 | with tf.variable_scope(name) as scope: 267 | shape = [input_data.get_shape()[-1]] 268 | pop_mean = tf.get_variable("mean", shape, initializer=tf.constant_initializer(0.0), trainable=True) 269 | pop_var = tf.get_variable("variance", shape, initializer=tf.constant_initializer(1.0), trainable=True) 270 | epsilon = 1e-4 271 | decay = 0.999 272 | if scale_offset: 273 | scale = tf.get_variable("scale", shape, initializer=tf.constant_initializer(1.0)) 274 | offset = tf.get_variable("offset", shape, initializer=tf.constant_initializer(0.0)) 275 | else: 276 | scale, offset = (None, None) 277 | if self.is_training: 278 | batch_mean, batch_var = tf.nn.moments(input_data, [0, 1, 2]) 279 | 280 | train_mean = tf.assign(pop_mean, 281 | pop_mean * decay + batch_mean * (1 - decay)) 282 | train_var = tf.assign(pop_var, 283 | pop_var * decay + batch_var * (1 - decay)) 284 | with tf.control_dependencies([train_mean, train_var]): 285 | output = tf.nn.batch_normalization(input_data, 286 | batch_mean, batch_var, offset, scale, epsilon, name=name) 287 | else: 288 | output = tf.nn.batch_normalization(input_data, 289 | pop_mean, pop_var, offset, scale, epsilon, name=name) 290 | 291 | if relu: 292 | output = tf.nn.relu(output) 293 | 294 | return output 295 | 296 | @layer 297 | def dropout(self, input_data, keep_prob, name): 298 | return tf.nn.dropout(input_data, keep_prob, name=name) 299 | 300 | def unpool_as_conv(self, size, input_data, id, stride=1, ReLU=False, BN=True): 301 | 302 | # Model upconvolutions (unpooling + convolution) as interleaving feature 303 | # maps of four convolutions (A,B,C,D). Building block for up-projections. 
304 | 305 | # Convolution A (3x3) 306 | # -------------------------------------------------- 307 | layerName = "layer%s_ConvA" % (id) 308 | self.feed(input_data) 309 | self.conv(3, 3, size[3], stride, stride, name=layerName, padding='SAME', relu=False) 310 | outputA = self.get_output() 311 | 312 | # Convolution B (2x3) 313 | # -------------------------------------------------- 314 | layerName = "layer%s_ConvB" % (id) 315 | padded_input_B = tf.pad(input_data, [[0, 0], [1, 0], [1, 1], [0, 0]], "CONSTANT") 316 | self.feed(padded_input_B) 317 | self.conv(2, 3, size[3], stride, stride, name=layerName, padding='VALID', relu=False) 318 | outputB = self.get_output() 319 | 320 | # Convolution C (3x2) 321 | # -------------------------------------------------- 322 | layerName = "layer%s_ConvC" % (id) 323 | padded_input_C = tf.pad(input_data, [[0, 0], [1, 1], [1, 0], [0, 0]], "CONSTANT") 324 | self.feed(padded_input_C) 325 | self.conv(3, 2, size[3], stride, stride, name=layerName, padding='VALID', relu=False) 326 | outputC = self.get_output() 327 | 328 | # Convolution D (2x2) 329 | # -------------------------------------------------- 330 | layerName = "layer%s_ConvD" % (id) 331 | padded_input_D = tf.pad(input_data, [[0, 0], [1, 0], [1, 0], [0, 0]], "CONSTANT") 332 | self.feed(padded_input_D) 333 | self.conv(2, 2, size[3], stride, stride, name=layerName, padding='VALID', relu=False) 334 | outputD = self.get_output() 335 | 336 | # Interleaving elements of the four feature maps 337 | # -------------------------------------------------- 338 | left = interleave([outputA, outputB], axis=1) # columns 339 | right = interleave([outputC, outputD], axis=1) # columns 340 | Y = interleave([left, right], axis=2) # rows 341 | 342 | if BN: 343 | layerName = "layer%s_BN" % (id) 344 | self.feed(Y) 345 | self.batch_normalization(name=layerName, scale_offset=True, relu=False) 346 | Y = self.get_output() 347 | 348 | if ReLU: 349 | Y = tf.nn.relu(Y, name=layerName) 350 | 351 | return Y 352 | 353 | def up_project(self, size, id, stride=1, BN=True): 354 | 355 | # Create residual upsampling layer (UpProjection) 356 | 357 | input_data = self.get_output() 358 | 359 | # Branch 1 360 | id_br1 = "%s_br1" % (id) 361 | 362 | # Interleaving Convs of 1st branch 363 | out = self.unpool_as_conv(size, input_data, id_br1, stride, ReLU=True, BN=True) 364 | 365 | # Convolution following the upProjection on the 1st branch 366 | layerName = "layer%s_Conv" % (id) 367 | self.feed(out) 368 | self.conv(size[0], size[1], size[3], stride, stride, name=layerName, relu=False) 369 | 370 | if BN: 371 | layerName = "layer%s_BN" % (id) 372 | self.batch_normalization(name=layerName, scale_offset=True, relu=False) 373 | 374 | # Output of 1st branch 375 | branch1_output = self.get_output() 376 | 377 | # Branch 2 378 | id_br2 = "%s_br2" % (id) 379 | # Interleaving convolutions and output of 2nd branch 380 | branch2_output = self.unpool_as_conv(size, input_data, id_br2, stride, ReLU=False) 381 | 382 | # sum branches 383 | layerName = "layer%s_Sum" % (id) 384 | output = tf.add_n([branch1_output, branch2_output], name=layerName) 385 | # ReLU 386 | layerName = "layer%s_ReLU" % (id) 387 | output = tf.nn.relu(output, name=layerName) 388 | 389 | self.feed(output) 390 | return self 391 | --------------------------------------------------------------------------------
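For reference, the per-pixel update implemented by `fuse_depth` in `fusion/src/pro_and_fusion.cpp` treats the propagated and observed depths as two independent Gaussian estimates and combines them by inverse-variance weighting, after `confidence_to_uncertainty` in `fusion/src/utility.cpp` has mapped the network's confidence c (clamped into (0, 1)) to the uncertainty (-ln c)^2. A minimal NumPy sketch of the same arithmetic follows (illustrative only; this Python version is not part of the repository):

    import numpy as np

    def confidence_to_uncertainty(confidence):
        # clamp to (0, 1) as in utility.cpp, then uncertainty = (-ln c)^2
        c = np.clip(confidence, 1e-6, 1.0 - 1e-8)
        return np.log(c) ** 2

    def fuse(d_prop, u_prop, d_obs, u_obs, white_noise=0.0):
        # inverse-variance weighted mean and fused variance, as in fuse_depth()
        u_prop = u_prop + white_noise
        d_fused = (d_prop * u_obs + d_obs * u_prop) / (u_prop + u_obs)
        u_fused = (u_prop * u_obs) / (u_prop + u_obs)
        return d_fused, u_fused

In the C++ code, pixels with no valid propagated depth (or a near-zero propagated uncertainty) simply keep the observed depth and uncertainty instead of applying this update.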