├── .gitignore ├── README.md ├── datasets └── linemod │ └── dataset_posecnn.py ├── eval_net.py ├── lib ├── HoughVoting │ ├── .gitignore │ ├── __init__.py │ ├── houghvoting.cc │ ├── houghvoting.py │ └── setup.py ├── center_est_funcs.py ├── loss_funcions.py ├── roi_pool_pytorch.py ├── vgg16_convs.py ├── vgg16_convs_combine_mask.py └── vgg16_convs_combine_seg_center.py ├── testpytorch.py └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | pretrained_model/vgg16-397923af.pth 2 | log/test 3 | log/train 4 | trained_model/pretrained-vgg-05-1-weight 5 | trained_model/pretrained-posecnn-test 6 | trained_model/pretrained-posecnn-linemod 7 | *.pyc 8 | *.zip 9 | trained_model/pretrained-center-linemod 10 | trained_model/checkpoint.pth.tar 11 | log/* 12 | trained_model/* 13 | pretrained_model/* 14 | *.ipynb 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EECS442_CourseProject 2 | * EECS 442: Computer Vision 2019 Winter Course Project Workspace 3 | * Instructor: David Fouhey 4 | * University of Michigan 5 | 6 | -------------------------------------------------------------------------------- /datasets/linemod/dataset_posecnn.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | from PIL import Image 3 | import os 4 | import os.path 5 | import errno 6 | import torch 7 | import json 8 | import codecs 9 | import numpy as np 10 | import sys 11 | import torchvision.transforms as transforms 12 | import argparse 13 | import json 14 | import time 15 | import random 16 | import numpy.ma as ma 17 | import copy 18 | import scipy.misc 19 | import scipy.io as scio 20 | import yaml 21 | import lib.center_est_funcs as center_img_gt 22 | import matplotlib.pyplot as plt 23 | 24 | class PoseDataset(data.Dataset): 25 | def __init__(self, mode, num, add_noise, root, noise_trans, refine, onehot, 26 | seg = False, vertex_reg = False, vertex_reg_hough = False): 27 | # self.objlist = [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15] 28 | # self.objlist = [1, 2] 29 | self.objlist = [2] 30 | 31 | self.mode = mode 32 | 33 | self.seg_list = [] 34 | self.list_segmentation = [] # only the 2th class have the ground truth segmentation 35 | self.list_rgb = [] # save the path of the rgbd image 36 | self.list_depth = [] # save the path of the depth image 37 | self.list_label = [] # save the path of the label image 38 | self.list_obj = [] # save the list of objlist(the folder name) 39 | self.list_rank = [] # save the index of data in the folder 40 | self.meta = {} # meta_file have the ground truth information 41 | self.pt = {} 42 | self.root = root 43 | self.noise_trans = noise_trans 44 | self.refine = refine 45 | self.onehot = onehot 46 | self.seg_mode = seg 47 | self.vertex_reg_mode = vertex_reg 48 | self.vertex_reg_hough_mode = vertex_reg_hough 49 | 50 | item_count = 0 51 | for item in self.objlist: 52 | if self.seg_mode and item!=2: 53 | continue 54 | if self.mode != 'train': 55 | input_file = open('{0}/data/{1}/train.txt'.format(self.root, '%02d' % item)) 56 | else: 57 | input_file = open('{0}/data/{1}/test.txt'.format(self.root, '%02d' % item)) 58 | while 1: 59 | item_count += 1 60 | input_line = input_file.readline() 61 | if self.mode == 'test' and item_count % 10 != 0: 62 | continue 63 | if not input_line: 64 | break 65 | if input_line[-1:] == '\n': 66 | input_line = 
input_line[:-1] 67 | 68 | self.list_rgb.append('{0}/data/{1}/rgb/{2}.png'.format(self.root, '%02d' % item, input_line)) 69 | self.list_depth.append('{0}/data/{1}/depth/{2}.png'.format(self.root, '%02d' % item, input_line)) 70 | 71 | if self.mode == 'eval': 72 | self.list_label.append('{0}/segnet_results/{1}_label/{2}_label.png'.format(self.root, '%02d' % item, input_line)) 73 | elif self.seg_mode: 74 | self.list_label.append('{0}/data/{1}/mask_all/{2}.png'.format(self.root, '%02d' % item, input_line)) 75 | else: 76 | self.list_label.append('{0}/data/{1}/mask/{2}.png'.format(self.root, '%02d' % item, input_line)) 77 | 78 | self.list_obj.append(item) 79 | self.list_rank.append(int(input_line)) 80 | 81 | 82 | meta_file = open('{0}/data/{1}/gt.yml'.format(self.root, '%02d' % item), 'r') 83 | self.meta[item] = yaml.load(meta_file, Loader=yaml.FullLoader) 84 | # self.pt[item] = ply_vtx('{0}/models/obj_{1}.ply'.format(self.root, '%02d' % item)) 85 | print("Object {0} buffer loaded".format(item)) 86 | 87 | self.length = len(self.list_rgb) 88 | 89 | self.cam_cx = 325.26110 90 | self.cam_cy = 242.04899 91 | self.cam_fx = 572.41140 92 | self.cam_fy = 573.57043 93 | self.intrin_matrix = np.array([[self.cam_fx, 0 , self.cam_cx, 0], 94 | [0, self.cam_fy, self.cam_cy, 0], 95 | [0, 0, 1, 0]]) 96 | 97 | self.xmap = np.array([[j for i in range(640)] for j in range(480)]) 98 | self.ymap = np.array([[i for i in range(640)] for j in range(480)]) 99 | 100 | self.num = num # this if the number of points 101 | self.add_noise = add_noise 102 | self.trancolor = transforms.ColorJitter(0.2, 0.2, 0.2, 0.05) 103 | self.norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 104 | self.border_list = [-1, 40, 80, 120, 160, 200, 240, 280, 320, 360, 400, 440, 480, 520, 560, 600, 640, 680] 105 | self.num_pt_mesh_large = 500 106 | self.num_pt_mesh_small = 500 107 | self.symmetry_obj_idx = [7, 8] 108 | # This is the pixel value for each class, it has the same order with data in the folder 109 | # pixel value == 21 => the object is 01 in the folder 110 | # self.seg_list = [0, 21, 43, 64, 85,106,128, 149, 170, 191, 213, 234, 255] 111 | self.seg_list = [0, 21, 43, 106,128, 170, 191, 213, 234, 255] 112 | self.weight_clsss = np.array([0.5, 1, 1, 1, 1, 1, 1, 1, 1, 1]) 113 | self.cls_indexes = [i for i in range(len(self.seg_list))] 114 | self.num_classes = len(self.seg_list) 115 | self.seg_label_to_gt_label = [0, 1, 2, 5, 6, 8, 9, 10, 11, 12] 116 | self.gt_label_to_seg_label = [0, 1, 2, -1, -1, 3, 4, -1, 5, 6, 7, 8, 9] 117 | self.extents = self.get_extents() 118 | 119 | def get_extents(self): 120 | extents = np.zeros((self.num_classes, 3)) 121 | for i in range(1,len(self.gt_label_to_seg_label)): 122 | if self.gt_label_to_seg_label[i]>0: 123 | pt = ply_vtx('{0}/models/obj_{1}.ply'.format(self.root, '%02d' % i)) 124 | model_points = pt / 1000.0 125 | points_arr = np.array(model_points) 126 | xyz_min = np.min(points_arr, axis = 0) 127 | xyz_max = np.max(points_arr, axis = 0) 128 | extents[self.gt_label_to_seg_label[i], :] = xyz_max - xyz_min 129 | return -np.sort(-extents, axis = 1) 130 | 131 | 132 | 133 | 134 | def __getitem__(self, index): 135 | img = Image.open(self.list_rgb[index]) 136 | ori_img = np.array(img) 137 | depth = np.array(Image.open(self.list_depth[index])) 138 | label_in = np.array(Image.open(self.list_label[index])) 139 | obj = self.list_obj[index] # what the label for this index 140 | rank = self.list_rank[index] 141 | img = np.array(img)[:, :, :3] 142 | img = np.transpose(img, (2, 0, 1)) 
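# At this point (for the standard 640x480 LINEMOD frames) img is a (3, 480, 640)
# channel-first array ready for torch, depth is the raw (480, 640) depth map, and
# label_in is the (480, 640) mask image whose pixel values index into self.seg_list.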
143 | 144 | # Since the second object have more information 145 | if obj == 2: 146 | # if the object is the second object 147 | if not self.vertex_reg_mode: 148 | # if we only need the information of the second object 149 | for i in range(0, len(self.meta[obj][rank])): 150 | if self.meta[obj][rank][i]['obj_id'] == 2: 151 | meta = self.meta[obj][rank][i] 152 | break 153 | else: 154 | # if we need all the information for all objects 155 | meta = self.meta[obj][rank] 156 | else: 157 | meta = self.meta[obj][rank][0] 158 | 159 | label = None 160 | # unique, counts = np.unique(label_in, return_counts=True) 161 | # print(unique, counts) 162 | if self.seg_mode: 163 | label = np.zeros((len(self.seg_list), label_in.shape[0], label_in.shape[1])) 164 | for j in range(len(self.seg_list)): 165 | label[j, :] = label_in == self.seg_list[j] 166 | if not self.onehot: 167 | label = np.argmax(label, axis = 0) 168 | 169 | # plt.imshow(label, cmap = 'hot', interpolation = 'nearest') 170 | # plt.show() 171 | # plt.pause(100) 172 | 173 | # without vertex_reg mod, only return the data for segmentation 174 | if not self.vertex_reg_mode: 175 | return torch.from_numpy(img.astype(np.float32)), \ 176 | torch.from_numpy(label.astype(int)) 177 | 178 | # with vertex_reg mode 179 | # Relate with meta 180 | bboxs = np.zeros((self.num_classes, 5)) # all the bounding box in the same image for different class 181 | extrin_matrixs = np.zeros((self.num_classes, 4, 4)) 182 | extrin_matrixs[:,3,3] = 1 183 | centers = np.zeros((self.num_classes, 2)) 184 | depth_centers = np.zeros((self.num_classes, 1)) 185 | for sub_meta in meta: 186 | seg_index = self.gt_label_to_seg_label[int(sub_meta['obj_id'])] 187 | # preprocess the bounding box information 188 | bboxs[seg_index, 0] = 1 189 | rmin, rmax, cmin, cmax = get_bbox(sub_meta['obj_bb']) 190 | bboxs[seg_index, 1:] = [cmin, rmin, cmax, rmax] 191 | 192 | # preprocess the pose information 193 | extrin_matrixs[seg_index, 0:3, 0:3] = np.resize(np.array(sub_meta['cam_R_m2c']), (3, 3)) 194 | extrin_matrixs[seg_index, 0:3, 3] = np.array(sub_meta['cam_t_m2c']) 195 | obj_center = np.ones((4,1)) 196 | obj_center[0:3, 0] = extrin_matrixs[seg_index, 0:3, 3] 197 | center_homo = self.intrin_matrix.dot(obj_center) 198 | centers[seg_index, :] = center_homo[0:2].reshape(-1)/center_homo[2] 199 | depth_centers[seg_index, :] = extrin_matrixs[seg_index, 2, 3] 200 | if self.onehot: 201 | label_single_channel = np.argmax(label, axis = 0) 202 | else: 203 | label_single_channel = label 204 | vertex_targets, vertex_weights = center_img_gt._vote_centers_train(label_single_channel, self.cls_indexes, 205 | centers, depth_centers, self.num_classes) 206 | 207 | # with vertex reg and hough voting 208 | # load the point cloud and set the size of the point clout to be num_pt_mesh_small 209 | 210 | # meta data include camera intrinsic matrix 211 | meta = np.zeros((48,)) 212 | meta[0] = self.cam_fx 213 | meta[4] = self.cam_fy 214 | meta[2] = self.cam_cx 215 | meta[5] = self.cam_cy 216 | 217 | # gt give to hough voting information to calculate weight 218 | gt_hough = np.zeros((10,1)) 219 | 220 | 221 | 222 | return (torch.from_numpy(img.astype(np.float32)), 223 | torch.from_numpy(label.astype(int)), 224 | torch.from_numpy(vertex_targets.astype(np.float32)), 225 | torch.from_numpy(vertex_weights.astype(np.float32)), 226 | torch.from_numpy(self.extents.astype(np.float32)), 227 | torch.from_numpy(meta.astype(np.float32)), 228 | torch.from_numpy(gt_hough.astype(np.float32)), 229 | 
torch.from_numpy(extrin_matrixs.astype(np.float32)), 230 | torch.from_numpy(bboxs.astype(np.float32))) 231 | 232 | """ 233 | return torch.from_numpy(cloud.astype(np.float32)), \ 234 | torch.LongTensor(choose.astype(np.int32)), \ 235 | self.norm(torch.from_numpy(img_masked.astype(np.float32))), \ 236 | torch.from_numpy(target.astype(np.float32)), \ 237 | torch.from_numpy(model_points.astype(np.float32)), \ 238 | torch.LongTensor([self.objlist.index(obj)]) 239 | """ 240 | 241 | def __len__(self): 242 | return self.length 243 | 244 | def get_sym_list(self): 245 | return self.symmetry_obj_idx 246 | 247 | def get_num_points_mesh(self): 248 | if self.refine: 249 | return self.num_pt_mesh_large 250 | else: 251 | return self.num_pt_mesh_small 252 | 253 | 254 | 255 | border_list = [-1, 40, 80, 120, 160, 200, 240, 280, 320, 360, 400, 440, 480, 520, 560, 600, 640, 680] 256 | img_width = 480 257 | img_length = 640 258 | 259 | def get_bbox(bbox): 260 | bbx = [bbox[1], bbox[1] + bbox[3], bbox[0], bbox[0] + bbox[2]] 261 | if bbx[0] < 0: 262 | bbx[0] = 0 263 | if bbx[1] >= 480: 264 | bbx[1] = 479 265 | if bbx[2] < 0: 266 | bbx[2] = 0 267 | if bbx[3] >= 640: 268 | bbx[3] = 639 269 | rmin, rmax, cmin, cmax = bbx[0], bbx[1], bbx[2], bbx[3] 270 | r_b = rmax - rmin 271 | for tt in range(len(border_list)): 272 | if r_b > border_list[tt] and r_b < border_list[tt + 1]: 273 | r_b = border_list[tt + 1] 274 | break 275 | c_b = cmax - cmin 276 | for tt in range(len(border_list)): 277 | if c_b > border_list[tt] and c_b < border_list[tt + 1]: 278 | c_b = border_list[tt + 1] 279 | break 280 | center = [int((rmin + rmax) / 2), int((cmin + cmax) / 2)] 281 | rmin = center[0] - int(r_b / 2) 282 | rmax = center[0] + int(r_b / 2) 283 | cmin = center[1] - int(c_b / 2) 284 | cmax = center[1] + int(c_b / 2) 285 | if rmin < 0: 286 | delt = -rmin 287 | rmin = 0 288 | rmax += delt 289 | if cmin < 0: 290 | delt = -cmin 291 | cmin = 0 292 | cmax += delt 293 | if rmax > 480: 294 | delt = rmax - 480 295 | rmax = 480 296 | rmin -= delt 297 | if cmax > 640: 298 | delt = cmax - 640 299 | cmax = 640 300 | cmin -= delt 301 | return rmin, rmax, cmin, cmax 302 | 303 | 304 | def ply_vtx(path): 305 | f = open(path) 306 | assert f.readline().strip() == "ply" 307 | f.readline() 308 | f.readline() 309 | N = int(f.readline().split()[-1]) 310 | while f.readline().strip() != "end_header": 311 | continue 312 | pts = [] 313 | for _ in range(N): 314 | pts.append(np.float32(f.readline().split()[:3])) 315 | return np.array(pts) 316 | -------------------------------------------------------------------------------- /eval_net.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 15 11:04:43 2019 4 | 5 | @author: sunhu 6 | """ 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | import torch.nn.functional as F 12 | from sklearn.metrics import average_precision_score as ap_score 13 | from tqdm import tqdm 14 | 15 | 16 | 17 | def cal_AP(testloader, net, criterion, device, num_obj, opt): 18 | ''' 19 | Calculate Average Precision 20 | Evaluation for the semantic segmentation part 21 | ''' 22 | cnt = 0 23 | aps = [] 24 | with torch.no_grad(): 25 | net = net.eval() 26 | preds = [[] for _ in range(num_obj)] 27 | heatmaps = [[] for _ in range(num_obj)] 28 | for data in tqdm(testloader): 29 | if opt.vertex_reg == True: 30 | # Only train the center-voting part 31 | images, labels, vertex_targets, vertex_weights, extents = data 32 
| images = images.to(device) 33 | labels = labels.type('torch.LongTensor').to(device) 34 | extents = extents.to(device) 35 | output_seg, _, _ = net(images, extents) 36 | else: 37 | # Only train the segmentation part 38 | images, labels = data 39 | images = images.to(device) 40 | labels = labels.type('torch.LongTensor').to(device) 41 | output_seg = net(images) 42 | output = output_seg.cpu().numpy() 43 | for c in range(num_obj): 44 | preds[c].append(output[:, c].reshape(-1)) 45 | heatmaps[c].append(labels[:, c].cpu().numpy().reshape(-1)) 46 | 47 | for c in range(num_obj): 48 | preds[c] = np.concatenate(preds[c]) 49 | heatmaps[c] = np.concatenate(heatmaps[c]) 50 | if heatmaps[c].max() == 0: 51 | ap = float('nan') 52 | else: 53 | ap = ap_score(heatmaps[c], preds[c]) 54 | aps.append(ap) 55 | print("AP = {}".format(ap)) 56 | 57 | # print(losses / cnt) 58 | return aps -------------------------------------------------------------------------------- /lib/HoughVoting/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nxu96/PoseCNN_PyTorch/92a9a005f3e4f61540cee11650b4288c615b9beb/lib/HoughVoting/.gitignore -------------------------------------------------------------------------------- /lib/HoughVoting/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pose CNN Pytorch Implementation: Hough Voting Layer 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # -------------------------------------------------------- 5 | -------------------------------------------------------------------------------- /lib/HoughVoting/houghvoting.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "opencv2/opencv.hpp" 11 | 12 | #include 13 | // #include "opencv2/core/matx.hpp" 14 | #include "opencv2/core/core.hpp" 15 | // #include 16 | #include 17 | #include "torch/script.h" 18 | #include 19 | #include 20 | 21 | #define VERTEX_CHANNELS 3 22 | using namespace torch; 23 | // typedef Eigen::Matrix Vec; 24 | using namespace at; 25 | 26 | int clamp(int val, int min_val, int max_val) 27 | { 28 | return std::max(min_val, std::min(max_val, val)); 29 | } 30 | 31 | // // Hough Voting layer main function 32 | // std::vector> Forward(const Tensor& label, const Tensor& vertex, const Tensor& extents, //const Tensor& meta_data, const Tensor& gt, int is_train); 33 | //std::vector> Forward(const Tensor& label, const Tensor& vertex, const Tensor& extents, //const Tensor& meta_data, const Tensor& gt, int is_train); 34 | torch::Tensor Forward(const Tensor& label, const Tensor& vertex, const Tensor& extents, const Tensor& meta_data, const Tensor& gt, int is_train); 35 | 36 | 37 | // Get ground truth model 3D geometry 38 | void getBb3Ds(const Tensor& extents, std::vector>& bb3Ds, int num_classes); 39 | 40 | // // Get ground truth model 3D bounding box 41 | inline std::vector getBB3D(const cv::Vec& extent); 42 | 43 | // // Projected 2D Bounding box 44 | // inline cv::Rect getBB2D(int imageWidth, int imageHeight, const std::vector& bb3D, const cv::Mat& camMat, const cv::Mat& rvec, const cv::Mat& tvec); 45 | 46 | inline float angle_distance(cv::Point2f x, cv::Point2f n, cv::Point2f p); 47 | 48 | void projectPoints(std::vector bb3Ds, float& bb_distance, Eigen::MatrixXf camMat, std::vector& bb2D); 
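// Each ROI pushed into `outputs` below is a 14-element vector laid out as:
//   [0] batch index                [1] class id (-1 marks the dummy "no detection" entry)
//   [2..5] x1, y1, x2, y2 of the 2D box (voted center +/- half width/height, padded by 5%)
//   [6] hough vote count           [7..10] identity quaternion placeholder (1, 0, 0, 0)
//   [11..13] estimated translation (rx * d, ry * d, d) recovered from the voted center and depth d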
49 | // Hough voting functionality 50 | void hough_voting(const Tensor& v_label, const Tensor& v_vertex, const int labelmap, const int vertmap, std::vector> bb3Ds, int batch, int height, int width, int num_classes, int is_train,float fx, float fy, float px, float py, std::vector >& outputs); 51 | 52 | // // Find better bb2D geometry 53 | // inline void compute_width_height(const Tensor& label, const Tensor& vertex, const int labelmap, const int vertmap, cv::Point2f center, std::vector> bb3Ds, cv::Mat camMat, float inlierThreshold, int height, int width, int channel, int num_classes, int & bb_width, int & bb_height, float & bb_distance); 54 | inline void compute_width_height(const Tensor& v_label, const Tensor& v_vertex, const int labelmap, const float vertmap, cv::Point2f center, std::vector> bb3Ds, Eigen::Matrix3f camMat,float inlierThreshold, int height, int width, int channel, int num_classes, int & bb_width, int & bb_height, float & bb_distance); 55 | 56 | // /////////////////////////////// 57 | 58 | // std::vector> Forward(const Tensor& label, const Tensor& vertex, const Tensor& extents, //const Tensor& meta_data, const Tensor& gt, int is_train) 59 | //std::vector> Forward(const Tensor& label, const Tensor& vertex, const Tensor& extents, //const Tensor& meta_data, const Tensor& gt, int is_train) 60 | torch::Tensor Forward(const Tensor& label, const Tensor& vertex, const Tensor& extents, const Tensor& meta_data, const Tensor& gt, int is_train) 61 | 62 | { 63 | // Grab the input tensor 64 | /////////////////////////////// 65 | // format of the meta_data 66 | // intrinsic matrix: meta_data[0 ~ 8] 67 | // inverse intrinsic matrix: meta_data[9 ~ 17] 68 | // pose_world2live: meta_data[18 ~ 29] 69 | // pose_live2world: meta_data[30 ~ 41] 70 | // voxel step size: meta_data[42, 43, 44] 71 | // voxel min value: meta_data[45, 46, 47] 72 | auto v_meta_data = meta_data.view(-1); 73 | 74 | // const float* v_gt = gt.flat().data(); 75 | // const float* v_gt = >.view(-1); 76 | auto v_gt = gt.view(-1); 77 | // batch size 78 | int batch_size = label.size(0); 79 | // height 80 | int height = label.size(1); 81 | // width 82 | int width = label.size(2); 83 | 84 | auto v_label = label.contiguous().view(-1); 85 | auto v_vertex = vertex.contiguous().view(-1); 86 | // num of classes 87 | int num_classes = vertex.size(3) / VERTEX_CHANNELS; 88 | int num_meta_data = meta_data.size(1); 89 | // int num_gt = gt.size(0); 90 | 91 | std::vector > outputs; 92 | auto v_extents = extents.view(-1); 93 | std::vector> bb3Ds; 94 | 95 | getBb3Ds(v_extents, bb3Ds, num_classes); 96 | 97 | int index_meta_data = 0; 98 | float fx, fy, px, py; 99 | auto acc_v_meta_data = v_meta_data.accessor(); 100 | 101 | for (int n = 0; n < batch_size; n++) 102 | { 103 | // these map are the starting index 104 | const int labelmap = n * height * width; 105 | const int vertmap = n * height * width * VERTEX_CHANNELS * num_classes; 106 | // find camera parameters 107 | fx = acc_v_meta_data[index_meta_data + 0]; 108 | fy = acc_v_meta_data[index_meta_data + 4]; 109 | px = acc_v_meta_data[index_meta_data + 2]; 110 | py = acc_v_meta_data[index_meta_data + 5]; 111 | 112 | hough_voting(v_label, v_vertex, labelmap, vertmap, bb3Ds, n, height, width, num_classes, is_train, fx, fy, px, py, outputs); 113 | 114 | index_meta_data += num_meta_data; 115 | } 116 | if (outputs.size() == 0) 117 | { 118 | std::cout << "no detection" << std::endl; 119 | // add a dummy detection to the output 120 | cv::Vec roi; 121 | roi(0) = 0; 122 | roi(1) = -1; 123 | 
outputs.push_back(roi); 124 | } 125 | // to change the datatype from vector to tensor 126 | int n_output = outputs.size(); 127 | int size_single_roi = outputs[0].rows; 128 | // std::cout<<"Size_single_roi: "<> result(n_output, std::vector(size_single_roi)); 139 | // for (int i = 0; i> bb3Ds, int batch, int height, int width, int num_classes, int is_train, float fx, float fy, float px, float py, std::vector >& outputs){ 162 | 163 | float inlierThreshold = 0.9; 164 | int votingThreshold = 50; 165 | 166 | // camera intrinsic matrix 3 X 3 167 | // cv::Mat camMat=cv::Mat::zeros(3,3,CV_32F); 168 | // int sz[] = {3,3}; 169 | // cv::Mat camMat; 170 | // camMat.create(2,sz,CV_32FC1); 171 | // camMat = Scalar(0); 172 | // std::vector > camMat(3, std::vector(0)); 173 | // camMat.at([0][0]) = fx; 174 | 175 | 176 | // cv::Mat_ camMat = cv::Mat_::zeros(3, 3); 177 | // camMat(0, 0) = fx; 178 | // camMat(1, 1) = fy; 179 | // camMat(2, 2) = 1.f; 180 | // camMat(0, 2) = px; 181 | // camMat(1, 2) = py; 182 | 183 | 184 | // Xu Ning 185 | Eigen::Matrix3f camMat; 186 | // camMat << fx, 0.0, px. 187 | // 0.0, fy, py, 188 | // 0.0, 0.0, 1.0; 189 | camMat(0,0) = fx; 190 | camMat(1,1) = fy; 191 | camMat(2,2) = 1.f; 192 | camMat(0,2) = px; 193 | camMat(1,2) = py; 194 | camMat(0,1) = 0; 195 | camMat(1,0) = 0; 196 | camMat(2,0) = 0; 197 | camMat(2,1) = 0; 198 | 199 | 200 | // initialize hough space 201 | // H X W X N integer 202 | int* hough_space = (int*)malloc(sizeof(int) * height * width * num_classes); 203 | // Initialize all values to 0 204 | memset(hough_space, 0, height * width * num_classes); 205 | // N integer 206 | int* flags = (int*)malloc(sizeof(int) * num_classes); 207 | // Initialize all values in memory space to 0 208 | memset(flags, 0, num_classes); 209 | auto acc_label = v_label.accessor(); 210 | auto acc_vertex = v_vertex.accessor(); 211 | // for each pixel 212 | for (int x = 0; x < width; x++) 213 | { 214 | for (int y = 0; y < height; y++) 215 | { 216 | // here need to understand the value of label map 217 | int c = acc_label[labelmap+y * width + x]; // label map is one dimension array contains pixel wise image label map, map to class 1-13 etc.. 218 | if (c > 0) // this pixel is in this object class 219 | { 220 | flags[c] = 1; // this is a flag of whether there is this object in this image. 
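// The flattened vertex map has shape (H, W, 3 * num_classes): for pixel (x, y), the three
// channels belonging to class c hold the predicted unit direction (u, v) towards that
// class's object center plus a depth encoding. Each foreground pixel below walks a ray
// along (u, v) and casts a vote for the hough-space cells it passes through.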
221 | // read the predict center direction 222 | int offset = VERTEX_CHANNELS * c + VERTEX_CHANNELS * num_classes * (y * width + x); // Don't understand this 223 | float u = acc_vertex[vertmap+offset]; 224 | float v = acc_vertex[vertmap+offset + 1]; 225 | float norm = sqrt(u * u + v * v); // u and v here are the delta_x and delta_y 226 | u /= norm; 227 | v /= norm;// (u,v) is the unit vector indicates center direction 228 | 229 | // voting 230 | float delta = 1.0 / fabs(u); 231 | float cx = x; 232 | float cy = y; 233 | while(1) 234 | { 235 | cx += delta * u; 236 | cy += delta * v; 237 | int center_x = int(cx); 238 | int center_y = int(cy); 239 | if (center_x >= 0 && center_x < width && center_y >= 0 && center_y < height) 240 | { 241 | offset = c + num_classes * (center_y * width + center_x); 242 | hough_space[offset] += 1; 243 | } 244 | else 245 | break; 246 | } 247 | } 248 | } 249 | } 250 | // find the maximum in hough space 251 | for (int c = 1; c < num_classes; c++) 252 | { 253 | if (flags[c]) 254 | { 255 | int max_vote = 0; 256 | int max_x, max_y; 257 | for (int x = 0; x < width; x++) 258 | { 259 | for (int y = 0; y < height; y++) 260 | { 261 | int offset = c + num_classes * (y * width + x); 262 | if (hough_space[offset] > max_vote) 263 | { 264 | max_vote = hough_space[offset]; 265 | max_x = x; 266 | max_y = y; 267 | } 268 | } 269 | } 270 | if (max_vote < votingThreshold) 271 | continue; 272 | 273 | // center 274 | cv::Point2f center(max_x, max_y); 275 | int bb_width, bb_height; 276 | float bb_distance; 277 | 278 | compute_width_height(v_label, v_vertex, labelmap, vertmap, center, bb3Ds, camMat, inlierThreshold, height, width, c, num_classes, bb_width, bb_height, bb_distance); 279 | 280 | // construct output 281 | cv::Vec roi; 282 | roi(0) = batch; //batch number index 0 to batchsize -1 283 | roi(1) = c; //cls number index 1 to 13 284 | 285 | // bounding box 286 | float scale = 0.05; 287 | roi(2) = center.x - bb_width * (0.5 + scale); 288 | roi(3) = center.y - bb_height * (0.5 + scale); 289 | roi(4) = center.x + bb_width * (0.5 + scale); 290 | roi(5) = center.y + bb_height * (0.5 + scale); 291 | // score 292 | roi(6) = max_vote; 293 | 294 | // pose 295 | float rx = (center.x - px) / fx; 296 | float ry = (center.y - py) / fy; 297 | roi(7) = 1; 298 | roi(8) = 0; 299 | roi(9) = 0; 300 | roi(10) = 0; 301 | roi(11) = rx * bb_distance; 302 | roi(12) = ry * bb_distance; 303 | roi(13) = bb_distance; 304 | 305 | outputs.push_back(roi); 306 | // ///////////// 307 | // // TODO 308 | // if (is_train) 309 | // { 310 | // // add jittering rois 311 | // float x1 = roi(2); 312 | // float y1 = roi(3); 313 | // float x2 = roi(4); 314 | // float y2 = roi(5); 315 | // float ww = x2 - x1; 316 | // float hh = y2 - y1; 317 | 318 | // // (-1, -1) 319 | // roi(2) = x1 - 0.05 * ww; 320 | // roi(3) = y1 - 0.05 * hh; 321 | // roi(4) = roi(2) + ww; 322 | // roi(5) = roi(3) + hh; 323 | // outputs.push_back(roi); 324 | 325 | // // (+1, -1) 326 | // roi(2) = x1 + 0.05 * ww; 327 | // roi(3) = y1 - 0.05 * hh; 328 | // roi(4) = roi(2) + ww; 329 | // roi(5) = roi(3) + hh; 330 | // outputs.push_back(roi); 331 | 332 | // // (-1, +1) 333 | // roi(2) = x1 - 0.05 * ww; 334 | // roi(3) = y1 + 0.05 * hh; 335 | // roi(4) = roi(2) + ww; 336 | // roi(5) = roi(3) + hh; 337 | // outputs.push_back(roi); 338 | 339 | // // (+1, +1) 340 | // roi(2) = x1 + 0.05 * ww; 341 | // roi(3) = y1 + 0.05 * hh; 342 | // roi(4) = roi(2) + ww; 343 | // roi(5) = roi(3) + hh; 344 | // outputs.push_back(roi); 345 | 346 | // // (0, -1) 347 | // roi(2) = 
x1; 348 | // roi(3) = y1 - 0.05 * hh; 349 | // roi(4) = roi(2) + ww; 350 | // roi(5) = roi(3) + hh; 351 | // outputs.push_back(roi); 352 | 353 | // // (-1, 0) 354 | // roi(2) = x1 - 0.05 * ww; 355 | // roi(3) = y1; 356 | // roi(4) = roi(2) + ww; 357 | // roi(5) = roi(3) + hh; 358 | // outputs.push_back(roi); 359 | 360 | // // (0, +1) 361 | // roi(2) = x1; 362 | // roi(3) = y1 + 0.05 * hh; 363 | // roi(4) = roi(2) + ww; 364 | // roi(5) = roi(3) + hh; 365 | // outputs.push_back(roi); 366 | 367 | // // (+1, 0) 368 | // roi(2) = x1 + 0.05 * ww; 369 | // roi(3) = y1; 370 | // roi(4) = roi(2) + ww; 371 | // roi(5) = roi(3) + hh; 372 | // outputs.push_back(roi); 373 | // } 374 | } 375 | } 376 | } 377 | 378 | 379 | // get 3D bounding boxes 380 | void getBb3Ds(const Tensor& extents, std::vector>& bb3Ds, int num_classes) 381 | { 382 | // for each object 383 | auto acc_extents = extents.packed_accessor(); 384 | for (int i = 1; i < num_classes; i++) 385 | { 386 | cv::Vec extent; 387 | 388 | extent(0) = acc_extents[i * 3]; 389 | extent(1) = acc_extents[i * 3 + 1]; 390 | extent(2) = acc_extents[i * 3 + 2]; 391 | bb3Ds.push_back(getBB3D(extent)); 392 | } 393 | } 394 | 395 | 396 | inline std::vector getBB3D(const cv::Vec& extent) 397 | { 398 | std::vector bb; 399 | float xHalf = extent[0] * 0.5; 400 | float yHalf = extent[1] * 0.5; 401 | float zHalf = extent[2] * 0.5; 402 | 403 | bb.push_back(cv::Point3f(xHalf, yHalf, zHalf)); 404 | bb.push_back(cv::Point3f(-xHalf, yHalf, zHalf)); 405 | bb.push_back(cv::Point3f(xHalf, -yHalf, zHalf)); 406 | bb.push_back(cv::Point3f(-xHalf, -yHalf, zHalf)); 407 | 408 | bb.push_back(cv::Point3f(xHalf, yHalf, -zHalf)); 409 | bb.push_back(cv::Point3f(-xHalf, yHalf, -zHalf)); 410 | bb.push_back(cv::Point3f(xHalf, -yHalf, -zHalf)); 411 | bb.push_back(cv::Point3f(-xHalf, -yHalf, -zHalf)); 412 | 413 | return bb; 414 | } 415 | 416 | 417 | // inline cv::Rect getBB2D(int imageWidth, int imageHeight, const std::vector& bb3D, const cv::Mat& camMat, const cv::Mat& rvec, const cv::Mat& tvec) 418 | // { 419 | // // project 3D bounding box vertices into the image 420 | // std::vector bb2D; 421 | // cv::projectPoints(bb3D, rvec, tvec, camMat, cv::Mat(), bb2D); 422 | 423 | // // get min-max of projected vertices 424 | // int minX = imageWidth - 1; 425 | // int maxX = 0; 426 | // int minY = imageHeight - 1; 427 | // int maxY = 0; 428 | 429 | // for(unsigned j = 0; j < bb2D.size(); j++) 430 | // { 431 | // minX = std::min((float) minX, bb2D[j].x); 432 | // minY = std::min((float) minY, bb2D[j].y); 433 | // maxX = std::max((float) maxX, bb2D[j].x); 434 | // maxY = std::max((float) maxY, bb2D[j].y); 435 | // } 436 | 437 | // // clamp at image border 438 | // minX = clamp(minX, 0, imageWidth - 1); 439 | // maxX = clamp(maxX, 0, imageWidth - 1); 440 | // minY = clamp(minY, 0, imageHeight - 1); 441 | // maxY = clamp(maxY, 0, imageHeight - 1); 442 | 443 | // return cv::Rect(minX, minY, (maxX - minX + 1), (maxY - minY + 1)); 444 | // } 445 | 446 | inline void compute_width_height(const Tensor& v_label, const Tensor& v_vertex, const int labelmap, const float vertmap, cv::Point2f center, std::vector> bb3Ds, Eigen::Matrix3f camMat, float inlierThreshold, int height, int width, int channel, int num_classes, int & bb_width, int & bb_height, float & bb_distance) 447 | { 448 | float d = 0; 449 | int count = 0; 450 | 451 | // for each pixel 452 | std::vector dx; 453 | std::vector dy; 454 | auto acc_label = v_label.accessor(); 455 | auto acc_vertex = v_vertex.accessor(); 456 | for (int x = 0; x < width; x++) 
457 | { 458 | for (int y = 0; y < height; y++) 459 | { 460 | if (acc_label[labelmap+y * width + x] == channel) 461 | { 462 | cv::Point2f point(x, y); 463 | int offset = VERTEX_CHANNELS * channel + VERTEX_CHANNELS * num_classes * (y * width + x); 464 | float u = acc_vertex[vertmap+offset]; 465 | float v = acc_vertex[vertmap+offset + 1]; 466 | float distance = exp(acc_vertex[vertmap+offset + 2]/1000.0); 467 | float norm = sqrt(u * u + v * v); 468 | u /= norm; 469 | v /= norm; 470 | cv::Point2f direction(u, v); 471 | 472 | // inlier check 473 | if(angle_distance(center, direction, point) > inlierThreshold) 474 | { 475 | dx.push_back(fabs(point.x - center.x)); 476 | dy.push_back(fabs(point.y - center.y)); 477 | d += distance; 478 | count++; 479 | } 480 | } 481 | } 482 | } 483 | bb_distance = d / count; 484 | // estimate a projection 485 | // cv::Mat tvec(3, 1, CV_64F); 486 | // cv::Mat rvec(3, 1, CV_64F); 487 | // for(int i = 0; i < 3; i++) 488 | // { 489 | // tvec.at(i, 0) = 0.0; 490 | // rvec.at(i, 0) = 0.0; 491 | // } 492 | // tvec.at(2, 0) = bb_distance; 493 | // // jp::cv_trans_t pose(rvec, tvec); 494 | 495 | // std::vector bb2D; 496 | // // cv::projectPoints(bb3Ds[objID-1], pose.first, pose.second, camMat, cv::Mat(), bb2D); 497 | // cv::projectPoints(bb3Ds[channel-1], rvec, tvec, camMat, cv::Mat(), bb2D); 498 | 499 | //Xu Ning 500 | Eigen::MatrixXf tvec(3,1); 501 | Eigen::MatrixXf rvec(3,1); 502 | for(int i = 0; i < 3; i++) 503 | { 504 | tvec(i,0) = 0.0; 505 | rvec(i,0) = 0.0; 506 | } 507 | tvec(2, 0) = bb_distance; 508 | std::vector bb2D; 509 | projectPoints(bb3Ds[channel-1], bb_distance, camMat, bb2D); 510 | 511 | 512 | // get min-max of projected vertices 513 | int minX = 1e8; 514 | int maxX = -1e8; 515 | int minY = 1e8; 516 | int maxY = -1e8; 517 | for(unsigned int i = 0; i < bb2D.size(); i++) 518 | { 519 | minX = std::min((float) minX, bb2D[i].x); 520 | minY = std::min((float) minY, bb2D[i].y); 521 | maxX = std::max((float) maxX, bb2D[i].x); 522 | maxY = std::max((float) maxY, bb2D[i].y); 523 | } 524 | cv::Rect bb = cv::Rect(0, 0, (maxX - minX + 1), (maxY - minY + 1)); 525 | std::vector::iterator it; 526 | it = std::remove_if(dx.begin(), dx.end(), std::bind2nd(std::greater(), std::max(bb.width, bb.height) )); 527 | dx.erase(it, dx.end()); 528 | 529 | it = std::remove_if(dy.begin(), dy.end(), std::bind2nd(std::greater(), std::max(bb.width, bb.height) )); 530 | dy.erase(it, dy.end()); 531 | std::sort(dx.begin(), dx.end()); 532 | std::sort(dy.begin(), dy.end()); 533 | int index1 = int(dx.size() * 0.95); 534 | int index2 = int(dy.size() * 0.95); 535 | if (dx.size() == 0 || dy.size() == 0){ 536 | bb_width = 2; 537 | bb_height = 2; 538 | }else{ 539 | bb_width = 2 * dx[index1]; 540 | bb_height = 2 * dy[index2]; 541 | } 542 | } 543 | 544 | void projectPoints(std::vector bb3Ds, float& bb_distance, Eigen::MatrixXf camMat, std::vector& bb2D){ 545 | Eigen::MatrixXf extrinsic = Eigen::MatrixXf::Zero(3,4); 546 | Eigen::MatrixXf intrinsic(3,3); 547 | // extrinsic << 1,0,0,0,0,1,0,0,0,0,1,bb_distance; 548 | extrinsic(0,0) = 1; 549 | extrinsic(1,1) = 1; 550 | extrinsic(2,2) = 1; 551 | extrinsic(2,3) = bb_distance; 552 | intrinsic = camMat; 553 | // Eigen::Matrix2d mat; 554 | // mat << 1, 2, 555 | // 3, 4; 556 | // Eigen::Vector2d u(-1,1), v(2,0); 557 | // std::cout << "Here is mat*mat:\n" << mat*u << std::endl; 558 | 559 | for (auto bb3D:bb3Ds){ 560 | cv::Point2f pt(0,0); 561 | Eigen::Vector4f vecpt(bb3D.x, bb3D.y, bb3D.z, 1); 562 | // extrinsic*vecpt; 563 | Eigen::Vector3f pt_vec = intrinsic * 
extrinsic * vecpt; 564 | pt.x = pt_vec[0]/pt_vec[2]; 565 | pt.y = pt_vec[1]/pt_vec[2]; 566 | bb2D.push_back(pt); 567 | } 568 | } 569 | 570 | inline float angle_distance(cv::Point2f x, cv::Point2f n, cv::Point2f p) 571 | { 572 | return n.dot(x - p) / (cv::norm(n) * cv::norm(x - p)); 573 | } 574 | 575 | PYBIND11_MODULE(HoughVoting, m) { 576 | m.def("forward", &Forward, "HoughVoting forward"); 577 | // m.def("backward", &Backward, "HoughVoting backward"); 578 | } 579 | 580 | 581 | -------------------------------------------------------------------------------- /lib/HoughVoting/houghvoting.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import HoughVoting 5 | 6 | # class HoughVoting(nn.Module): 7 | # # initialization 8 | # def __init__(self): 9 | # self.vertex_channels = 3 10 | 11 | # def forward(self, label, vertex, extents, meta_data, gt, is_train): 12 | ## label : (batch size, height, weight) 13 | ## vertex: b, h, w, 3 * num_cls 14 | 15 | 16 | # # flatten 17 | # v_meta_data = meta_data.view(-1) 18 | # v_gt = gt.view(-1) 19 | # v_extents = extents.view(-1) 20 | # # batch size 21 | # batch_size = label.shape[0] 22 | # # height 23 | # height = label.shape[1] 24 | # # width 25 | # width = label.shape[2] 26 | # # num of cls 27 | # num_cls = vertex.shape[3] / self.vertex_channels 28 | # # num of meta data 29 | # num_meta_data = meta_data.shape[3] 30 | # # num of gt 31 | # num_gt = gt.shape[0] 32 | 33 | # getBb3Ds(v_extents, num_cls) 34 | # index_meta_data = 0 35 | # # for each image run hough voting 36 | # for n in range(batch_size): 37 | # idx_label = 38 | # idx_vertex = 39 | # fx = v_meta_data(index_meta_data+0) 40 | # fy = v_meta_data(index_meta_data+4) 41 | # px = v_meta_data(index_meta_data+2) 42 | # py = v_meta_data(index_meta_data+5) 43 | # outputs = voting(idx_label, idx_vertex, label, vertex, bb3Ds, n, height, weight, num_cls, is_train, fx, fy, px ,py) 44 | # index_meta_data = index_meta_data + 1 45 | # if (outputs.size() == 0): 46 | # print("No detection") 47 | # # add a dummy detection to the output? 
48 | # roi = torch.empty((14,1)) 49 | # roi[0] = 0 50 | # roi[1] = -1 51 | # # add back to outputs 52 | # outputs[] 53 | # class HoughVotingFunction(torch.autograd.Function): 54 | # @staticmethod 55 | # def forward(ctx, input, weights, bias, old_h, old_cell): 56 | # outputs = HoughVoting.forward(label, vertex, extents, meta_data, gt, is_train =True) 57 | # # new_h, new_cell = outputs[:2] 58 | # # variables = outputs[1:] + [weights] 59 | # # ctx.save_for_backward(*variables) 60 | # return outputs 61 | 62 | # @staticmethod 63 | # def backward(ctx, grad_h, grad_cell): 64 | # outputs = lltm_cpp.backward( 65 | # grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables) 66 | # d_old_h, d_input, d_weights, d_bias, d_old_cell = outputs 67 | # return d_input, d_weights, d_bias, d_old_h, d_old_cell 68 | 69 | 70 | class HF(torch.nn.Module): 71 | def __init__(self): 72 | super(HF, self).__init__() 73 | # self.input_features = input_features 74 | # self.state_size = state_size 75 | # self.weights = torch.nn.Parameter( 76 | # torch.empty(3 * state_size, input_features + state_size)) 77 | # self.bias = torch.nn.Parameter(torch.empty(3 * state_size)) 78 | # self.reset_parameters() 79 | self.vertex_channels = 3 80 | 81 | # def reset_parameters(self): 82 | # stdv = 1.0 / math.sqrt(self.state_size) 83 | # for weight in self.parameters(): 84 | # weight.data.uniform_(-stdv, +stdv) 85 | 86 | def forward(self, label, vertex, extents, meta_data, gt, is_train): 87 | outputs = HoughVoting.forward(label, vertex, extents, meta_data, gt, is_train) 88 | return outputs 89 | 90 | def backward(self, label, vertex): 91 | label_grad = torch.zeros(label.size()) 92 | vertex_grad = torch.zeros(vertex.size()) 93 | return label_grad, vertex_grad 94 | -------------------------------------------------------------------------------- /lib/HoughVoting/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | # from torch.utils.cpp_extension import CUDAExtension, BuildExtension 3 | from torch.utils.cpp_extension import CppExtension, BuildExtension 4 | 5 | 6 | setup(name='HoughVoting', 7 | ext_modules=[CppExtension('HoughVoting', ['houghvoting.cc'], 8 | library_dirs = ['/home/parallels/conda/lib/'])], 9 | cmdclass={'build_ext': BuildExtension}) 10 | 11 | -------------------------------------------------------------------------------- /lib/center_est_funcs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 21 15:14:11 2019 4 | 5 | @author: Junzhe Xu 6 | """ 7 | 8 | import numpy as np 9 | import torch 10 | 11 | 12 | 13 | # FOR CENTER ESTIMATION 14 | 15 | # Center-voting for validation 16 | def _vote_centers_val(im_label, cls_indexes, centers, poses, num_classes, extents): 17 | width = im_label.shape[1] 18 | height = im_label.shape[0] 19 | vertex_targets = np.zeros((height, width, 3), dtype=np.float32) 20 | center = np.zeros((2, 1), dtype=np.float32) 21 | 22 | for i in range(1, num_classes): 23 | y, x = np.where(im_label == i) 24 | I = np.where(im_label == i) 25 | ind = np.where(cls_indexes == i)[0] 26 | 27 | if len(x) > 0 and len(ind) > 0: 28 | center[0] = centers[ind, 0] 29 | center[1] = centers[ind, 1] 30 | z = poses[2, 3, ind] 31 | R = np.tile(center, (1, len(x))) - np.vstack((x, y)) 32 | # compute the norm 33 | N = np.linalg.norm(R, axis=0) + 1e-10 34 | # normalization 35 | R = np.divide(R, np.tile(N, (2,1))) 36 | # assignment 37 | vertex_targets[y, x, 0] = R[0,:] 
38 | vertex_targets[y, x, 1] = R[1,:] 39 | vertex_targets[y, x, 2] = z 40 | 41 | return vertex_targets 42 | 43 | 44 | 45 | 46 | # Center voting for training 47 | def _vote_centers_train(im_label, cls_indexes, center, depth_centers, num_classes): 48 | height = im_label.shape[0] 49 | width = im_label.shape[1] 50 | vertex_targets = np.zeros((3*num_classes, height, width), dtype=np.float32) 51 | vertex_weights = np.zeros(vertex_targets.shape, dtype=np.float32) 52 | c = np.zeros((2, 1), dtype=np.float32) 53 | 54 | for i in range(1, num_classes): 55 | y, x = np.where(im_label == i) 56 | if len(x) > 0: 57 | c[0] = center[i, 0] 58 | c[1] = center[i, 1] 59 | R = np.tile(c, (1, len(x))) - np.vstack((x, y)) 60 | # compute the norm 61 | N = np.linalg.norm(R, axis=0) + 1e-10 62 | # normalization 63 | R = np.divide(R, np.tile(N, (2,1))) 64 | # assignment 65 | start = 3 * i 66 | end = start + 3 67 | vertex_targets[3*i, y, x] = R[0,:] 68 | vertex_targets[3*i+1, y, x] = R[1,:] 69 | vertex_targets[3*i+2, y, x] = depth_centers[i, 0] 70 | vertex_weights[start:end, y, x] = 10.0 71 | 72 | return vertex_targets, vertex_weights 73 | 74 | 75 | 76 | def smooth_l1_loss_vertex(vertex_pred, vertex_targets, vertex_weights, sigma=1.0, VERTEX_W=5.0): 77 | 78 | sigma_2 = sigma ** 2 79 | vertex_diff = vertex_pred - vertex_targets 80 | diff = vertex_weights * vertex_diff 81 | abs_diff = torch.abs(diff) 82 | smoothL1_sign = torch.tensor((abs_diff < 1. / sigma_2).float(), requires_grad=False) 83 | in_loss = torch.pow(diff, 2) * (sigma_2 / 2.) * smoothL1_sign \ 84 | + (abs_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign) 85 | loss = torch.div(torch.sum(in_loss), torch.sum(vertex_weights) + 1e-10 ) 86 | loss = VERTEX_W * torch.tensor(loss, requires_grad=True) 87 | 88 | return loss 89 | -------------------------------------------------------------------------------- /lib/loss_funcions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | ####################################################################### 7 | ########## Loss functions adapted from the tensorflow version ######### 8 | ####################################################################### 9 | 10 | def loss_cross_entropy_single_frame(scores, labels): 11 | """ 12 | scores: a tensor [batch_size, height, width, num_classes] 13 | labels: a tensor [batch_size, height, width, num_classes] 14 | """ 15 | 16 | with tf.name_scope('loss'): 17 | cross_entropy = -tf.reduce_sum(labels * scores, reduction_indices=[3]) 18 | loss = tf.div(tf.reduce_sum(cross_entropy), tf.reduce_sum(labels)+1e-10) 19 | 20 | return loss 21 | 22 | 23 | def torch_loss_cross_entropy_single_frame(scores, labels): 24 | 25 | """ 26 | scores: a tensor [batch_size, height, width, num_classes] 27 | labels: a tensor [batch_size, height, width, num_classes] 28 | """ 29 | 30 | cross_entropy = -torch.sum(labels * scores, dim=3) 31 | loss = torch.div(torch.sum(cross_entropy), torch.sum(labels)+1e-10) 32 | loss = torch.tensor(loss, requires_grad=True) 33 | 34 | return loss 35 | 36 | ########################################################################### 37 | 38 | def smooth_l1_loss_vertex(vertex_pred, vertex_targets, vertex_weights, sigma=1.0): 39 | 40 | sigma_2 = sigma ** 2 41 | vertex_diff = vertex_pred - vertex_targets 42 | diff = tf.multiply(vertex_weights, vertex_diff) 43 | abs_diff = tf.abs(diff) 44 | smoothL1_sign = tf.stop_gradient(tf.to_float(tf.less(abs_diff, 
1. / sigma_2))) 45 | in_loss = tf.pow(diff, 2) * (sigma_2 / 2.) * smoothL1_sign \ 46 | + (abs_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign) 47 | loss = tf.div( tf.reduce_sum(in_loss), tf.reduce_sum(vertex_weights) + 1e-10 ) 48 | 49 | return loss 50 | 51 | def torch_smooth_l1_loss_vertex(vertex_pred, vertex_targets, vertex_weights, sigma=1.0, VERTEX_W=5.0): 52 | 53 | sigma_2 = sigma ** 2 54 | vertex_diff = vertex_pred - vertex_targets 55 | diff = vertex_weights * vertex_diff 56 | abs_diff = torch.abs(diff) 57 | smoothL1_sign = torch.tensor((abs_diff < 1. / sigma_2).float(), requires_grad=False) 58 | in_loss = torch.pow(diff, 2) * (sigma_2 / 2.) * smoothL1_sign \ 59 | + (abs_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign) 60 | loss = torch.div(torch.sum(in_loss), torch.sum(vertex_weights) + 1e-10 ) 61 | loss = VERTEX_W * torch.tensor(loss, requires_grad=True) 62 | 63 | return loss 64 | 65 | ########################################################################### 66 | """ 67 | logits should be domain_score, labels should be label_domain 68 | """ 69 | 70 | def torch_loss_domain(logits, labels, ADAPT_WEIGHT=0.1): 71 | 72 | loss = ADAPT_WEIGHT * torch.mean(F.nll_loss(F.softmax(logits), labels)) 73 | loss = torch.tensor(loss, requires_grad=True) 74 | 75 | return loss 76 | 77 | ########################################################################### 78 | 79 | 80 | -------------------------------------------------------------------------------- /lib/roi_pool_pytorch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | 6 | 7 | class RoIPool(nn.Module): 8 | def __init__(self, pooled_height, pooled_width, spatial_scale): 9 | super(RoIPool, self).__init__() 10 | self.pooled_width = int(pooled_width) 11 | self.pooled_height = int(pooled_height) 12 | self.spatial_scale = float(spatial_scale) 13 | 14 | def forward(self, features, rois): 15 | """ 16 | features shape is (batch_size, num_channels, img_height, img_width) 17 | rois shape is (num_rois, 5), where the first index is batch index, the last 4 indexes are the coordinate 18 | of the upper left corner and the lower right corner 19 | spatial scale should be like 1/16, 1/8, etc. 
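        Each row of rois is therefore [batch_idx, x1, y1, x2, y2] in input-image pixel
        coordinates, and the returned tensor has shape
        (num_rois, num_channels, pooled_height, pooled_width).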
20 | """ 21 | 22 | batch_size, num_channels, data_height, data_width = features.size() 23 | num_rois = rois.size()[0] 24 | outputs = Variable(torch.zeros(num_rois, num_channels, self.pooled_height, self.pooled_width)).cuda() 25 | 26 | for roi_idx, roi in enumerate(rois): 27 | batch_idx = int(roi[0]) 28 | if batch_idx > batch_size - 1: 29 | raise ValueError("Batch index out of range!") 30 | upleft_x, upleft_y, downright_x, downright_y = np.round(roi[1:].cpu().numpy() * self.spatial_scale).astype(int) 31 | roi_width = max(downright_x - upleft_x + 1, 1) 32 | roi_height = max(downright_y - upleft_y + 1, 1) 33 | bin_size_w = float(roi_width) / float(self.pooled_width) 34 | bin_size_h = float(roi_height) / float(self.pooled_height) 35 | 36 | for ph in range(self.pooled_height): 37 | hstart = int(np.floor(ph * bin_size_h)) 38 | hend = int(np.ceil((ph + 1) * bin_size_h)) 39 | hstart = min(data_height, max(0, hstart + upleft_y)) 40 | hend = min(data_height, max(0, hend + upleft_y)) 41 | 42 | for pw in range(self.pooled_width): 43 | wstart = int(np.floor(pw * bin_size_w)) 44 | wend = int(np.ceil((pw + 1) * bin_size_w)) 45 | wstart = min(data_width, max(0, wstart + upleft_x)) 46 | wend = min(data_width, max(0, wend + upleft_x)) 47 | is_error = (hend <= hstart) or (wend <= wstart) 48 | 49 | if is_error: 50 | outputs[roi_idx, :, ph, pw] = 0 51 | 52 | else: 53 | data = features[batch_idx] 54 | outputs[roi_idx, :, ph, pw] = torch.max(torch.max(data[:, hstart:hend, wstart:wend], dim=1)[0], dim=2)[0].view(-1) 55 | 56 | return outputs 57 | 58 | -------------------------------------------------------------------------------- /lib/vgg16_convs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 12 13:41:56 2019 4 | 5 | @author: Junzhe Xu 6 | """ 7 | 8 | import torch 9 | import torchvision 10 | import torchvision.transforms as transforms 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | import time 15 | #from networks.network import Network 16 | from lib.HoughVoting.houghvoting import * 17 | 18 | 19 | class vgg16_convs(nn.Module): 20 | def __init__(self, input_format, num_classes, num_units, scales, threshold_label, vote_threshold, 21 | vertex_reg_2d=False, vertex_reg_3d=False, vertex_reg_hough_in = False, pose_reg=False, 22 | adaptation=False, trainable=True, is_train=True): 23 | super(vgg16_convs, self).__init__() 24 | 25 | self.inputs = [] 26 | self.input_format = input_format 27 | self.num_classes = num_classes 28 | self.num_units = num_units 29 | self.scale = 1.0 30 | self.threshold_label = threshold_label 31 | self.vertex_reg_2d = vertex_reg_2d 32 | self.vertex_reg_3d = vertex_reg_3d 33 | self.vertex_reg = vertex_reg_2d or vertex_reg_3d 34 | self.vertex_reg_hough = vertex_reg_hough_in 35 | self.pose_reg = pose_reg 36 | self.adaptation = adaptation 37 | self.trainable = trainable 38 | 39 | # if vote_threshold < 0, only detect single instance (default). 
40 | # Otherwise, multiple instances are detected if hough voting score larger than the threshold 41 | 42 | if is_train: 43 | self.is_train = 1 44 | self.skip_pixels = 10 45 | self.vote_threshold = vote_threshold 46 | self.vote_percentage = 0.02 47 | else: 48 | self.is_train = 0 49 | self.skip_pixels = 10 50 | self.vote_threshold = vote_threshold 51 | self.vote_percentage = 0.02 52 | 53 | 54 | 55 | # VGG-16 for feature extraction 56 | self.conv1_1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1) 57 | self.conv1_2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1) 58 | self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/2 of the origin image 59 | 60 | self.conv2_1 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1) 61 | self.conv2_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1) 62 | self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/4 of the origin image 63 | 64 | self.conv3_1 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1) 65 | self.conv3_2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 66 | self.conv3_3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 67 | self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/8 of the origin image 68 | 69 | self.conv4_1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1) 70 | self.conv4_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 71 | self.conv4_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 72 | self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/16 of the origin image 73 | 74 | self.conv5_1 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 75 | self.conv5_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 76 | self.conv5_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 77 | 78 | # If input format is RGBD we use another network 79 | """ 80 | self.conv1_1_p = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1) 81 | self.conv1_2_p = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1) 82 | self.poo1_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 83 | 84 | self.conv2_1_p = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1) 85 | self.conv2_2_p = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1) 86 | self.pool2_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 87 | 88 | self.conv3_1_p = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1) 89 | self.conv3_2_p = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 90 | self.conv3_3_p = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 91 | self.pool3_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 92 | 93 | self.conv4_1_p = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1) 94 | self.conv4_2_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 95 | self.conv4_3_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 96 | self.pool4_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 97 | 98 | self.conv5_1_p = 
nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 99 | self.conv5_2_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 100 | self.conv5_3_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 101 | """ 102 | 103 | 104 | # For combination layer 105 | # For semantic segmentation 106 | self.conv6_seman_a = nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3, stride=1, padding=1) 107 | self.conv6_seman_b = nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3, stride=1, padding=1) 108 | self.dconv6_seman_a = nn.ConvTranspose2d(in_channels=64, out_channels=64, 109 | kernel_size=4, stride=2, padding=1, output_padding=0) 110 | self.dropout = nn.Dropout2d() 111 | self.dconv7_seman = nn.ConvTranspose2d(in_channels=64, out_channels=64, 112 | kernel_size=16, stride=8, padding=4, output_padding=0) 113 | self.conv8_seman = nn.Conv2d(in_channels=64, out_channels=self.num_classes, kernel_size=3, stride=1, padding=1) 114 | 115 | 116 | # For Center estimation 117 | self.conv6_center_a = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1) 118 | self.conv6_center_b = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1) 119 | self.dconv6_center_a = nn.ConvTranspose2d(in_channels=128, out_channels=128, 120 | kernel_size=4, stride=2, padding=1, output_padding=0) 121 | self.dconv7_center = nn.ConvTranspose2d(in_channels=128, out_channels=128, 122 | kernel_size=16, stride=8, padding=4, output_padding=0) 123 | self.conv8_center = nn.Conv2d(in_channels=128, out_channels=3 * self.num_classes, kernel_size=3, stride=1, padding=1) 124 | 125 | 126 | self.relu = nn.ReLU() 127 | 128 | self.hough_voting = HF() 129 | 130 | 131 | def conv_fun(self, x): 132 | x = self.relu(self.conv1_1(x)) 133 | x = self.relu(self.conv1_2(x)) 134 | x = self.pool1(x) # 1/2 of the original image 135 | 136 | x = self.relu(self.conv2_1(x)) 137 | x = self.relu(self.conv2_2(x)) 138 | x = self.pool2(x) # 1/4 of the original image 139 | 140 | x = self.relu(self.conv3_1(x)) 141 | x = self.relu(self.conv3_2(x)) 142 | x = self.relu(self.conv3_3(x)) 143 | x = self.pool3(x) # 1/8 of the original image 144 | 145 | x = self.relu(self.conv4_1(x)) 146 | x = self.relu(self.conv4_2(x)) 147 | f_conv4 = self.relu(self.conv4_3(x)) 148 | x = self.pool3(f_conv4) # 1/16 of the original image 149 | 150 | x = self.relu(self.conv5_1(x)) 151 | x = self.relu(self.conv5_2(x)) 152 | f_conv5 = self.relu(self.conv5_3(x)) 153 | x = self.pool3(f_conv5) # 1/32 of the original image 154 | return x, f_conv4, f_conv5 155 | 156 | 157 | def seman_net(self, f_conv4, f_conv5): 158 | x_a = self.dconv6_seman_a(self.conv6_seman_a(f_conv5)) 159 | x_b = self.conv6_seman_b(f_conv4) 160 | x = x_a + x_b 161 | x = self.dconv7_seman(x) 162 | x = self.conv8_seman(x) 163 | return x 164 | 165 | 166 | def center_net(self, f_conv4, f_conv5): 167 | x_a = self.dconv6_center_a(self.conv6_center_a(f_conv5)) 168 | x_b = self.conv6_center_b(f_conv4) 169 | x = x_a + x_b 170 | x = self.dconv7_center(x) 171 | x = self.conv8_center(x) 172 | return x 173 | 174 | 175 | def forward(self, x, extents, meta, gt_hough, is_train, device_cpu): 176 | x, f_conv4, f_conv5 = self.conv_fun(x) 177 | x_seman = self.seman_net(f_conv4, f_conv5) # the output of semantic segmentation 178 | 179 | if self.vertex_reg == True: 180 | x_center_dir = self.center_net(f_conv4, f_conv5) # the output of the center estimation 181 | x_center = None 182 | if self.vertex_reg_hough: 
183 | x_seman_single = torch.argmax(x_seman, dim = 1).type('torch.IntTensor') 184 | x_seman_single = x_seman_single.to(device_cpu) 185 | 186 | x_center_dir_hough = x_center_dir.permute(0,2,3,1) 187 | x_center_dir_hough = x_center_dir_hough.to(device_cpu) 188 | 189 | x_hough = self.hough_voting(x_seman_single, x_center_dir_hough, extents, meta, gt_hough, is_train) 190 | 191 | return x_seman, x_center_dir, x_hough 192 | else: 193 | return x_seman 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /lib/vgg16_convs_combine_mask.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 12 13:41:56 2019 4 | 5 | @author: Junzhe Xu 6 | """ 7 | 8 | import torch 9 | import torchvision 10 | import torchvision.transforms as transforms 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | import time 15 | #from networks.network import Network 16 | 17 | 18 | class vgg16_convs_comb_seg_center(nn.Module): 19 | def __init__(self, input_format, num_classes, num_units, scales, threshold_label, vote_threshold, 20 | vertex_reg_2d=False, vertex_reg_3d=False, combine_seg_center_in = False, pose_reg=False, 21 | adaptation=False, trainable=True, is_train=True): 22 | super(vgg16_convs_comb_seg_center, self).__init__() 23 | 24 | self.inputs = [] 25 | self.input_format = input_format 26 | self.num_classes = num_classes 27 | self.num_units = num_units 28 | self.scale = 1.0 29 | self.threshold_label = threshold_label 30 | self.vertex_reg_2d = vertex_reg_2d 31 | self.vertex_reg_3d = vertex_reg_3d 32 | self.vertex_reg = vertex_reg_2d or vertex_reg_3d 33 | self.combine_seg_center = combine_seg_center_in 34 | self.pose_reg = pose_reg 35 | self.adaptation = adaptation 36 | self.trainable = trainable 37 | 38 | # if vote_threshold < 0, only detect single instance (default). 
39 | # Otherwise, multiple instances are detected if hough voting score larger than the threshold 40 | 41 | if is_train: 42 | self.is_train = 1 43 | self.skip_pixels = 10 44 | self.vote_threshold = vote_threshold 45 | self.vote_percentage = 0.02 46 | else: 47 | self.is_train = 0 48 | self.skip_pixels = 10 49 | self.vote_threshold = vote_threshold 50 | self.vote_percentage = 0.02 51 | 52 | 53 | 54 | # VGG-16 for feature extraction 55 | self.conv1_1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1) 56 | self.conv1_2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1) 57 | self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/2 of the origin image 58 | 59 | self.conv2_1 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1) 60 | self.conv2_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1) 61 | self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/4 of the origin image 62 | 63 | self.conv3_1 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1) 64 | self.conv3_2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 65 | self.conv3_3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 66 | self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/8 of the origin image 67 | 68 | self.conv4_1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1) 69 | self.conv4_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 70 | self.conv4_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 71 | self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/16 of the origin image 72 | 73 | self.conv5_1 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 74 | self.conv5_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 75 | self.conv5_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 76 | 77 | # If input format is RGBD we use another network 78 | """ 79 | self.conv1_1_p = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1) 80 | self.conv1_2_p = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1) 81 | self.poo1_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 82 | 83 | self.conv2_1_p = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1) 84 | self.conv2_2_p = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1) 85 | self.pool2_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 86 | 87 | self.conv3_1_p = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1) 88 | self.conv3_2_p = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 89 | self.conv3_3_p = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 90 | self.pool3_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 91 | 92 | self.conv4_1_p = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1) 93 | self.conv4_2_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 94 | self.conv4_3_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 95 | self.pool4_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 96 | 97 | self.conv5_1_p = 
nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 98 | self.conv5_2_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 99 | self.conv5_3_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 100 | """ 101 | 102 | 103 | # For combination layer 104 | # For semantic segmentation 105 | self.conv6_seman_a = nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3, stride=1, padding=1) 106 | self.conv6_seman_b = nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3, stride=1, padding=1) 107 | self.dconv6_seman_a = nn.ConvTranspose2d(in_channels=64, out_channels=64, 108 | kernel_size=4, stride=2, padding=1, output_padding=0) 109 | self.dropout = nn.Dropout2d() 110 | self.dconv7_seman = nn.ConvTranspose2d(in_channels=64, out_channels=64, 111 | kernel_size=16, stride=8, padding=4, output_padding=0) 112 | self.conv8_seman = nn.Conv2d(in_channels=64, out_channels=self.num_classes, kernel_size=3, stride=1, padding=1) 113 | 114 | 115 | # For Center estimation 116 | self.conv6_center_a = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1) 117 | self.conv6_center_b = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1) 118 | self.dconv6_center_a = nn.ConvTranspose2d(in_channels=128, out_channels=128, 119 | kernel_size=4, stride=2, padding=1, output_padding=0) 120 | self.dconv7_center = nn.ConvTranspose2d(in_channels=128, out_channels=128, 121 | kernel_size=16, stride=8, padding=4, output_padding=0) 122 | self.conv8_center = nn.Conv2d(in_channels=128, out_channels=3 * self.num_classes, kernel_size=3, stride=1, padding=1) 123 | 124 | 125 | # Combine seg with center estimation 126 | self.dconv7_center_comb = nn.ConvTranspose2d(in_channels=192, out_channels=192, 127 | kernel_size=16, stride=8, padding=4, output_padding=0) 128 | self.conv8_center_comb = nn.Conv2d(in_channels=192, out_channels=3 * self.num_classes, kernel_size=3, stride=1, padding=1) 129 | 130 | 131 | self.relu = nn.ReLU() 132 | 133 | 134 | def conv_fun(self, x): 135 | x = self.relu(self.conv1_1(x)) 136 | x = self.relu(self.conv1_2(x)) 137 | x = self.pool1(x) # 1/2 of the original image 138 | 139 | x = self.relu(self.conv2_1(x)) 140 | x = self.relu(self.conv2_2(x)) 141 | x = self.pool2(x) # 1/4 of the original image 142 | 143 | x = self.relu(self.conv3_1(x)) 144 | x = self.relu(self.conv3_2(x)) 145 | x = self.relu(self.conv3_3(x)) 146 | x = self.pool3(x) # 1/8 of the original image 147 | 148 | x = self.relu(self.conv4_1(x)) 149 | x = self.relu(self.conv4_2(x)) 150 | f_conv4 = self.relu(self.conv4_3(x)) 151 | x = self.pool3(f_conv4) # 1/16 of the original image 152 | 153 | x = self.relu(self.conv5_1(x)) 154 | x = self.relu(self.conv5_2(x)) 155 | f_conv5 = self.relu(self.conv5_3(x)) 156 | x = self.pool3(f_conv5) # 1/32 of the original image 157 | return x, f_conv4, f_conv5 158 | 159 | def seman_net(self, f_conv4, f_conv5): 160 | x_a = self.dconv6_seman_a(self.conv6_seman_a(f_conv5)) 161 | x_b = self.conv6_seman_b(f_conv4) 162 | x_mid = x_a + x_b 163 | x = self.dconv7_seman(x_mid) 164 | x = self.conv8_seman(x) 165 | return x, x_mid 166 | 167 | def center_net(self, f_conv4, f_conv5): 168 | x_a = self.dconv6_center_a(self.conv6_center_a(f_conv5)) 169 | x_b = self.conv6_center_b(f_conv4) 170 | x = x_a + x_b 171 | x = self.dconv7_center(x) 172 | x = self.conv8_center(x) 173 | return x 174 | 175 | 176 | def combine_seg_center_net(self, x_seg_mid, f_conv4, f_conv5): 177 | # the input shape of 
x_seg is batch X channel X ... 178 | # the channel is the # class 179 | x_a = self.dconv6_center_a(self.conv6_center_a(f_conv5)) 180 | x_b = self.conv6_center_b(f_conv4) 181 | x = x_a + x_b 182 | x = torch.cat((x, x_seg_mid), 1) # channel becomes 192 183 | x = self.dconv7_center_comb(x) 184 | x = self.conv8_center_comb(x) 185 | return x 186 | 187 | 188 | 189 | def forward(self, x): 190 | x, f_conv4, f_conv5 = self.conv_fun(x) 191 | x_seman, x_seg_mid = self.seman_net(f_conv4, f_conv5) # the output of semantic segmentation 192 | 193 | if self.vertex_reg == True: 194 | if self.combine_seg_center: 195 | x_center = self.combine_seg_center_net(x_seg_mid, f_conv4, f_conv5) 196 | else: 197 | x_center = self.center_net(f_conv4, f_conv5) # the output of the center estimation 198 | return x_seman, x_center 199 | 200 | else: 201 | return x_seman 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /lib/vgg16_convs_combine_seg_center.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 12 13:41:56 2019 4 | 5 | @author: Junzhe Xu 6 | """ 7 | 8 | import torch 9 | import torchvision 10 | import torchvision.transforms as transforms 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | import time 15 | #from networks.network import Network 16 | 17 | 18 | class vgg16_convs_comb_seg_center(nn.Module): 19 | def __init__(self, input_format, num_classes, num_units, scales, threshold_label, vote_threshold, 20 | vertex_reg_2d=False, vertex_reg_3d=False, combine_seg_center_in = False, pose_reg=False, 21 | adaptation=False, trainable=True, is_train=True): 22 | super(vgg16_convs_comb_seg_center, self).__init__() 23 | 24 | self.inputs = [] 25 | self.input_format = input_format 26 | self.num_classes = num_classes 27 | self.num_units = num_units 28 | self.scale = 1.0 29 | self.threshold_label = threshold_label 30 | self.vertex_reg_2d = vertex_reg_2d 31 | self.vertex_reg_3d = vertex_reg_3d 32 | self.vertex_reg = vertex_reg_2d or vertex_reg_3d 33 | self.combine_seg_center = combine_seg_center_in 34 | self.pose_reg = pose_reg 35 | self.adaptation = adaptation 36 | self.trainable = trainable 37 | 38 | # if vote_threshold < 0, only detect single instance (default). 
39 | # Otherwise, multiple instances are detected if hough voting score larger than the threshold 40 | 41 | if is_train: 42 | self.is_train = 1 43 | self.skip_pixels = 10 44 | self.vote_threshold = vote_threshold 45 | self.vote_percentage = 0.02 46 | else: 47 | self.is_train = 0 48 | self.skip_pixels = 10 49 | self.vote_threshold = vote_threshold 50 | self.vote_percentage = 0.02 51 | 52 | 53 | 54 | # VGG-16 for feature extraction 55 | self.conv1_1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1) 56 | self.conv1_2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1) 57 | self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/2 of the origin image 58 | 59 | self.conv2_1 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1) 60 | self.conv2_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1) 61 | self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/4 of the origin image 62 | 63 | self.conv3_1 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1) 64 | self.conv3_2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 65 | self.conv3_3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 66 | self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/8 of the origin image 67 | 68 | self.conv4_1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1) 69 | self.conv4_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 70 | self.conv4_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 71 | self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) # 1/16 of the origin image 72 | 73 | self.conv5_1 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 74 | self.conv5_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 75 | self.conv5_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 76 | 77 | # If input format is RGBD we use another network 78 | """ 79 | self.conv1_1_p = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1) 80 | self.conv1_2_p = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1) 81 | self.poo1_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 82 | 83 | self.conv2_1_p = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1) 84 | self.conv2_2_p = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1) 85 | self.pool2_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 86 | 87 | self.conv3_1_p = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1) 88 | self.conv3_2_p = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 89 | self.conv3_3_p = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) 90 | self.pool3_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 91 | 92 | self.conv4_1_p = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1) 93 | self.conv4_2_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 94 | self.conv4_3_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 95 | self.pool4_p = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 96 | 97 | self.conv5_1_p = 
nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 98 | self.conv5_2_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 99 | self.conv5_3_p = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) 100 | """ 101 | 102 | 103 | # For combination layer 104 | # For semantic segmentation 105 | self.conv6_seman_a = nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3, stride=1, padding=1) 106 | self.conv6_seman_b = nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3, stride=1, padding=1) 107 | self.dconv6_seman_a = nn.ConvTranspose2d(in_channels=64, out_channels=64, 108 | kernel_size=4, stride=2, padding=1, output_padding=0) 109 | self.dropout = nn.Dropout2d() 110 | self.dconv7_seman = nn.ConvTranspose2d(in_channels=64, out_channels=64, 111 | kernel_size=16, stride=8, padding=4, output_padding=0) 112 | self.conv8_seman = nn.Conv2d(in_channels=64, out_channels=self.num_classes, kernel_size=3, stride=1, padding=1) 113 | 114 | 115 | # For Center estimation 116 | self.conv6_center_a = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1) 117 | self.conv6_center_b = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1) 118 | self.dconv6_center_a = nn.ConvTranspose2d(in_channels=128, out_channels=128, 119 | kernel_size=4, stride=2, padding=1, output_padding=0) 120 | self.dconv7_center = nn.ConvTranspose2d(in_channels=128, out_channels=128, 121 | kernel_size=16, stride=8, padding=4, output_padding=0) 122 | self.conv8_center = nn.Conv2d(in_channels=128, out_channels=3 * self.num_classes, kernel_size=3, stride=1, padding=1) 123 | 124 | 125 | # Combine seg with center estimation 126 | self.dconv7_center_comb = nn.ConvTranspose2d(in_channels=192, out_channels=192, 127 | kernel_size=16, stride=8, padding=4, output_padding=0) 128 | self.conv8_center_comb = nn.Conv2d(in_channels=192, out_channels=3 * self.num_classes, kernel_size=3, stride=1, padding=1) 129 | 130 | 131 | self.relu = nn.ReLU() 132 | 133 | 134 | def conv_fun(self, x): 135 | x = self.relu(self.conv1_1(x)) 136 | x = self.relu(self.conv1_2(x)) 137 | x = self.pool1(x) # 1/2 of the original image 138 | 139 | x = self.relu(self.conv2_1(x)) 140 | x = self.relu(self.conv2_2(x)) 141 | x = self.pool2(x) # 1/4 of the original image 142 | 143 | x = self.relu(self.conv3_1(x)) 144 | x = self.relu(self.conv3_2(x)) 145 | x = self.relu(self.conv3_3(x)) 146 | x = self.pool3(x) # 1/8 of the original image 147 | 148 | x = self.relu(self.conv4_1(x)) 149 | x = self.relu(self.conv4_2(x)) 150 | f_conv4 = self.relu(self.conv4_3(x)) 151 | x = self.pool3(f_conv4) # 1/16 of the original image 152 | 153 | x = self.relu(self.conv5_1(x)) 154 | x = self.relu(self.conv5_2(x)) 155 | f_conv5 = self.relu(self.conv5_3(x)) 156 | x = self.pool3(f_conv5) # 1/32 of the original image 157 | return x, f_conv4, f_conv5 158 | 159 | def seman_net(self, f_conv4, f_conv5): 160 | x_a = self.dconv6_seman_a(self.conv6_seman_a(f_conv5)) 161 | x_b = self.conv6_seman_b(f_conv4) 162 | x_mid = x_a + x_b 163 | x = self.dconv7_seman(x_mid) 164 | x = self.conv8_seman(x) 165 | return x, x_mid 166 | 167 | def center_net(self, f_conv4, f_conv5): 168 | x_a = self.dconv6_center_a(self.conv6_center_a(f_conv5)) 169 | x_b = self.conv6_center_b(f_conv4) 170 | x = x_a + x_b 171 | x = self.dconv7_center(x) 172 | x = self.conv8_center(x) 173 | return x 174 | 175 | 176 | def combine_seg_center_net(self, x_seg_mid, f_conv4, f_conv5): 177 | # the input shape of 
x_seg is batch X channel X ... 178 | # the channel is the # class 179 | x_a = self.dconv6_center_a(self.conv6_center_a(f_conv5)) 180 | x_b = self.conv6_center_b(f_conv4) 181 | x = x_a + x_b 182 | x = torch.cat((x, x_seg_mid), 1) # channel becomes 192 183 | x = self.dconv7_center_comb(x) 184 | x = self.conv8_center_comb(x) 185 | return x 186 | 187 | 188 | 189 | def forward(self, x): 190 | x, f_conv4, f_conv5 = self.conv_fun(x) 191 | x_seman, x_seg_mid = self.seman_net(f_conv4, f_conv5) # the output of semantic segmentation 192 | 193 | if self.vertex_reg == True: 194 | if self.combine_seg_center: 195 | x_center = self.combine_seg_center_net(x_seg_mid, f_conv4, f_conv5) 196 | else: 197 | x_center = self.center_net(f_conv4, f_conv5) # the output of the center estimation 198 | return x_seman, x_center 199 | 200 | else: 201 | return x_seman 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /testpytorch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import HoughVoting 3 | print(torch.__version__) 4 | a = torch.Tensor([[1,2],[3,4]]) 5 | print (a) 6 | b = a.flatten() 7 | print(b) 8 | 9 | print(HoughVoting.forward) 10 | # help(HoughVoting.forward) 11 | -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # PoseCNN with pytorch 3 | # Author: university of michigan EECS442 4 | # -------------------------------------------------------- 5 | 6 | import os 7 | import random 8 | import time 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.parallel 13 | #import torch.backends.cudnn as cudnn 14 | import torch.optim as optim 15 | import torch.utils.data 16 | import torchvision.datasets as dset 17 | import torchvision.transforms as transforms 18 | import torchvision.utils as vutils 19 | from torch.autograd import Variable 20 | from tqdm import tqdm 21 | from eval_net import cal_AP 22 | import shutil 23 | import cv2 24 | import matplotlib.pyplot as plt 25 | import matplotlib.patches as patches 26 | from PIL import Image 27 | import pdb 28 | import copy 29 | 30 | #from datasets.ycb.dataset import PoseDataset as PoseDataset_ycb 31 | from datasets.linemod.dataset_posecnn import PoseDataset as PoseDataset_linemod 32 | from datasets.YCB.dataset import Posedataset as PoseDataset_ycb 33 | 34 | #from lib.network import PoseNet, PoseRefineNet 35 | #from lib.loss import Loss 36 | from lib.vgg16_convs import vgg16_convs 37 | from lib.vgg16_convs_combine_seg_center import vgg16_convs_comb_seg_center 38 | from lib.center_est_funcs import * 39 | 40 | class arguments(): 41 | def __init__(self): 42 | self.dataset = 'linemod' 43 | self.dataset_root = '/home/ubuntu/EECS442_CourseProject/datasets/linemod/Linemod_preprocessed' 44 | self.num_objects = 13 45 | 46 | self.flag_pretrained_vgg = False 47 | self.flag_pretrained = True 48 | self.path_pretrained = 'trained_model/pretrained-posecnn-linemod/checkpoint.pth.tar' 49 | # self.num_pretrain_param = 36 50 | # 26 for vgg part 36 for vgg+seg 51 | # 46 for vgg+seg+center 42 for vgg+seg+center(combine seg and center) 52 | self.num_pretrain_param_load = 46 53 | self.num_pretrain_param_freeze = 0 54 | 55 | self.save_model = True 56 | self.save_test_result = True 57 | self.save_train_result = True 58 | 
self.save_hough_result = True 59 | self.color = [(255, 255, 255), (0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), 60 | (255, 0, 255), (0, 255, 255), (128, 0, 0), (0, 128, 0), (0, 0, 128), 61 | (128, 128, 0), (128, 0, 128), (0, 128, 128),(64, 0, 0), (0, 64, 0), (0, 0, 64)] 62 | self.arch = 'Semantic_Segmentation' 63 | self.gpu = True 64 | self.niter_print = 50 65 | self.nepoch_save = 1 66 | self.num_pretrain_param_vgg = 26 67 | 68 | self.batch_size = 4 69 | self.workers = 0 70 | 71 | self.lr = 2e-4 72 | 73 | self.iteration = 2 74 | self.nepoch = 5 75 | 76 | self.repeat_epoch = 1 77 | 78 | self.noise_trans = 0.03 79 | self.manualSeed = 0 80 | self.num_units = 10 81 | self.scales = 1 82 | self.threshold_label = 1 83 | self.vote_threshold = 1 84 | self.refine_start = False 85 | 86 | # FOR CENTER ESTIMATION 87 | self.train_single_frame = True 88 | self.vertex_reg = True 89 | self.vertex_reg_hough = True 90 | self.combine_seg_center = True 91 | self.combine_loss = True 92 | 93 | opt = arguments() 94 | 95 | def save_image_fun(img, name): 96 | if img.ndim == 3: 97 | img = np.transpose(img, (1,2,0)).astype(np.int) 98 | name = os.path.join('log', name+'.jpg') 99 | cv2.imwrite(name, img) 100 | 101 | def outputtoimg(output): 102 | if output.ndim == 2: 103 | return output 104 | else: 105 | img = np.argmax(output, axis = 0) 106 | return img 107 | 108 | def imgscale(img, scale, offset): 109 | global opt 110 | return img*scale + offset 111 | 112 | def save_image( images, labels, output_seg, vertex_targets, output_center, epoch, mode = 'train', i=0, index = 0): 113 | start_str_img = None 114 | start_str_class = None 115 | if mode == 'train': 116 | start_str_img = 'report_result/seg_center_dir/epoch{0}_iter{1}_image{2}'.format(epoch, i, index) 117 | start_str_class = 'report_result/seg_center_dir/epoch{0}_iter{1}_image{2}'.format(epoch, i, index) 118 | else: 119 | start_str_img = 'test/image{0}'.format(index) 120 | start_str_class = 'test/image{0}'.format(index) 121 | 122 | save_image_fun(images.cpu().numpy()[0], start_str_img) 123 | save_image_fun(imgscale(labels[0].cpu().detach().numpy(), 255//opt.num_objects, 0), start_str_img+'_seg_gt') 124 | save_image_fun(imgscale(outputtoimg(output_seg.cpu().detach().numpy()[0]), 255//opt.num_objects, 0), start_str_img+'_seg') 125 | if opt.vertex_reg: 126 | for j in range(1, opt.num_objects): 127 | save_image_fun(imgscale(vertex_targets[0, j*3].cpu().detach().numpy(), 100, 100), start_str_class + '_class{0}_vertex_x_gt'.format(j)) 128 | save_image_fun(imgscale(vertex_targets[0, j*3+1].cpu().detach().numpy(), 100, 100), start_str_class + '_class{0}_vertex_y_gt'.format(j)) 129 | save_image_fun(imgscale(vertex_targets[0, j*3+2].cpu().detach().numpy(), 0.1, 0), start_str_class + '_class{0}_vertex_depth_gt'.format(j)) 130 | 131 | save_image_fun(imgscale(output_center[0, j*3].cpu().detach().numpy(), 100, 100), start_str_class + '_class{0}_vertex_x'.format(j)) 132 | save_image_fun(imgscale(output_center[0, j*3+1].cpu().detach().numpy(), 100, 100), start_str_class + '_class{0}_vertex_y'.format(j)) 133 | save_image_fun(imgscale(output_center[0, j*3+2].cpu().detach().numpy(), 0.1, 0), start_str_class + '_class{0}_vertex_depth'.format(j)) 134 | # save the image for the inverse of the difference between gt and prediction 135 | 136 | 137 | def save_bbox_center(images, output_hough, epoch, i): 138 | index_bbox = 0 139 | output_hough = output_hough.cpu().numpy() 140 | images_plot = images.cpu().numpy() 141 | images_plot = images_plot.transpose(0,2,3,1).astype(np.uint8) 142 | 
for ii in range(images.shape[0]): 143 | img3 = copy.deepcopy(images_plot[ii]) 144 | for jj in range(index_bbox, output_hough.shape[0]): 145 | if output_hough[jj,0] != ii: 146 | break 147 | index_bbox +=1 148 | width = output_hough[jj,4] - output_hough[jj,2] 149 | height = output_hough[jj,5] - output_hough[jj,3] 150 | if width<4 or height<4: 151 | continue 152 | img2 = np.zeros((480,640,3), np.uint8) 153 | index_class = int(output_hough[jj,1]) 154 | cv2.rectangle(img2, (int(output_hough[jj,2]), int(output_hough[jj,3])), 155 | (int(output_hough[jj,4]), int(output_hough[jj,5])),opt.color[index_class],3) 156 | c_x = int(output_hough[jj,2] + 0.5*width) 157 | c_y = int(output_hough[jj,3] + 0.5*height) 158 | cv2.circle(img2, (c_x, c_y), 3, opt.color[index_class], 2) 159 | index = np.where(img2>0) 160 | img3[index] = img2[index] 161 | name = 'log/report_result/bbox_center/epoch{0}_iter{1}_image{2}_bboxs.png'.format(epoch, i, ii) 162 | cv2.imwrite(name, img3) 163 | 164 | 165 | 166 | def train(trainloader, net, criterion, criterion_center, optimizer, device, device_cpu): 167 | global opt 168 | loss_his = [] 169 | images = None 170 | labels = None 171 | vertex_targets = None 172 | vertex_weights = None 173 | extents = None 174 | meta = None 175 | gt_hough = None 176 | extrin_matrixs_gt= None 177 | bboxs_gt = None 178 | for epoch in range(opt.nepoch): #TODO decide epochs 179 | print('-----------------Epoch = %d-----------------' % (epoch+1)) 180 | if torch.cuda.is_available(): 181 | torch.cuda.empty_cache() 182 | net.train() 183 | start = time.time() 184 | running_loss = 0.0 185 | for i, data in enumerate(trainloader): 186 | optimizer.zero_grad() 187 | if opt.train_single_frame: 188 | # train the network part by part 189 | if opt.vertex_reg == True: 190 | # Only train the center-voting part 191 | images, labels, vertex_targets, vertex_weights, extents, meta, gt_hough, extrin_matrixs_gt, bboxs_gt = data 192 | images = images.to(device) 193 | labels = labels.type('torch.LongTensor').to(device) 194 | vertex_targets = vertex_targets.to(device) 195 | vertex_weights = vertex_weights.to(device) 196 | extents = extents.to(device_cpu) 197 | meta = meta.to(device_cpu) 198 | gt_hough = gt_hough.to(device_cpu) 199 | # change all the tensor type to CPU 200 | output_seg, output_center_dir, output_hough = net(images, extents, meta, gt_hough, 1, device_cpu) 201 | # print("This is the output of hough voting: ", output_hough.cpu().numpy()) 202 | loss_seg = criterion(output_seg, labels) 203 | loss_center = criterion_center(output_center_dir, vertex_targets) 204 | loss = loss_seg + loss_center 205 | else: 206 | # Only train the segmentation part 207 | images, labels = data 208 | images = images.to(device) 209 | labels = labels.type('torch.LongTensor').to(device) 210 | output_seg = net(images) 211 | loss = criterion(output_seg, labels) 212 | else: 213 | # from the begining to the end 214 | print('Empty for this part') 215 | loss = 0 216 | loss.backward() 217 | optimizer.step() 218 | running_loss += loss.item() 219 | if i %opt.niter_print == opt.niter_print-1: 220 | end = time.time() 221 | print('[epoch %d, iter %5d] loss: %.3f eplased time %.3f' % 222 | (epoch + 1, i + 1, running_loss / opt.niter_print, end-start)) 223 | start = time.time() 224 | loss_his.append(running_loss / opt.niter_print) 225 | running_loss = 0.0 226 | if opt.save_train_result and (epoch % opt.nepoch_save == 0 or epoch == opt.nepoch-1) and i%50 == 0: 227 | save_image(images, labels, output_seg, vertex_targets, output_center_dir, epoch, 'train', i, 
i//50) 228 | if opt.save_hough_result: 229 | save_bbox_center(images, output_hough, epoch, i) 230 | # pdb.set_trace() 231 | return loss_his 232 | 233 | 234 | def test(testloader, net, criterion, criterion_center, device, device_cpu): 235 | ''' 236 | Function for testing. 237 | ''' 238 | global opt 239 | losses = 0. 240 | cnt = 0 241 | cnt_image = 0 242 | with torch.no_grad(): 243 | net = net.eval() 244 | loss = 0.0 245 | vertex_targets = None 246 | output_center = None 247 | for data in tqdm(testloader): 248 | if opt.train_single_frame: 249 | if opt.vertex_reg == True: 250 | # Only train the center-voting part 251 | images, labels, vertex_targets, vertex_weights, extents, meta, gt_hough, extrin_matrixs_gt, bboxs_gt = data 252 | images = images.to(device) 253 | labels = labels.type('torch.LongTensor').to(device) 254 | vertex_targets = vertex_targets.to(device) 255 | vertex_weights = vertex_weights.to(device) 256 | extents = extents.to(device_cpu) 257 | meta = meta.to(device_cpu) 258 | gt_hough = gt_hough.to(device_cpu) 259 | output_seg, output_center_dir, output_center= net(images, extents, meta, gt_hough, 0, device_cpu) 260 | 261 | loss_seg = criterion(output_seg, labels) 262 | loss_center = criterion_center(output_center_dir, vertex_targets) 263 | loss_temp = loss_seg + loss_center 264 | loss += loss_temp.item() 265 | else: 266 | # Only train the segmentation part 267 | images, labels = data 268 | images = images.to(device) 269 | labels = labels.type('torch.LongTensor').to(device) 270 | output_seg = net(images) 271 | loss_temp = criterion(output_seg, labels) 272 | loss += loss_temp.item() 273 | else: 274 | # this part corresponding to the network is end to end 275 | # and only have one loss 276 | print('Empty for this part') 277 | loss = 0 278 | pass 279 | 280 | if opt.save_test_result and cnt%4 == 3: 281 | cnt_image+=1 282 | save_image(images, labels, output_seg, vertex_targets, output_center_dir, 0, 'test', i, cnt_image) 283 | 284 | cnt += 1 285 | print(loss / cnt) 286 | return (loss/cnt) 287 | 288 | 289 | 290 | def loadpretrain(net, pretrained_dic, device, num): 291 | pretrained_list = list(pretrained_dic.items()) 292 | net_dic = net.state_dict() 293 | net_dic_new = net_dic 294 | count = 0 295 | for k, v in net_dic.items(): 296 | name_temp, value_pretrained = pretrained_list[count] 297 | net_dic_new[k] = value_pretrained 298 | count+=1 299 | if count >= num: 300 | break 301 | return net.load_state_dict(net_dic_new) 302 | 303 | 304 | def save_checkpoint(state, is_best, filename='trained_model/checkpoint.pth.tar'): 305 | torch.save(state, filename) 306 | if is_best: 307 | shutil.copyfile(filename, 'trained_model/model_best.pth.tar') 308 | 309 | 310 | def main(): 311 | opt.manualSeed = random.randint(1, 10000) 312 | random.seed(opt.manualSeed) 313 | torch.manual_seed(opt.manualSeed) 314 | device_cpu = torch.device('cpu') 315 | if opt.gpu: 316 | if torch.cuda.is_available(): 317 | device = torch.device('cuda:0') 318 | else: 319 | device = torch.device('cpu') 320 | else: 321 | device = torch.device('cpu') 322 | if torch.cuda.is_available(): 323 | torch.cuda.empty_cache() 324 | torch.backends.cudnn.benchmark = True 325 | 326 | if opt.dataset == 'ycb': 327 | opt.num_objects = 21 #number of object classes in the dataset 328 | opt.num_points = 1000 #number of points on the input pointcloud 329 | opt.outf = 'trained_models/ycb' #folder to save trained models 330 | opt.log_dir = 'experiments/logs/ycb' #folder to save logs 331 | opt.repeat_epoch = 1 #number of repeat times for one epoch training 
332 | elif opt.dataset == 'linemod': 333 | opt.num_objects = 10 334 | opt.num_points = 500 335 | opt.outf = 'trained_models/linemod' 336 | opt.log_dir = 'experiments/logs/linemod' 337 | opt.repeat_epoch = 20 338 | else: 339 | print('Unknown dataset') 340 | return 341 | 342 | # check for the network mode 343 | if not opt.vertex_reg and opt.vertex_reg_hough: 344 | assert ValueError('Mode Incorrect') 345 | 346 | if opt.dataset == 'ycb': 347 | dataset = PoseDataset_ycb('train', opt.num_points, True, opt.dataset_root, 348 | opt.noise_trans, opt.refine_start) 349 | elif opt.dataset == 'linemod': 350 | dataset = PoseDataset_linemod('train', opt.num_points, True, opt.dataset_root, opt.noise_trans, 351 | opt.refine_start, False, True, opt.vertex_reg, opt.vertex_reg_hough) 352 | trainloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batch_size, 353 | shuffle=True, num_workers=opt.workers) 354 | 355 | 356 | if opt.dataset == 'ycb': 357 | test_dataset = PoseDataset_ycb('test', opt.num_points, False, opt.dataset_root, 358 | 0.0, opt.refine_start) 359 | elif opt.dataset == 'linemod': 360 | test_dataset = PoseDataset_linemod('test', opt.num_points, False, opt.dataset_root, 361 | 0.0, opt.refine_start, 362 | False, True, opt.vertex_reg, opt.vertex_reg_hough) 363 | testdataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, 364 | shuffle=False, num_workers=opt.workers) 365 | 366 | if opt.dataset == 'ycb': 367 | pass 368 | else: 369 | ap_data = PoseDataset_linemod('test', opt.num_points, False, opt.dataset_root, 0.0, 370 | opt.refine_start, True, True, opt.vertex_reg, opt.vertex_reg_hough) 371 | ap_loader = torch.utils.data.DataLoader(ap_data, batch_size=1, shuffle=False, num_workers=opt.workers) 372 | 373 | opt.sym_list = dataset.get_sym_list() 374 | opt.num_points_mesh = dataset.get_num_points_mesh() 375 | # print(opt.sym_list) 376 | # print('>>>>>>>>----------Dataset loaded!---------<<<<<<<<\nlength of the training set: {0}\nlength of the testing set: {1}\nnumber of sample points on mesh: {2}\nsymmetry object list: {3}'.format(len(dataset), len(test_dataset), opt.num_points_mesh, opt.sym_list)) 377 | 378 | 379 | # Network, optimizer and loss 380 | net = vgg16_convs(None, opt.num_objects, opt.num_objects, opt.scales, opt.threshold_label, 381 | opt.vote_threshold, opt.vertex_reg, opt.vertex_reg, opt.vertex_reg_hough) 382 | # net = vgg16_convs_comb_seg_center(None, opt.num_objects, opt.num_objects, opt.scales, opt.threshold_label, 383 | # opt.vote_threshold, opt.vertex_reg, opt.combine_seg_center) 384 | 385 | optimizer = optim.Adam(net.parameters(), lr = opt.lr) 386 | 387 | weight_class = torch.from_numpy(dataset.weight_clsss).type('torch.FloatTensor').to(device) 388 | # criterion = nn.CrossEntropyLoss(weight_class) 389 | criterion = nn.CrossEntropyLoss() 390 | criterion_center = nn.SmoothL1Loss() 391 | 392 | # Load pretrained model 393 | if opt.flag_pretrained and not opt.flag_pretrained_vgg: 394 | # load out model trained before as initialization to continue 395 | if os.path.isfile(opt.path_pretrained): 396 | print("=> Loading Checkpoint '{}'".format(opt.path_pretrained)) 397 | pre_trained = torch.load(opt.path_pretrained) 398 | net_dic = net.state_dict() 399 | net_dic_new = net_dic 400 | pretrained_dic = pre_trained['state_dict'] 401 | pretrained_list = list(pretrained_dic.items()) 402 | # net.load_state_dict() 403 | if opt.num_pretrain_param_load > 0: 404 | count = 0 405 | for k, v in net_dic.items(): 406 | if count >= opt.num_pretrain_param_load: 407 | break 408 | name_temp, 
value_pretrained = pretrained_list[count] 409 | if opt.gpu: 410 | net_dic_new[k] = value_pretrained 411 | else: 412 | net_dic_new[k] = value_pretrained.cpu() 413 | count+=1 414 | 415 | 416 | net.load_state_dict(net_dic_new) 417 | """ 418 | optimizer.load_state_dict(pre_trained['optimizer']) 419 | for state in optimizer.state.values(): 420 | for k, v in state.items(): 421 | if torch.cuda.is_available: 422 | if isinstance(v, torch.Tensor): 423 | state[k] = v.cuda() 424 | """ 425 | print("=> Loaded Checkpoint '{}'".format(opt.path_pretrained)) 426 | else: 427 | raise ValueError("No pretrained model found at {}".format(opt.path_pretrained)) 428 | 429 | count = 0 430 | for param in net.parameters(): 431 | if count >= opt.num_pretrain_param_freeze: 432 | break 433 | param.requires_grad = False 434 | count+=1 435 | elif not opt.flag_pretrained and opt.flag_pretrained_vgg: 436 | # load the pretrained weights of the VGG16 net 437 | # 'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth' 438 | pretrained_dic = torch.load('pretrained_model/vgg16-397923af.pth') 439 | pretrained_list = list(pretrained_dic.items()) 440 | net_dic = net.state_dict() 441 | net_dic_new = net_dic 442 | count = 0 443 | for k, v in net_dic.items(): 444 | name_temp, value_pretrained = pretrained_list[count] 445 | net_dic_new[k] = value_pretrained 446 | count+=1 447 | if count >= opt.num_pretrain_param_vgg: 448 | break 449 | net.load_state_dict(net_dic_new) 450 | count = 0 451 | for param in net.parameters(): 452 | param.requires_grad = False 453 | count+=1 454 | if count>=opt.num_pretrain_param_vgg: 455 | break 456 | elif not opt.flag_pretrained and not opt.flag_pretrained_vgg: 457 | print('No pretrained model loaded') 458 | else: 459 | print('Conflicting flags: flag_pretrained and flag_pretrained_vgg cannot both be True') 460 | 461 | 462 | net.to(device) 463 | loss_his = [] 464 | loss_his = train(trainloader, net, criterion, criterion_center, optimizer, device, device_cpu) 465 | 466 | print('>>>>>>>>----------Training Finished!---------<<<<<<<<') 467 | 468 | test_loss = 0 469 | test_loss = test(testdataloader, net, criterion, criterion_center, device, device_cpu) 470 | 471 | print('>>>>>>>>----------AP---------<<<<<<<<') 472 | aps = None 473 | # if opt.train_single_frame: 474 | # aps = cal_AP(ap_loader, net, criterion, device, opt.num_objects, opt) 475 | # aps = np.array(aps) 476 | # print('Final mean AP : {}'.format(np.mean(aps))) 477 | 478 | print('>>>>>>>>----------Save the model weights!---------<<<<<<<<') 479 | if opt.save_model: 480 | # save the trained model 481 | save_checkpoint({ 482 | 'epoch': opt.nepoch, 483 | 'arch': opt.arch, 484 | 'state_dict': net.state_dict(), 485 | 'test_loss': test_loss, 486 | 'aps': aps, 487 | 'optimizer' : optimizer.state_dict(), 488 | }, False) 489 | 490 | print('>>>>>>>>----------Loss History---------<<<<<<<<') 491 | np.save('log/loss/loss', np.array(loss_his)) 492 | plt.figure() 493 | plt.plot(loss_his) 494 | plt.savefig('/home/ubuntu/EECS442_CourseProject/log/loss/unfreeze_seg_ctr.png') 495 | # plt.savefig('log/loss/loss.png') 496 | plt.show() 497 | 498 | 499 | if __name__ == '__main__': 500 | main() 501 | --------------------------------------------------------------------------------
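The segmentation and center heads above share the same FCN-style fusion: conv5 features at 1/16 resolution are upsampled by the stride-2 transposed convolution, added to the conv4 features at 1/8 resolution, and the sum is upsampled back to the input resolution by the stride-8 transposed convolution. The sketch below only checks the spatial arithmetic of those two layers; the 64-channel dummy tensors, the 480x640 input size (the LINEMOD image size used by the dataset loader) and the variable names are illustrative assumptions, not code from the repository.

```python
import torch
import torch.nn as nn

# Output size of ConvTranspose2d (dilation=1, output_padding=0):
#   H_out = (H_in - 1) * stride - 2 * padding + kernel_size
up2 = nn.ConvTranspose2d(64, 64, kernel_size=4, stride=2, padding=1)   # exact x2, as in dconv6_seman_a
up8 = nn.ConvTranspose2d(64, 64, kernel_size=16, stride=8, padding=4)  # exact x8, as in dconv7_seman

f_conv5 = torch.randn(1, 64, 30, 40)   # 1/16 of a 480x640 image, after the 3x3 channel-reduction conv
f_conv4 = torch.randn(1, 64, 60, 80)   # 1/8 of a 480x640 image, after the 3x3 channel-reduction conv

fused = up2(f_conv5) + f_conv4         # both operands are 60x80 after the x2 upsampling
print(fused.shape)                     # torch.Size([1, 64, 60, 80])
print(up8(fused).shape)                # torch.Size([1, 64, 480, 640]) -> back to input resolution
```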
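In vgg16_convs.forward, the inputs to the custom HoughVoting layer are prepared by taking a per-pixel argmax over the segmentation logits and moving the channel axis of the center-direction map to the end. The snippet below reproduces only that tensor bookkeeping on random data; the batch size, the 480x640 resolution and num_classes = 10 (the LINEMOD setting in train_net.py) are assumptions, and the compiled HoughVoting extension itself is not called here.

```python
import torch

batch, num_classes, H, W = 2, 10, 480, 640
x_seman = torch.randn(batch, num_classes, H, W)             # segmentation logits from seman_net
x_center_dir = torch.randn(batch, 3 * num_classes, H, W)    # per-class (x, y, depth) maps from center_net

x_seman_single = torch.argmax(x_seman, dim=1).int()         # per-pixel class labels, shape (B, H, W)
x_center_dir_hough = x_center_dir.permute(0, 2, 3, 1)       # NCHW -> NHWC, shape (B, H, W, 3 * num_classes)

print(x_seman_single.shape)       # torch.Size([2, 480, 640])
print(x_center_dir_hough.shape)   # torch.Size([2, 480, 640, 30])
# Both tensors are then moved to the CPU before being handed to the Hough-voting layer.
```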
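combine_seg_center_net concatenates the 128-channel center features with the 64-channel intermediate segmentation features (x_seg_mid) along the channel axis, which is why dconv7_center_comb and conv8_center_comb are declared with 192 input channels. A minimal shape check, assuming the 60x80 feature-map size that a 480x640 input produces at 1/8 resolution:

```python
import torch

center_feat = torch.randn(1, 128, 60, 80)   # fused center-branch features at 1/8 resolution
seg_mid = torch.randn(1, 64, 60, 80)        # x_seg_mid from seman_net at the same resolution

combined = torch.cat((center_feat, seg_mid), dim=1)
print(combined.shape)                        # torch.Size([1, 192, 60, 80]) -> matches in_channels=192
```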
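conv8_center (and its _comb variant) emits 3 * num_classes channels, and save_image in train_net.py reads them back in groups of three per class: the x component of the center direction, the y component, and the regressed depth. The slicing below illustrates that layout on a random tensor; num_classes = 10 and the chosen class index are assumptions made for the example.

```python
import torch

num_classes = 10
output_center = torch.randn(1, 3 * num_classes, 480, 640)   # same layout as vertex_targets

j = 2                                   # pick one class index
vx = output_center[0, 3 * j]            # x component of the center-direction field
vy = output_center[0, 3 * j + 1]        # y component
vz = output_center[0, 3 * j + 2]        # regressed depth
print(vx.shape, vy.shape, vz.shape)     # three 480x640 maps for this class
```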
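Both loadpretrain and the checkpoint-loading branch in main copy the first num_pretrain_param entries of a pretrained state_dict into the network purely by position, relying on the shared VGG backbone tensors appearing first and in the same order in both models. The helper name load_first_n and the two toy Sequential models below are illustrative assumptions; the repository's own code inlines this loop over the real networks.

```python
import torch
import torch.nn as nn

def load_first_n(net, pretrained_state, n):
    """Copy the first n tensors of pretrained_state into net, matched by position."""
    pretrained_items = list(pretrained_state.items())
    new_state = net.state_dict()
    for count, key in enumerate(new_state):
        if count >= n:
            break
        _, value = pretrained_items[count]   # the source key name is ignored; order must match
        new_state[key] = value
    net.load_state_dict(new_state)

# Two toy models whose first conv layer has identical shapes (stand-ins for the shared VGG stem).
src = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU(), nn.Conv2d(8, 8, 3))
dst = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU(), nn.Conv2d(8, 16, 3))

load_first_n(dst, src.state_dict(), n=2)            # copy conv1 weight and bias only
print(torch.equal(dst[0].weight, src[0].weight))    # True: first layer now carries the pretrained values
```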