├── assert
│   └── demo.gif
├── .gitignore
├── requirement.txt
├── config.py
├── README.md
├── dataset.py
├── cuboid_pnp_solver.py
├── cuboid.py
├── train.py
├── eval.py
├── demo.py
├── utils.py
└── network
    └── rtpose_vgg.py
/assert/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chinancheng/DOPE.pytorch/HEAD/assert/demo.gif
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | /data/*
3 | /temp/*
4 | /output/*
5 | /model/*
6 | /logs/*
7 | /__pycache__
8 | /network/__pycache__
9 | 
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | opencv-python
2 | torch==0.4.1
3 | torchvision==0.2.1
4 | numpy
5 | pyrr
6 | tensorboardX
7 | argparse
8 | scipy
9 | tqdm
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | class Config(object):
2 | 
3 |     stride = 8
4 |     crop_size = 256
5 |     sigma = 5.0
6 |     test_sigma = 0.1
7 |     thresh_map = 0.1
8 |     thresh_points = 0.5
9 |     thresh_angle = 0.1
10 |     grid = crop_size / stride
11 |     vec_width = 3.0
12 |     output_stage = 6
13 |     threshold = 0.1
14 |     @classmethod
15 |     def describe(cls):
16 |         text = 'Config:\n'
17 |         attrs = [attr for attr in dir(cls) if not callable(getattr(cls, attr)) and not attr.startswith('__')]
18 |         for attr in attrs:
19 |             text += '\t{:s} = {:s}\n'.format(attr, str(getattr(cls, attr)))
20 |         return text
21 | 
22 | 
23 | if __name__ == '__main__':
24 |     def main():
25 |         print(Config.describe())
26 |     main()
27 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DOPE.pytorch
2 | This is an unofficial implementation of **[DOPE (Deep Object Pose Estimation for Semantic Robotic Grasping of Household Objects)](https://arxiv.org/abs/1809.10790)**, trained on a self-created synthetic bottle dataset.
3 | 
4 | ![](./assert/demo.gif)
5 | 
6 | ## Requirement
7 | ```
8 | $ conda create -n DOPE python=3.6
9 | $ conda activate DOPE
10 | $ pip install -r requirement.txt
11 | ```
12 | 
13 | ## Usage
14 | ### Train and Eval
15 | [WIP]
16 | 
17 | ### Demo
18 | * Logs
19 | [[Download](https://drive.google.com/drive/folders/1DtJ_yOZ946_tn-zGtEIdp2IkRRbtFzza?usp=sharing)]
20 | ```
21 | DOPE.pytorch
22 | - logs
23 |   - Jack_Daniels-checkpoint.pth
24 |   - Jose_Cuervo-checkpoint.pth
25 | ```
26 | 
27 | 
28 | * Data
29 | ```
30 | DOPE.pytorch
31 | - data
32 |   - Real_bottle_sequence
33 |     - 000001.jpg
34 |     - 000002.jpg
35 |     ...
36 |     - _object_settings.json
37 |     - _camera_settings.json
38 | 
39 | ```
40 | * Run
41 | ```
42 | python demo.py \
43 |     --path_to_data_dir ./data/Real_bottle_sequence \
44 |     --class_name Jack_Daniels \
45 |     --checkpoint ./logs/Jack_Daniels-checkpoint.pth \
46 |     --plot
47 | ```
48 | 
49 | ## Reference
50 | * DOPE (Deep Object Pose Estimation for Semantic Robotic Grasping of Household Objects) [[paper link](https://arxiv.org/abs/1809.10790)]
51 | * Some code is borrowed from the repositories below.
52 | * Official implementation from nvidia (inference code with ROS) [[Deep_Object_Pose](https://github.com/NVlabs/Deep_Object_Pose)] 53 | * Realtime_Multi-Person_Pose_Estimation [[pytorch_Realtime_Multi-Person_Pose_Estimation](https://github.com/tensorboy/pytorch_Realtime_Multi-Person_Pose_Estimation)] 54 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | from torch.utils.data import dataset 4 | import glob 5 | import os 6 | import cv2 7 | from config import Config 8 | from utils import * 9 | import glob 10 | 11 | 12 | class Dataset(dataset.Dataset): 13 | def __init__(self, path_to_data, class_name, split='train', img_prefix='png'): 14 | self.split = split 15 | self.img_prefix = img_prefix 16 | self._path_to_data = path_to_data 17 | self._path_to_sequences = glob.glob(os.path.join(path_to_data, split, '*.json')) 18 | self.class_name = class_name 19 | self._num_sequences = len(self._path_to_sequences) 20 | self.crop_size = Config.crop_size 21 | self.stride = Config.stride 22 | self.grid = self.crop_size / self.stride 23 | 24 | def __len__(self): 25 | return self._num_sequences 26 | 27 | def __getitem__(self, index): 28 | label_file_path = self._path_to_sequences[index] 29 | img_file_path = label_file_path.replace('json', self.img_prefix) 30 | vertices = [] 31 | locations = [] 32 | img = cv2.imread(img_file_path) 33 | img = crop(img) 34 | self.ratio = img.shape[0]/self.crop_size 35 | img = cv2.resize(img, (self.crop_size, self.crop_size)) 36 | with open(label_file_path) as f: 37 | labels = json.load(f) 38 | for label in labels['objects']: 39 | if label['class'].find(self.class_name) != -1: 40 | vertex = label['projected_cuboid'] 41 | centroid = label['projected_cuboid_centroid'] 42 | location = label['location'] 43 | vertex.append(centroid) 44 | vertices.append(vertex) 45 | locations.append(location) 46 | heatmaps, pafs = self.get_ground_truth(vertices) 47 | img = preprocess(img).float() 48 | heatmaps = heatmaps.transpose((2, 0, 1)) 49 | heatmaps = torch.tensor(heatmaps, dtype=torch.float) 50 | pafs = pafs.transpose((2, 0, 1)) 51 | pafs = torch.tensor(pafs, dtype=torch.float) 52 | if len(locations) != 0: 53 | locations = torch.tensor(locations, dtype=torch.float) 54 | 55 | return img, heatmaps, pafs, locations, self.ratio 56 | 57 | def get_ground_truth(self, vertices): 58 | heatmaps = np.zeros((int(self.grid), int(self.grid), 9)) 59 | pafs = np.zeros((int(self.grid), int(self.grid), 16)) 60 | for vertex in vertices: 61 | for idx, point in enumerate(vertex): 62 | point = tuple([int(i/self.ratio) for i in point]) 63 | center_point = tuple([int(i/self.ratio) for i in vertex[-1]]) 64 | 65 | gaussian_map = heatmaps[:, :, idx] 66 | if idx < 8: 67 | count = np.zeros((int(self.grid), int(self.grid)), dtype=np.uint32) 68 | if point[0] >= 0 and point[0] < Config.crop_size and point[1] >= 0 and point[1] < Config.crop_size: 69 | centerA = np.array(point) 70 | centerB = np.array(center_point) 71 | pafs[:, :, idx*2:idx*2+2], count = generate_vecmap(centerA, centerB, pafs[:, :, idx*2:idx*2+2], count) 72 | heatmaps[:, :, idx] = generate_gaussianmap(point, gaussian_map) 73 | 74 | return heatmaps, pafs 75 | 76 | 77 | if __name__ == '__main__': 78 | def main(): 79 | path_to_data_dir = '/media/external/Bottle_dataset_split' 80 | dataset = Dataset(path_to_data_dir) 81 | print('len(dataset):', len(dataset)) 82 | print(dataset[0]) 83 | main() 84 | 
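For orientation, a minimal sketch of what a single `Dataset` sample yields, assuming the split layout that `train.py` expects (`<data_dir>/train/*.json` label files with matching `.png` images); the dataset path below is illustrative only, not a path shipped with the repository.

```python
from torch.utils.data import DataLoader

from config import Config
from dataset import Dataset

# hypothetical dataset root; point this at your own synthetic-bottle export
ds = Dataset(path_to_data='./data/Bottle_dataset_split', class_name='Jack_Daniels',
             split='train', img_prefix='png')
img, heatmaps, pafs, locations, ratio = ds[0]

grid = int(Config.crop_size / Config.stride)   # 256 / 8 = 32
print(img.shape)        # torch.Size([3, 256, 256])  normalized, square-cropped RGB
print(heatmaps.shape)   # torch.Size([9, 32, 32])    belief maps at grid resolution
print(pafs.shape)       # torch.Size([16, 32, 32])   x/y vector fields at grid resolution

# the same object is consumed batch-wise by train.py
loader = DataLoader(ds, batch_size=4, shuffle=True, drop_last=True)
```

`heatmaps` holds one belief map per cuboid corner plus one for the centroid, and `pafs` holds an x/y vector field for each corner-to-centroid edge, both at the network's output stride of 8. `locations` stays an empty Python list when no instance of `class_name` appears in a frame, while the other outputs keep their fixed shapes.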
-------------------------------------------------------------------------------- /cuboid_pnp_solver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 NVIDIA Corporation. All rights reserved. 2 | # This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 3 | # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 4 | 5 | import numpy as np 6 | from pyrr import Quaternion 7 | import cv2 8 | from cuboid import * 9 | 10 | 11 | class CuboidPNPSolver(object): 12 | """ 13 | This class is used to find the 6-DoF pose of a cuboid given its projected vertices. 14 | 15 | Runs perspective-n-point (PNP) algorithm. 16 | """ 17 | 18 | # Class variables 19 | cv2version = cv2.__version__.split('.') 20 | cv2majorversion = int(cv2version[0]) 21 | 22 | def __init__(self, object_name="", camera_intrinsic_matrix = None, cuboid3d = None, 23 | dist_coeffs = np.zeros((4, 1))): 24 | self.object_name = object_name 25 | if (not camera_intrinsic_matrix is None): 26 | self._camera_intrinsic_matrix = camera_intrinsic_matrix 27 | else: 28 | camera_intrinsic_matrix = np.array([ 29 | [0, 0, 0], 30 | [0, 0, 0], 31 | [0, 0, 0] 32 | ]) 33 | self._cuboid3d = cuboid3d 34 | 35 | self._dist_coeffs = dist_coeffs 36 | 37 | def set_camera_intrinsic_matrix(self, new_intrinsic_matrix): 38 | '''Sets the camera intrinsic matrix''' 39 | self._camera_intrinsic_matrix = new_intrinsic_matrix 40 | 41 | def solve_pnp(self, cuboid2d_points, pnp_algorithm = None): 42 | """ 43 | Detects the rotation and traslation 44 | of a cuboid object from its vertexes' 45 | 2D location in the image 46 | """ 47 | 48 | # Fallback to default PNP algorithm base on OpenCV version 49 | if pnp_algorithm is None: 50 | if CuboidPNPSolver.cv2majorversion == 2: 51 | pnp_algorithm = cv2.CV_ITERATIVE 52 | elif CuboidPNPSolver.cv2majorversion == 3: 53 | pnp_algorithm = cv2.SOLVEPNP_ITERATIVE 54 | # Alternative algorithms: 55 | # pnp_algorithm = SOLVE_PNP_P3P 56 | # pnp_algorithm = SOLVE_PNP_EPNP 57 | 58 | location = None 59 | quaternion = None 60 | projected_points = cuboid2d_points 61 | 62 | cuboid3d_points = np.array(self._cuboid3d.get_vertices()) 63 | obj_2d_points = [] 64 | obj_3d_points = [] 65 | 66 | for i in range(CuboidVertexType.TotalVertexCount): 67 | check_point_2d = cuboid2d_points[i] 68 | # Ignore invalid points 69 | if (check_point_2d is None): 70 | continue 71 | obj_2d_points.append(check_point_2d) 72 | obj_3d_points.append(cuboid3d_points[i]) 73 | 74 | obj_2d_points = np.array(obj_2d_points, dtype=float) 75 | obj_3d_points = np.array(obj_3d_points, dtype=float) 76 | 77 | valid_point_count = len(obj_2d_points) 78 | 79 | # Can only do PNP if we have more than 3 valid points 80 | is_points_valid = valid_point_count >= 4 81 | 82 | if is_points_valid: 83 | 84 | ret, rvec, tvec = cv2.solvePnP( 85 | obj_3d_points, 86 | obj_2d_points, 87 | self._camera_intrinsic_matrix, 88 | self._dist_coeffs, 89 | flags=pnp_algorithm 90 | ) 91 | 92 | if ret: 93 | location = list(x[0] for x in tvec) 94 | quaternion = self.convert_rvec_to_quaternion(rvec) 95 | 96 | projected_points, _ = cv2.projectPoints(cuboid3d_points, rvec, tvec, self._camera_intrinsic_matrix, self._dist_coeffs) 97 | projected_points = np.squeeze(projected_points) 98 | 99 | # If the location.Z is negative or object is behind the camera then flip both location and rotation 100 | x, y, z = location 101 | if z < 0: 102 | # Get the opposite location 103 | location = [-x, -y, -z] 104 | 105 | # Change 
the rotation by 180 degree 106 | rotate_angle = np.pi 107 | rotate_quaternion = Quaternion.from_axis_rotation(location, rotate_angle) 108 | quaternion = rotate_quaternion.cross(quaternion) 109 | 110 | return location, quaternion, projected_points 111 | 112 | def convert_rvec_to_quaternion(self, rvec): 113 | '''Convert rvec (which is log quaternion) to quaternion''' 114 | theta = np.sqrt(rvec[0] * rvec[0] + rvec[1] * rvec[1] + rvec[2] * rvec[2]) # in radians 115 | raxis = [rvec[0] / theta, rvec[1] / theta, rvec[2] / theta] 116 | 117 | # pyrr's Quaternion (order is XYZW), https://pyrr.readthedocs.io/en/latest/oo_api_quaternion.html 118 | return Quaternion.from_axis_rotation(raxis, theta) 119 | 120 | # Alternatively: pyquaternion 121 | # return Quaternion(axis=raxis, radians=theta) # uses OpenCV's Quaternion (order is WXYZ) 122 | 123 | def project_points(self, rvec, tvec): 124 | '''Project points from model onto image using rotation, translation''' 125 | output_points, tmp = cv2.projectPoints( 126 | self.__object_vertex_coordinates, 127 | rvec, 128 | tvec, 129 | self.__camera_intrinsic_matrix, 130 | self.__dist_coeffs) 131 | 132 | output_points = np.squeeze(output_points) 133 | 134 | return output_points 135 | -------------------------------------------------------------------------------- /cuboid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 NVIDIA Corporation. All rights reserved. 2 | # This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 3 | # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 4 | 5 | from enum import IntEnum, unique 6 | import numpy as np 7 | import cv2 8 | from pyrr import Quaternion, Matrix44, Vector3, euler 9 | 10 | # Related to the object's local coordinate system 11 | # @unique 12 | class CuboidVertexType(IntEnum): 13 | FrontTopRight = 0 14 | FrontTopLeft = 1 15 | FrontBottomLeft = 2 16 | FrontBottomRight = 3 17 | RearTopRight = 4 18 | RearTopLeft = 5 19 | RearBottomLeft = 6 20 | RearBottomRight = 7 21 | Center = 8 22 | TotalCornerVertexCount = 8 # Corner vertexes doesn't include the center point 23 | TotalVertexCount = 9 24 | 25 | # List of the vertex indexes in each line edges of the cuboid 26 | CuboidLineIndexes = [ 27 | # Front face 28 | [ CuboidVertexType.FrontTopLeft, CuboidVertexType.FrontTopRight ], 29 | [ CuboidVertexType.FrontTopRight, CuboidVertexType.FrontBottomRight ], 30 | [ CuboidVertexType.FrontBottomRight, CuboidVertexType.FrontBottomLeft ], 31 | [ CuboidVertexType.FrontBottomLeft, CuboidVertexType.FrontTopLeft ], 32 | # Back face 33 | [ CuboidVertexType.RearTopLeft, CuboidVertexType.RearTopRight ], 34 | [ CuboidVertexType.RearTopRight, CuboidVertexType.RearBottomRight ], 35 | [ CuboidVertexType.RearBottomRight, CuboidVertexType.RearBottomLeft ], 36 | [ CuboidVertexType.RearBottomLeft, CuboidVertexType.RearTopLeft ], 37 | # Left face 38 | [ CuboidVertexType.FrontBottomLeft, CuboidVertexType.RearBottomLeft ], 39 | [ CuboidVertexType.FrontTopLeft, CuboidVertexType.RearTopLeft ], 40 | # Right face 41 | [ CuboidVertexType.FrontBottomRight, CuboidVertexType.RearBottomRight ], 42 | [ CuboidVertexType.FrontTopRight, CuboidVertexType.RearTopRight ], 43 | ] 44 | 45 | 46 | # ========================= Cuboid3d ========================= 47 | class Cuboid3d(): 48 | '''This class contains a 3D cuboid.''' 49 | 50 | # Create a box with a certain size 51 | def __init__(self, size3d = [1.0, 1.0, 1.0], center_location = [0, 0, 0], 52 | 
coord_system = None, parent_object = None): 53 | 54 | # NOTE: This local coordinate system is similar 55 | # to the intrinsic transform matrix of a 3d object 56 | self.center_location = center_location 57 | self.coord_system = coord_system 58 | self.size3d = size3d 59 | self._vertices = [0, 0, 0] * CuboidVertexType.TotalVertexCount 60 | 61 | self.generate_vertexes() 62 | 63 | def get_vertex(self, vertex_type): 64 | """Returns the location of a vertex. 65 | 66 | Args: 67 | vertex_type: enum of type CuboidVertexType 68 | 69 | Returns: 70 | Numpy array(3) - Location of the vertex type in the cuboid 71 | """ 72 | return self._vertices[vertex_type] 73 | 74 | def get_vertices(self): 75 | return self._vertices 76 | 77 | def generate_vertexes(self): 78 | width, height, depth = self.size3d 79 | 80 | # By default just use the normal OpenCV coordinate system 81 | if (self.coord_system is None): 82 | cx, cy, cz = self.center_location 83 | # X axis point to the right 84 | right = cx + width / 2.0 85 | left = cx - width / 2.0 86 | # Y axis point downward 87 | top = cy - height / 2.0 88 | bottom = cy + height / 2.0 89 | # Z axis point forward 90 | front = cz + depth / 2.0 91 | rear = cz - depth / 2.0 92 | 93 | # List of 8 vertices of the box 94 | self._vertices = [ 95 | [right, top, front], # Front Top Right 96 | [left, top, front], # Front Top Left 97 | [left, bottom, front], # Front Bottom Left 98 | [right, bottom, front], # Front Bottom Right 99 | [right, top, rear], # Rear Top Right 100 | [left, top, rear], # Rear Top Left 101 | [left, bottom, rear], # Rear Bottom Left 102 | [right, bottom, rear], # Rear Bottom Right 103 | self.center_location, # Center 104 | ] 105 | else: 106 | sx, sy, sz = self.size3d 107 | forward = np.array(self.coord_system.forward, dtype=float) * sy * 0.5 108 | up = np.array(self.coord_system.up, dtype=float) * sz * 0.5 109 | right = np.array(self.coord_system.right, dtype=float) * sx * 0.5 110 | center = np.array(self.center_location, dtype=float) 111 | self._vertices = [ 112 | center + forward + up + right, # Front Top Right 113 | center + forward + up - right, # Front Top Left 114 | center + forward - up - right, # Front Bottom Left 115 | center + forward - up + right, # Front Bottom Right 116 | center - forward + up + right, # Rear Top Right 117 | center - forward + up - right, # Rear Top Left 118 | center - forward - up - right, # Rear Bottom Left 119 | center - forward - up + right, # Rear Bottom Right 120 | self.center_location, # Center 121 | ] 122 | 123 | def get_projected_cuboid2d(self, cuboid_transform, camera_intrinsic_matrix): 124 | """ 125 | Projects the cuboid into the image plane using camera intrinsics. 
126 | 127 | Args: 128 | cuboid_transform: the world transform of the cuboid 129 | camera_intrinsic_matrix: camera intrinsic matrix 130 | 131 | Returns: 132 | Cuboid2d - the projected cuboid points 133 | """ 134 | 135 | world_transform_matrix = cuboid_transform 136 | rvec = [0, 0, 0] 137 | tvec = [0, 0, 0] 138 | dist_coeffs = np.zeros((4, 1)) 139 | 140 | transformed_vertices = [0, 0, 0] * CuboidVertexType.TotalVertexCount 141 | for vertex_index in range(CuboidVertexType.TotalVertexCount): 142 | vertex3d = self._vertices[vertex_index] 143 | transformed_vertices[vertex_index] = world_transform_matrix * vertex3d 144 | 145 | projected_vertices = cv2.projectPoints(transformed_vertices, rvec, tvec, 146 | camera_intrinsic_matrix, dist_coeffs) 147 | 148 | return Cuboid2d(projected_vertices) 149 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from config import Config 4 | from network.rtpose_vgg import get_model, use_vgg 5 | from dataset import Dataset 6 | from torch.utils.data import DataLoader 7 | import torch.nn as nn 8 | import torch.optim 9 | import numpy as np 10 | from tensorboardX import SummaryWriter 11 | 12 | 13 | def _train(class_name, path_to_data_dir, path_to_logs_dir, batch_size, epochs, restore): 14 | 15 | # create tensorboard 16 | writer = SummaryWriter(path_to_logs_dir) 17 | 18 | # dataloader 19 | train_dataset = Dataset(class_name=class_name, path_to_data=path_to_data_dir) 20 | train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=0, drop_last=True) 21 | val_dataset = Dataset(path_to_data=path_to_data_dir, class_name=class_name, split='val') 22 | val_dataloader = DataLoader(val_dataset, batch_size, shuffle=False, num_workers=0, drop_last=True) 23 | 24 | # load model 25 | model = get_model(trunk='vgg19') 26 | model = model.cuda() 27 | use_vgg(model, './model', 'vgg19') 28 | 29 | # restore model 30 | if restore: 31 | model.load_state_dict(torch.load(restore)) 32 | 33 | model.train() 34 | 35 | # freeze low-level layer 36 | for i in range(20): 37 | for param in model.model0[i].parameters(): 38 | param.requires_grad = False 39 | trainable_vars = [param for param in model.parameters() if param.requires_grad] 40 | optimizer = torch.optim.Adam(trainable_vars, lr=0.0001) 41 | 42 | epoch = 0 43 | step = 1 44 | best_mse = 1.0 45 | 46 | while epoch != epochs: 47 | for batch_index, (images, heatmaps_target, pafs_target, _, _) in enumerate(train_dataloader): 48 | images = images.cuda() 49 | _, saved_for_loss = model(images) 50 | loss, heatmaps_losses, pafs_losses = _loss(saved_for_loss, heatmaps_target.cuda(), pafs_target.cuda()) 51 | optimizer.zero_grad() 52 | loss.backward() 53 | optimizer.step() 54 | if step % 10 == 0: 55 | print('Epoch: {}, Step: {}, Loss: {}' .format(epoch, step, loss.data.item())) 56 | writer.add_scalar('train_total_loss/loss', loss, step) 57 | for stage, (heatmaps_loss, pafs_loss) in enumerate(zip(heatmaps_losses, pafs_losses)): 58 | writer.add_scalar('train_heatmaps_loss/stage_{}' .format(str(stage)), heatmaps_loss, step) 59 | writer.add_scalar('train_pafs_loss/stage_{}' .format(str(stage)), pafs_loss, step) 60 | if step % 1000 == 0: 61 | pafs_loss, heatmaps_loss = _validate(model, val_dataloader) 62 | total_loss = pafs_loss + heatmaps_loss 63 | print('Validation Paf MSE: {} Heatmap MSE: {} Total MSE: {}' .format(pafs_loss, heatmaps_loss, total_loss)) 64 | 
                writer.add_scalar('val/heatmaps_loss', heatmaps_loss, step)
65 |                 writer.add_scalar('val/pafs_loss', pafs_loss, step)
66 |                 writer.add_scalar('val/total_loss', total_loss, step)
67 |                 if total_loss < best_mse:
68 |                     print('Save checkpoint')
69 |                     torch.save(model.state_dict(), os.path.join(path_to_logs_dir, '{}-checkpoint-best.pth'.format(class_name)))
70 |                     best_mse = total_loss
71 |                     print('Best MSE: {}'.format(total_loss))
72 |                 model.train()
73 |             step += 1
74 |         epoch += 1
75 |     print('Save checkpoint')
76 |     torch.save(model.state_dict(), os.path.join(path_to_logs_dir, '{}-checkpoint-last.pth'.format(class_name)))
77 | 
78 | 
79 | def _validate(model, val_dataloader):
80 |     model.eval()
81 |     total_pafs_mse = []
82 |     total_heatmaps_mse = []
83 |     for batch_index, (images, heatmaps_target, pafs_target, _, _) in enumerate(val_dataloader):  # dataset also yields locations and ratio
84 |         images = images.cuda()
85 |         out, _ = model(images)
86 |         pafs_pred = out[0].detach().cpu().numpy()
87 |         pafs_target = pafs_target.numpy()
88 |         heatmaps_pred = out[1].detach().cpu().numpy()
89 |         heatmaps_target = heatmaps_target.numpy()
90 |         pafs_mse = np.nanmean((pafs_pred - pafs_target) ** 2)
91 |         heatmaps_mse = np.nanmean((heatmaps_pred - heatmaps_target) ** 2)
92 |         total_pafs_mse.append(pafs_mse)
93 |         total_heatmaps_mse.append(heatmaps_mse)
94 |     total_pafs_mse = np.array(total_pafs_mse).mean()
95 |     total_heatmaps_mse = np.array(total_heatmaps_mse).mean()
96 | 
97 |     return total_pafs_mse, total_heatmaps_mse
98 | 
99 | 
100 | def _loss(saved_for_loss, heatmaps_target, pafs_target):
101 |     criterion = nn.MSELoss().cuda()
102 |     total_loss = 0
103 |     heatmaps_losses = []
104 |     pafs_losses = []
105 |     for i in range(Config.output_stage):
106 |         pafs_pred = saved_for_loss[2*i]
107 |         heatmaps_pred = saved_for_loss[2*i+1]
108 |         heatmaps_loss = criterion(heatmaps_pred, heatmaps_target)
109 |         heatmaps_losses.append(heatmaps_loss)
110 |         pafs_loss = criterion(pafs_pred, pafs_target)
111 |         pafs_losses.append(pafs_loss)
112 |         total_loss += heatmaps_loss + pafs_loss
113 | 
114 |     return total_loss, heatmaps_losses, pafs_losses
115 | 
116 | 
117 | if __name__ == '__main__':
118 | 
119 |     def main(args):
120 |         path_to_data_dir = args.path_to_data_dir
121 |         if not os.path.exists(path_to_data_dir):
122 |             raise FileNotFoundError(path_to_data_dir)
123 |         path_to_logs_dir = args.path_to_logs_dir
124 |         if path_to_logs_dir:
125 |             os.makedirs(path_to_logs_dir, exist_ok=True)
126 |         class_name = args.class_name
127 |         batch_size = args.batch_size
128 |         epochs = args.epochs
129 |         restore = args.restore_checkpoint
130 |         _train(class_name, path_to_data_dir, path_to_logs_dir, batch_size, epochs, restore)
131 | 
132 |     parser = argparse.ArgumentParser()
133 |     parser.add_argument('-d', '--path_to_data_dir', default='/media/ssd_external/Bottle_dataset_split', help='path to data directory')
134 |     parser.add_argument('-l', '--path_to_logs_dir', default='./logs', help='path to logs directory')
135 |     parser.add_argument('-b', '--batch_size', default=32, type=int, help='batch size')
136 |     parser.add_argument('-e', '--epochs', default=10, type=int, help='epochs')
137 |     parser.add_argument('-class', '--class_name', dest="class_name", choices=["Jack_Daniels", "Jose_Cuervo"], default="Jack_Daniels", type=str, help='the class name of object')
138 |     parser.add_argument('-r', '--restore_checkpoint', default=None,
139 |                         help='path to restore checkpoint file, e.g., ./logs/model-100.pth')
140 |     main(parser.parse_args())
141 | 
--------------------------------------------------------------------------------
/eval.py:
-------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from config import Config 4 | from network.rtpose_vgg import get_model, use_vgg 5 | from dataset import Dataset 6 | from torch.utils.data import DataLoader 7 | import torch.optim 8 | import numpy as np 9 | import json 10 | from utils import * 11 | from cuboid import Cuboid3d 12 | from pyrr import Quaternion 13 | from tqdm import tqdm 14 | 15 | 16 | def _eval(class_name, path_to_data_dir, path_to_checkpoint, img_prefix): 17 | 18 | # load pre-trained model 19 | model = get_model(trunk='vgg19') 20 | model = model.cuda() 21 | use_vgg(model, './model', 'vgg19') 22 | print("=> Load pre-trained model from {}".format(path_to_checkpoint)) 23 | model.load_state_dict(torch.load(path_to_checkpoint)) 24 | model.eval() 25 | 26 | # parameter of object size for pnp solver 27 | print("=> Load {} object size".format(class_name)) 28 | path_to_object_seetings = os.path.join(path_to_data_dir, '_object_settings.json') 29 | if not os.path.exists(path_to_object_seetings): 30 | raise FileNotFoundError(path_to_object_seetings) 31 | object_list = json.load(open(path_to_object_seetings))['exported_objects'] 32 | object_size = None 33 | for obj in object_list: 34 | if obj['class'].find(class_name) != -1: 35 | object_size = obj['cuboid_dimensions'] 36 | if not object_size: 37 | raise ValueError("Object size is none") 38 | _cuboid3d = Cuboid3d(object_size) 39 | cuboid3d_points = np.array(_cuboid3d.get_vertices()) 40 | 41 | # parameter of camera for pnp solver 42 | path_to_camera_seetings = os.path.join(path_to_data_dir, '_camera_settings.json') 43 | if not os.path.exists(path_to_camera_seetings): 44 | raise FileNotFoundError(path_to_camera_seetings) 45 | intrinsic_settings = json.load(open(path_to_camera_seetings))['camera_settings'][0]['intrinsic_settings'] 46 | matrix_camera = np.zeros((3, 3)) 47 | matrix_camera[0,0] = intrinsic_settings['fx'] 48 | matrix_camera[1,1] = intrinsic_settings['fy'] 49 | matrix_camera[0,2] = max(intrinsic_settings['cx'], intrinsic_settings['cy']) 50 | matrix_camera[1,2] = max(intrinsic_settings['cx'], intrinsic_settings['cy']) 51 | matrix_camera[2,2] = 1 52 | 53 | try: 54 | dist_coeffs = np.array(json.load(open(path_to_camera_seetings))['camera_settings'][0]["distortion_coefficients"]) 55 | except KeyError: 56 | dist_coeffs = np.zeros((4, 1)) 57 | 58 | # dataloader 59 | val_dataset = Dataset(path_to_data=path_to_data_dir, class_name=class_name, split='val', img_prefix=img_prefix) 60 | val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=0, drop_last=False) 61 | 62 | correct = 0 63 | wrong = 0 64 | # set threshold (cm) 65 | threshold = 3.0 66 | 67 | for batch_index, (images, _, _, location_targets, ratio) in tqdm(enumerate(val_dataloader)): 68 | images = images.cuda() 69 | output, _ = model(images) 70 | line, vertex = output[0], output[1] 71 | line, vertex = line.squeeze(), vertex.squeeze() 72 | objects, peaks = find_objects(vertex, line) 73 | location_predictions = [] 74 | if len(objects) > 0: 75 | for object in objects: 76 | cuboid2d_points = object[1] + [(object[0][0] * 8, object[0][1] * 8)] 77 | cuboid3d_points = np.array(cuboid3d_points) 78 | location = None 79 | quaternion = None 80 | obj_2d_points = [] 81 | obj_3d_points = [] 82 | 83 | for i in range(8): 84 | check_point_2d = cuboid2d_points[i] 85 | # Ignore invalid points 86 | if (check_point_2d is None): 87 | continue 88 | elif check_point_2d[0] < 0 or check_point_2d[1] < 0 or 
check_point_2d[ 89 | 0] >= Config.crop_size / Config.stride or check_point_2d[1] >= Config.crop_size / Config.stride: 90 | continue 91 | else: 92 | check_point_2d = (check_point_2d[0] * Config.stride * ratio, check_point_2d[1] * Config.stride * ratio) 93 | obj_2d_points.append(check_point_2d) 94 | obj_3d_points.append(cuboid3d_points[i]) 95 | projected_points = object[1] 96 | vertexes = projected_points.copy() 97 | centroid = tuple([int(point * Config.stride * ratio) for point in object[0]]) 98 | obj_2d_points = np.array(obj_2d_points, dtype=np.float32) 99 | obj_3d_points = np.array(obj_3d_points, dtype=np.float32) 100 | valid_point_count = len(obj_2d_points) 101 | if valid_point_count >= 4: 102 | ret, rvec, tvec = cv2.solvePnP(obj_3d_points, obj_2d_points, matrix_camera, dist_coeffs, 103 | flags=cv2.SOLVEPNP_ITERATIVE) 104 | if ret: 105 | location = list(x[0] for x in tvec) 106 | quaternion = convert_rvec_to_quaternion(rvec) 107 | 108 | projected_points, _ = cv2.projectPoints(cuboid3d_points, rvec, tvec, matrix_camera, 109 | dist_coeffs) 110 | projected_points = np.squeeze(projected_points) 111 | # If the location.Z is negative or object is behind the camera then flip both location and rotation 112 | x, y, z = location 113 | if z < 0: 114 | # Get the opposite location 115 | location = [-x, -y, -z] 116 | # Change the rotation by 180 degree 117 | rotate_angle = np.pi 118 | rotate_quaternion = Quaternion.from_axis_rotation(location, rotate_angle) 119 | quaternion = rotate_quaternion.cross(quaternion) 120 | vertexes = [tuple(p) for p in projected_points] 121 | location_predictions.append(location) 122 | location_predictions = np.array(location_predictions) 123 | if len(location_targets) == 0: 124 | wrong += len(location_predictions) 125 | else: 126 | location_targets = location_targets.cpu().data.numpy()[0] 127 | for location_target in location_targets: 128 | distances = [np.sqrt(np.sum(np.square(location_target - location_prediction/10.0))) for location_prediction in location_predictions] 129 | if len(distances) == 0: 130 | pass 131 | wrong += 1 132 | elif min(distances) > threshold: 133 | wrong += 1 134 | else: 135 | correct += 1 136 | 137 | print('Object: {} Accuracy: {}%'.format(class_name, correct/(wrong+correct)*100.0)) 138 | 139 | 140 | if __name__ == '__main__': 141 | 142 | def main(args): 143 | path_to_data_dir = args.path_to_data_dir 144 | if not os.path.exists(path_to_data_dir): 145 | raise FileNotFoundError(path_to_data_dir) 146 | path_to_checkpoint = args.checkpoint 147 | if not os.path.exists(path_to_checkpoint): 148 | raise FileNotFoundError(path_to_data_dir) 149 | class_name = args.class_name 150 | img_prefix = args.img_prefix 151 | _eval(class_name, path_to_data_dir, path_to_checkpoint, img_prefix) 152 | 153 | parser = argparse.ArgumentParser() 154 | parser.add_argument('-d', '--path_to_data_dir', default='/media/ssd_external/Bottle_dataset_split', help='path to data directory') 155 | parser.add_argument('-class', '--class_name', dest="class_name", choices=["Jack_Daniels", "Jose_Cuervo"], default="Jack_Daniels", type=str, help='the class name of object') 156 | parser.add_argument('-c', '--checkpoint', dest="checkpoint", required=True, type=str, 157 | help='the path of model checkpoint') 158 | parser.add_argument('-prefix', '--img_prefix', dest="img_prefix", default="png", type=str, 159 | help='the image prefix') 160 | main(parser.parse_args()) 161 | -------------------------------------------------------------------------------- /demo.py: 
-------------------------------------------------------------------------------- 1 | from network.rtpose_vgg import get_model, use_vgg 2 | import cv2 3 | import glob 4 | import os 5 | import torch 6 | from utils import * 7 | from config import Config 8 | from argparse import ArgumentParser 9 | from cuboid import Cuboid3d 10 | from pyrr import Quaternion 11 | import json 12 | 13 | 14 | def main(args): 15 | output_dir = args.output_dir 16 | if output_dir: 17 | os.makedirs(output_dir, exist_ok=True) 18 | path_to_data_dir = args.path_to_data_dir 19 | if not os.path.exists(path_to_data_dir): 20 | raise FileNotFoundError(path_to_data_dir) 21 | path_to_checkpoint = args.checkpoint 22 | if not os.path.exists(path_to_checkpoint): 23 | raise FileNotFoundError(path_to_data_dir) 24 | class_name = args.class_name 25 | fps = args.fps 26 | img_prefix = args.img_prefix 27 | 28 | # load pre-trained model 29 | model = get_model(trunk='vgg19') 30 | model = model.cuda() 31 | use_vgg(model, './model', 'vgg19') 32 | print("=> Load pre-trained model from {}".format(path_to_checkpoint)) 33 | model.load_state_dict(torch.load(path_to_checkpoint)) 34 | model.eval() 35 | 36 | # parameter of object size for pnp solver 37 | print("=> Load {} object size".format(class_name)) 38 | path_to_object_seetings = os.path.join(path_to_data_dir, '_object_settings.json') 39 | if not os.path.exists(path_to_object_seetings): 40 | raise FileNotFoundError(path_to_object_seetings) 41 | object_list = json.load(open(path_to_object_seetings))['exported_objects'] 42 | object_size = None 43 | for obj in object_list: 44 | if obj['class'].find(class_name) != -1: 45 | object_size = obj['cuboid_dimensions'] 46 | if not object_size: 47 | raise ValueError("Object size is none") 48 | _cuboid3d = Cuboid3d(object_size) 49 | cuboid3d_points = np.array(_cuboid3d.get_vertices()) 50 | 51 | # parameter of camera for pnp solver 52 | path_to_camera_seetings = os.path.join(path_to_data_dir, '_camera_settings.json') 53 | if not os.path.exists(path_to_camera_seetings): 54 | raise FileNotFoundError(path_to_camera_seetings) 55 | intrinsic_settings = json.load(open(path_to_camera_seetings))['camera_settings'][0]['intrinsic_settings'] 56 | matrix_camera = np.zeros((3, 3)) 57 | matrix_camera[0, 0] = intrinsic_settings['fx'] 58 | matrix_camera[1, 1] = intrinsic_settings['fy'] 59 | matrix_camera[0, 2] = max(intrinsic_settings['cx'], intrinsic_settings['cy']) 60 | matrix_camera[1, 2] = max(intrinsic_settings['cx'], intrinsic_settings['cy']) 61 | matrix_camera[2, 2] = 1 62 | try: 63 | dist_coeffs = np.array(json.load(open(path_to_camera_seetings))['camera_settings'][0]["distortion_coefficients"]) 64 | except KeyError: 65 | dist_coeffs = np.zeros((4, 1)) 66 | path_to_sequences = sorted(glob.glob(os.path.join(path_to_data_dir, '*.{}'.format(img_prefix)))) 67 | 68 | for img_path in path_to_sequences: 69 | original_img = crop(cv2.imread(img_path)) 70 | ratio = max(original_img.shape[:2]) / Config.crop_size 71 | img = cv2.resize(original_img, (Config.crop_size, Config.crop_size)) 72 | img = preprocess(img).float() 73 | img = torch.unsqueeze(img, 0) 74 | out, _ = model(img.cuda()) 75 | line, vertex = out[0].squeeze(), out[1].squeeze() 76 | objects, peaks = find_objects(vertex, line) 77 | original_img = cv2.putText(original_img, "Class name: {}".format(class_name), (50, 50), 78 | cv2.FONT_HERSHEY_COMPLEX, 1, (255, 255, 255), 2) 79 | 80 | if len(objects) > 0: 81 | for object in objects: 82 | cuboid2d_points = object[1] + [(object[0][0] * 8, object[0][1] * 8)] 83 | 
cuboid3d_points = np.array(cuboid3d_points) 84 | location = None 85 | quaternion = None 86 | obj_2d_points = [] 87 | obj_3d_points = [] 88 | 89 | for i in range(8): 90 | check_point_2d = cuboid2d_points[i] 91 | # Ignore invalid points 92 | if check_point_2d is None: 93 | continue 94 | elif check_point_2d[0] < 0 or check_point_2d[1] < 0 or check_point_2d[0] >= Config.crop_size/Config.stride or check_point_2d[1] >= Config.crop_size/Config.stride: 95 | continue 96 | else: 97 | check_point_2d = (check_point_2d[0] * Config.stride * ratio, check_point_2d[1] * Config.stride * ratio) 98 | obj_2d_points.append(check_point_2d) 99 | obj_3d_points.append(cuboid3d_points[i]) 100 | centroid = tuple([int(point * Config.stride * ratio) for point in object[0]]) 101 | original_img = cv2.circle(original_img, centroid, 5, -1) 102 | obj_2d_points = np.array(obj_2d_points, dtype=float) 103 | obj_3d_points = np.array(obj_3d_points, dtype=float) 104 | valid_point_count = len(obj_2d_points) 105 | if valid_point_count >= 5: 106 | ret, rvec, tvec = cv2.solvePnP(obj_3d_points, obj_2d_points, matrix_camera, dist_coeffs, flags=cv2.SOLVEPNP_ITERATIVE) 107 | if ret: 108 | location = list(x[0] for x in tvec) 109 | quaternion = convert_rvec_to_quaternion(rvec) 110 | 111 | projected_points, _ = cv2.projectPoints(cuboid3d_points, rvec, tvec, matrix_camera, 112 | dist_coeffs) 113 | projected_points = np.squeeze(projected_points) 114 | # If the location.Z is negative or object is behind the camera then flip both location and rotation 115 | x, y, z = location 116 | original_img = cv2.putText(original_img, "Location Prediction: x: {:.2f} y: {:.2f} z: {:.2f}".format(x/10, y/10, z/10), (50, 150), cv2.FONT_HERSHEY_COMPLEX, 1, (255,255,255), 2) 117 | print("Location Prediction: x: {:.2f} y: {:.2f} z: {:.2f}".format(x/10, y/10, z/10)) 118 | if z < 0: 119 | # Get the opposite location 120 | location = [-x, -y, -z] 121 | 122 | # Change the rotation by 180 degree 123 | rotate_angle = np.pi 124 | rotate_quaternion = Quaternion.from_axis_rotation(location, rotate_angle) 125 | quaternion = rotate_quaternion.cross(quaternion) 126 | vertexes = [tuple(p) for p in projected_points] 127 | plot(original_img, vertexes) 128 | if args.save: 129 | if not os.path.exists(output_dir): 130 | os.makedirs(output_dir, exist_ok=True) 131 | output_path = os.path.join(output_dir, img_path.split('/')[-1]) 132 | print('=> Save {}' .format(output_path)) 133 | cv2.imwrite(output_path, original_img) 134 | if args.plot: 135 | original_img = cv2.resize(original_img, (600, 600)) 136 | cv2.imshow('prediction', original_img) 137 | cv2.waitKey(int(1000/fps)) 138 | 139 | 140 | def plot(img, _vertexes, color=(255, 255, 0), scale=1): 141 | vertexes = _vertexes.copy() 142 | # plot 143 | for point in vertexes: 144 | if point: 145 | point = tuple([int(p*scale) for p in point]) 146 | img = cv2.circle(img, point, 7, color, -1) 147 | line_1 = [0, 1, 5, 4, 4, 0, 1, 5, 7, 3, 2, 6] 148 | line_2 = [1, 5, 4, 0, 7, 3, 2, 6, 3, 2, 6, 7] 149 | for p1, p2 in zip(line_1, line_2): 150 | if vertexes[p1] and vertexes[p2]: 151 | _p1 = tuple([int(p * scale) for p in vertexes[p1]]) 152 | _p2 = tuple([int(p * scale) for p in vertexes[p2]]) 153 | img = cv2.line(img, _p1, _p2, color=color, thickness=2) 154 | if vertexes[4] and vertexes[3] and vertexes[8]: 155 | center = tuple([int(p * scale) for p in vertexes[8]]) 156 | norm = tuple([int(((p1 + p2) // 2 * scale - c) * 10 + c) for p1, p2, c in zip(vertexes[4], vertexes[3], center)]) 157 | img = cv2.circle(img, norm, 7, color, 1) 158 | img = 
cv2.line(img, center, norm, color=color, thickness=2) 159 | 160 | return img 161 | 162 | 163 | if __name__ == "__main__": 164 | parser = ArgumentParser(description="Semantic Segmentation parser") 165 | parser.add_argument('-class', '--class_name', dest="class_name", choices=["Jack_Daniels", "Jose_Cuervo"], default="Jack_Daniels", type=str, help='the class name of object') 166 | parser.add_argument('-d', '--path_to_data_dir', dest="path_to_data_dir", required=True, type=str, help='the path of rgb image') 167 | parser.add_argument('-c', '--checkpoint', dest="checkpoint", required=True, type=str, 168 | help='the path of model checkpoint') 169 | parser.add_argument('-o', '--output_dir', dest="output_dir", default=None, type=str, 170 | help='the path of output folder') 171 | parser.add_argument('-f', '--fps', dest="fps", default=5, type=int, 172 | help='FPS of input sequence') 173 | parser.add_argument('-s', '--save', dest="save", action='store_true') 174 | parser.add_argument('-p', '--plot', dest="plot", action='store_true') 175 | parser.add_argument('-prefix', '--img_prefix', dest="img_prefix", default="jpg", type=str, 176 | help='the image prefix') 177 | main(parser.parse_args()) 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from config import Config 3 | from torchvision import transforms 4 | from scipy.ndimage.filters import gaussian_filter 5 | from pyrr import Quaternion 6 | import cv2 7 | 8 | 9 | def preprocess(img): 10 | transform = transforms.Compose([ 11 | transforms.ToTensor(), 12 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 13 | ]) 14 | 15 | return transform(img) 16 | 17 | 18 | def crop(img): 19 | h, w = img.shape[:2] 20 | m = max(h, w) 21 | top = (m - h) // 2 22 | bottom = (m - h) // 2 23 | if top + bottom + h < m: 24 | bottom += 1 25 | left = (m - w) // 2 26 | right = (m - w) // 2 27 | if left + right + w < m: 28 | right += 1 29 | pad_image = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0)) 30 | 31 | return pad_image 32 | 33 | 34 | def generate_gaussianmap(center, accumulate_confid_map): 35 | crop_size = Config.crop_size 36 | stride = Config.stride 37 | sigma = Config.sigma 38 | 39 | grid = crop_size / stride 40 | start = stride / 2.0 - 0.5 41 | x_range = [i for i in range(int(grid))] 42 | y_range = [i for i in range(int(grid))] 43 | xx, yy = np.meshgrid(x_range, y_range) 44 | xx = xx * stride + start 45 | yy = yy * stride + start 46 | d2 = (xx - center[0]) ** 2 + (yy - center[1]) ** 2 47 | exponent = d2 / 2.0 / sigma / sigma 48 | mask = exponent <= 4.6052 49 | cofid_map = np.exp(-exponent) 50 | cofid_map = np.multiply(mask, cofid_map) 51 | accumulate_confid_map += cofid_map 52 | accumulate_confid_map[accumulate_confid_map > 1.0] = 1.0 53 | return accumulate_confid_map 54 | 55 | 56 | def generate_vecmap(centerA, centerB, accumulate_vec_map, count): 57 | centerA = centerA.astype(float) 58 | centerB = centerB.astype(float) 59 | crop_size = Config.crop_size 60 | stride = Config.stride 61 | grid = crop_size / stride 62 | 63 | thre = Config.vec_width 64 | centerB = centerB / stride 65 | centerA = centerA / stride 66 | 67 | limb_vec = centerB - centerA 68 | norm = np.linalg.norm(limb_vec) 69 | if (norm == 0.0): 70 | return accumulate_vec_map, count 71 | limb_vec_unit = limb_vec / norm 72 | 73 | # To make sure not beyond the border of this two 
points 74 | min_x = max(int(round(min(centerA[0], centerB[0]) - thre)), 0) 75 | max_x = min(int(round(max(centerA[0], centerB[0]) + thre)), grid) 76 | min_y = max(int(round(min(centerA[1], centerB[1]) - thre)), 0) 77 | max_y = min(int(round(max(centerA[1], centerB[1]) + thre)), grid) 78 | range_x = list(range(int(min_x), int(max_x), 1)) 79 | range_y = list(range(int(min_y), int(max_y), 1)) 80 | xx, yy = np.meshgrid(range_x, range_y) 81 | ba_x = xx - centerA[0] # the vector from (x,y) to centerA 82 | ba_y = yy - centerA[1] 83 | limb_width = np.abs(ba_x * limb_vec_unit[1] - ba_y * limb_vec_unit[0]) 84 | mask = limb_width < thre # mask is 2D 85 | 86 | vec_map = np.copy(accumulate_vec_map) * 0.0 87 | vec_map[yy, xx] = np.repeat(mask[:, :, np.newaxis], 2, axis=2) 88 | vec_map[yy, xx] *= limb_vec_unit[np.newaxis, np.newaxis, :] 89 | mask = np.logical_or.reduce( 90 | (np.abs(vec_map[:, :, 0]) > 0, np.abs(vec_map[:, :, 1]) > 0)) 91 | 92 | accumulate_vec_map = np.multiply( 93 | accumulate_vec_map, count[:, :, np.newaxis]) 94 | accumulate_vec_map += vec_map 95 | count[mask == True] += 1 96 | 97 | mask = count == 0 98 | 99 | count[mask == True] = 1 100 | 101 | accumulate_vec_map = np.divide(accumulate_vec_map, count[:, :, np.newaxis]) 102 | count[mask == True] = 0 103 | 104 | return accumulate_vec_map, count 105 | 106 | 107 | def find_objects(vertex2, aff, numvertex=8): 108 | '''Detects objects given network belief maps and affinities, using heuristic method''' 109 | 110 | all_peaks = [] 111 | peak_counter = 0 112 | for j in range(vertex2.size()[0]): 113 | belief = vertex2[j].clone() 114 | map_ori = belief.cpu().data.numpy() 115 | map = gaussian_filter(belief.cpu().data.numpy(), sigma=Config.test_sigma) 116 | p = 1 117 | map_left = np.zeros(map.shape) 118 | map_left[p:, :] = map[:-p, :] 119 | map_right = np.zeros(map.shape) 120 | map_right[:-p, :] = map[p:, :] 121 | map_up = np.zeros(map.shape) 122 | map_up[:, p:] = map[:, :-p] 123 | map_down = np.zeros(map.shape) 124 | map_down[:, :-p] = map[:, p:] 125 | 126 | peaks_binary = np.logical_and.reduce( 127 | ( 128 | map >= map_left, 129 | map >= map_right, 130 | map >= map_up, 131 | map >= map_down, 132 | map > Config.thresh_map) 133 | ) 134 | peaks = zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0]) 135 | 136 | # Computing the weigthed average for localizing the peaks 137 | peaks = list(peaks) 138 | 139 | win = 5 140 | ran = win // 2 141 | peaks_avg = [] 142 | for p_value in range(len(peaks)): 143 | p = peaks[p_value] 144 | weights = np.zeros((win, win)) 145 | i_values = np.zeros((win, win)) 146 | j_values = np.zeros((win, win)) 147 | for i in range(-ran, ran + 1): 148 | for j in range(-ran, ran + 1): 149 | if p[1] + i < 0 \ 150 | or p[1] + i >= map_ori.shape[0] \ 151 | or p[0] + j < 0 \ 152 | or p[0] + j >= map_ori.shape[1]: 153 | continue 154 | 155 | i_values[j + ran, i + ran] = p[1] + i 156 | j_values[j + ran, i + ran] = p[0] + j 157 | 158 | weights[j + ran, i + ran] = (map_ori[p[1] + i, p[0] + j]) 159 | 160 | # if the weights are all zeros 161 | # then add the none continuous points 162 | OFFSET_DUE_TO_UPSAMPLING = 0.4395 163 | try: 164 | peaks_avg.append( 165 | (np.average(j_values, weights=weights) + OFFSET_DUE_TO_UPSAMPLING, 166 | np.average(i_values, weights=weights) + OFFSET_DUE_TO_UPSAMPLING)) 167 | except: 168 | peaks_avg.append((p[0] + OFFSET_DUE_TO_UPSAMPLING, p[1] + OFFSET_DUE_TO_UPSAMPLING)) 169 | # Note: Python3 doesn't support len for zip object 170 | peaks_len = min(len(np.nonzero(peaks_binary)[1]), 
len(np.nonzero(peaks_binary)[0])) 171 | 172 | peaks_with_score = [peaks_avg[x_] + (map_ori[peaks[x_][1], peaks[x_][0]],) for x_ in range(len(peaks))] 173 | 174 | id = range(peak_counter, peak_counter + peaks_len) 175 | 176 | peaks_with_score_and_id = [peaks_with_score[i] + (id[i],) for i in range(len(id))] 177 | 178 | all_peaks.append(peaks_with_score_and_id) 179 | peak_counter += peaks_len 180 | 181 | objects = [] 182 | 183 | # Check object centroid and build the objects if the centroid is found 184 | for nb_object in range(len(all_peaks[-1])): 185 | if all_peaks[-1][nb_object][2] > Config.thresh_points: 186 | objects.append([ 187 | [all_peaks[-1][nb_object][:2][0], all_peaks[-1][nb_object][:2][1]], 188 | [None for i in range(numvertex)], 189 | [None for i in range(numvertex)], 190 | all_peaks[-1][nb_object][2] 191 | ]) 192 | 193 | # Working with an output that only has belief maps 194 | if aff is None: 195 | if len(objects) > 0 and len(all_peaks) > 0 and len(all_peaks[0]) > 0: 196 | for i_points in range(8): 197 | if len(all_peaks[i_points]) > 0 and all_peaks[i_points][0][2] > Config.threshold: 198 | objects[0][1][i_points] = (all_peaks[i_points][0][0], all_peaks[i_points][0][1]) 199 | else: 200 | # For all points found 201 | for i_lists in range(len(all_peaks[:-1])): 202 | lists = all_peaks[i_lists] 203 | 204 | for candidate in lists: 205 | if candidate[2] < Config.thresh_points: 206 | continue 207 | 208 | i_best = -1 209 | best_dist = 10000 210 | best_angle = 100 211 | for i_obj in range(len(objects)): 212 | center = [objects[i_obj][0][0], objects[i_obj][0][1]] 213 | 214 | # integer is used to look into the affinity map, 215 | # but the float version is used to run 216 | point_int = [int(candidate[0]), int(candidate[1])] 217 | point = [candidate[0], candidate[1]] 218 | 219 | # look at the distance to the vector field. 
220 | v_aff = np.array([ 221 | aff[i_lists * 2, 222 | point_int[1], 223 | point_int[0]].data.item(), 224 | aff[i_lists * 2 + 1, 225 | point_int[1], 226 | point_int[0]].data.item()]) * 10 227 | 228 | # normalize the vector 229 | xvec = v_aff[0] 230 | yvec = v_aff[1] 231 | 232 | norms = np.sqrt(xvec * xvec + yvec * yvec) 233 | 234 | xvec /= norms 235 | yvec /= norms 236 | 237 | v_aff = np.concatenate([[xvec], [yvec]]) 238 | 239 | v_center = np.array(center) - np.array(point) 240 | xvec = v_center[0] 241 | yvec = v_center[1] 242 | 243 | norms = np.sqrt(xvec * xvec + yvec * yvec) 244 | 245 | xvec /= norms 246 | yvec /= norms 247 | 248 | v_center = np.concatenate([[xvec], [yvec]]) 249 | 250 | # vector affinity 251 | dist_angle = np.linalg.norm(v_center - v_aff) 252 | 253 | # distance between vertexes 254 | dist_point = np.linalg.norm(np.array(point) - np.array(center)) 255 | 256 | if dist_angle < Config.thresh_angle \ 257 | and best_dist > 1000 \ 258 | or dist_angle < Config.thresh_angle \ 259 | and best_dist > dist_point: 260 | i_best = i_obj 261 | best_angle = dist_angle 262 | best_dist = dist_point 263 | 264 | if i_best is -1: 265 | continue 266 | 267 | if objects[i_best][1][i_lists] is None \ 268 | or best_angle < Config.thresh_angle \ 269 | and best_dist < objects[i_best][2][i_lists][1]: 270 | objects[i_best][1][i_lists] = (candidate[0], candidate[1]) 271 | objects[i_best][2][i_lists] = (best_angle, best_dist) 272 | 273 | return objects, all_peaks 274 | 275 | 276 | def convert_rvec_to_quaternion(rvec): 277 | '''Convert rvec (which is log quaternion) to quaternion''' 278 | theta = np.sqrt(rvec[0] * rvec[0] + rvec[1] * rvec[1] + rvec[2] * rvec[2]) # in radians 279 | raxis = [rvec[0] / theta, rvec[1] / theta, rvec[2] / theta] 280 | 281 | # pyrr's Quaternion (order is XYZW), https://pyrr.readthedocs.io/en/latest/oo_api_quaternion.html 282 | 283 | return Quaternion.from_axis_rotation(raxis, theta) 284 | -------------------------------------------------------------------------------- /network/rtpose_vgg.py: -------------------------------------------------------------------------------- 1 | """CPM Pytorch Implementation""" 2 | 3 | from collections import OrderedDict 4 | import os 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.utils.data as data 9 | import torch.utils.model_zoo as model_zoo 10 | from torch.autograd import Variable 11 | from torch.nn import init 12 | from config import Config 13 | 14 | 15 | def make_stages(cfg_dict): 16 | """Builds CPM stages from a dictionary 17 | Args: 18 | cfg_dict: a dictionary 19 | """ 20 | layers = [] 21 | for i in range(len(cfg_dict) - 1): 22 | one_ = cfg_dict[i] 23 | for k, v in one_.items(): 24 | if 'pool' in k: 25 | layers += [nn.MaxPool2d(kernel_size=v[0], stride=v[1], 26 | padding=v[2])] 27 | else: 28 | conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], 29 | kernel_size=v[2], stride=v[3], 30 | padding=v[4]) 31 | layers += [conv2d, nn.ReLU(inplace=True)] 32 | one_ = list(cfg_dict[-1].keys()) 33 | k = one_[0] 34 | v = cfg_dict[-1][k] 35 | conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], 36 | kernel_size=v[2], stride=v[3], padding=v[4]) 37 | layers += [conv2d] 38 | return nn.Sequential(*layers) 39 | 40 | 41 | def make_vgg19_block(block): 42 | """Builds a vgg19 block from a dictionary 43 | Args: 44 | block: a dictionary 45 | """ 46 | layers = [] 47 | for i in range(len(block)): 48 | one_ = block[i] 49 | for k, v in one_.items(): 50 | if 'pool' in k: 51 | layers += [nn.MaxPool2d(kernel_size=v[0], 
stride=v[1], 52 | padding=v[2])] 53 | else: 54 | conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], 55 | kernel_size=v[2], stride=v[3], 56 | padding=v[4]) 57 | layers += [conv2d, nn.ReLU(inplace=True)] 58 | return nn.Sequential(*layers) 59 | 60 | 61 | 62 | def get_model(trunk='vgg19'): 63 | """Creates the whole CPM model 64 | Args: 65 | trunk: string, 'vgg19' or 'mobilenet' 66 | Returns: Module, the defined model 67 | """ 68 | blocks = {} 69 | # block0 is the preprocessing stage 70 | if trunk == 'vgg19': 71 | block0 = [{'conv1_1': [3, 64, 3, 1, 1]}, 72 | {'conv1_2': [64, 64, 3, 1, 1]}, 73 | {'pool1_stage1': [2, 2, 0]}, 74 | {'conv2_1': [64, 128, 3, 1, 1]}, 75 | {'conv2_2': [128, 128, 3, 1, 1]}, 76 | {'pool2_stage1': [2, 2, 0]}, 77 | {'conv3_1': [128, 256, 3, 1, 1]}, 78 | {'conv3_2': [256, 256, 3, 1, 1]}, 79 | {'conv3_3': [256, 256, 3, 1, 1]}, 80 | {'conv3_4': [256, 256, 3, 1, 1]}, 81 | {'pool3_stage1': [2, 2, 0]}, 82 | {'conv4_1': [256, 512, 3, 1, 1]}, 83 | {'conv4_2': [512, 512, 3, 1, 1]}, 84 | {'conv4_3_CPM': [512, 256, 3, 1, 1]}, 85 | {'conv4_4_CPM': [256, 128, 3, 1, 1]}] 86 | 87 | elif trunk == 'mobilenet': 88 | block0 = [{'conv_bn': [3, 32, 2]}, # out: 3, 32, 184, 184 89 | {'conv_dw1': [32, 64, 1]}, # out: 32, 64, 184, 184 90 | {'conv_dw2': [64, 128, 2]}, # out: 64, 128, 92, 92 91 | {'conv_dw3': [128, 128, 1]}, # out: 128, 256, 92, 92 92 | {'conv_dw4': [128, 256, 2]}, # out: 256, 256, 46, 46 93 | {'conv4_3_CPM': [256, 256, 1, 3, 1]}, 94 | {'conv4_4_CPM': [256, 128, 1, 3, 1]}] 95 | 96 | # Stage 1 97 | blocks['block1_1'] = [{'conv5_1_CPM_L1': [128, 128, 3, 1, 1]}, 98 | {'conv5_2_CPM_L1': [128, 128, 3, 1, 1]}, 99 | {'conv5_3_CPM_L1': [128, 128, 3, 1, 1]}, 100 | {'conv5_4_CPM_L1': [128, 512, 1, 1, 0]}, 101 | {'conv5_5_CPM_L1': [512, 16, 1, 1, 0]}] 102 | 103 | blocks['block1_2'] = [{'conv5_1_CPM_L2': [128, 128, 3, 1, 1]}, 104 | {'conv5_2_CPM_L2': [128, 128, 3, 1, 1]}, 105 | {'conv5_3_CPM_L2': [128, 128, 3, 1, 1]}, 106 | {'conv5_4_CPM_L2': [128, 512, 1, 1, 0]}, 107 | {'conv5_5_CPM_L2': [512, 9, 1, 1, 0]}] 108 | 109 | # Stages 2 - 6 110 | for i in range(2, 7): 111 | blocks['block%d_1' % i] = [ 112 | {'Mconv1_stage%d_L1' % i: [153, 128, 7, 1, 3]}, 113 | {'Mconv2_stage%d_L1' % i: [128, 128, 7, 1, 3]}, 114 | {'Mconv3_stage%d_L1' % i: [128, 128, 7, 1, 3]}, 115 | {'Mconv4_stage%d_L1' % i: [128, 128, 7, 1, 3]}, 116 | {'Mconv5_stage%d_L1' % i: [128, 128, 7, 1, 3]}, 117 | {'Mconv6_stage%d_L1' % i: [128, 128, 1, 1, 0]}, 118 | {'Mconv7_stage%d_L1' % i: [128, 16, 1, 1, 0]} 119 | ] 120 | 121 | blocks['block%d_2' % i] = [ 122 | {'Mconv1_stage%d_L2' % i: [153, 128, 7, 1, 3]}, 123 | {'Mconv2_stage%d_L2' % i: [128, 128, 7, 1, 3]}, 124 | {'Mconv3_stage%d_L2' % i: [128, 128, 7, 1, 3]}, 125 | {'Mconv4_stage%d_L2' % i: [128, 128, 7, 1, 3]}, 126 | {'Mconv5_stage%d_L2' % i: [128, 128, 7, 1, 3]}, 127 | {'Mconv6_stage%d_L2' % i: [128, 128, 1, 1, 0]}, 128 | {'Mconv7_stage%d_L2' % i: [128, 9, 1, 1, 0]} 129 | ] 130 | 131 | models = {} 132 | 133 | if trunk == 'vgg19': 134 | print("Bulding VGG19") 135 | models['block0'] = make_vgg19_block(block0) 136 | 137 | for k, v in blocks.items(): 138 | models[k] = make_stages(list(v)) 139 | 140 | class rtpose_model(nn.Module): 141 | def __init__(self, model_dict): 142 | super(rtpose_model, self).__init__() 143 | self.model0 = model_dict['block0'] 144 | self.model1_1 = model_dict['block1_1'] 145 | self.model2_1 = model_dict['block2_1'] 146 | self.model3_1 = model_dict['block3_1'] 147 | self.model4_1 = model_dict['block4_1'] 148 | self.model5_1 = model_dict['block5_1'] 149 | 
self.model6_1 = model_dict['block6_1'] 150 | 151 | self.model1_2 = model_dict['block1_2'] 152 | self.model2_2 = model_dict['block2_2'] 153 | self.model3_2 = model_dict['block3_2'] 154 | self.model4_2 = model_dict['block4_2'] 155 | self.model5_2 = model_dict['block5_2'] 156 | self.model6_2 = model_dict['block6_2'] 157 | 158 | self._initialize_weights_norm() 159 | 160 | def forward(self, x): 161 | 162 | saved_for_loss = [] 163 | out1 = self.model0(x) 164 | 165 | out1_1 = self.model1_1(out1) 166 | out1_2 = self.model1_2(out1) 167 | out2 = torch.cat([out1_1, out1_2, out1], 1) 168 | saved_for_loss.append(out1_1) 169 | saved_for_loss.append(out1_2) 170 | out = (out1_1, out1_2) 171 | 172 | if Config.output_stage > 1: 173 | out2_1 = self.model2_1(out2) 174 | out2_2 = self.model2_2(out2) 175 | out3 = torch.cat([out2_1, out2_2, out1], 1) 176 | saved_for_loss.append(out2_1) 177 | saved_for_loss.append(out2_2) 178 | out = (out2_1, out2_2) 179 | 180 | if Config.output_stage > 2: 181 | out3_1 = self.model3_1(out3) 182 | out3_2 = self.model3_2(out3) 183 | out4 = torch.cat([out3_1, out3_2, out1], 1) 184 | saved_for_loss.append(out3_1) 185 | saved_for_loss.append(out3_2) 186 | out = (out3_1, out3_2) 187 | 188 | if Config.output_stage > 3: 189 | out4_1 = self.model4_1(out4) 190 | out4_2 = self.model4_2(out4) 191 | out5 = torch.cat([out4_1, out4_2, out1], 1) 192 | saved_for_loss.append(out4_1) 193 | saved_for_loss.append(out4_2) 194 | out = (out4_1, out4_2) 195 | 196 | if Config.output_stage > 4: 197 | out5_1 = self.model5_1(out5) 198 | out5_2 = self.model5_2(out5) 199 | out6 = torch.cat([out5_1, out5_2, out1], 1) 200 | saved_for_loss.append(out5_1) 201 | saved_for_loss.append(out5_2) 202 | out = (out5_1, out5_2) 203 | 204 | if Config.output_stage > 5: 205 | out6_1 = self.model6_1(out6) 206 | out6_2 = self.model6_2(out6) 207 | saved_for_loss.append(out6_1) 208 | saved_for_loss.append(out6_2) 209 | out = (out6_1, out6_2) 210 | 211 | return out, saved_for_loss 212 | 213 | def _initialize_weights_norm(self): 214 | 215 | for m in self.modules(): 216 | if isinstance(m, nn.Conv2d): 217 | init.normal_(m.weight, std=0.01) 218 | if m.bias is not None: # mobilenet conv2d doesn't add bias 219 | init.constant_(m.bias, 0.0) 220 | 221 | # last layer of these block don't have Relu 222 | init.normal_(self.model1_1[8].weight, std=0.01) 223 | init.normal_(self.model1_2[8].weight, std=0.01) 224 | 225 | init.normal_(self.model2_1[12].weight, std=0.01) 226 | init.normal_(self.model3_1[12].weight, std=0.01) 227 | init.normal_(self.model4_1[12].weight, std=0.01) 228 | init.normal_(self.model5_1[12].weight, std=0.01) 229 | init.normal_(self.model6_1[12].weight, std=0.01) 230 | 231 | init.normal_(self.model2_2[12].weight, std=0.01) 232 | init.normal_(self.model3_2[12].weight, std=0.01) 233 | init.normal_(self.model4_2[12].weight, std=0.01) 234 | init.normal_(self.model5_2[12].weight, std=0.01) 235 | init.normal_(self.model6_2[12].weight, std=0.01) 236 | 237 | model = rtpose_model(models) 238 | return model 239 | 240 | 241 | """Load pretrained model on Imagenet 242 | :param model, the PyTorch nn.Module which will train. 243 | :param model_path, the directory which load the pretrained model, will download one if not have. 244 | :param trunk, the feature extractor network of model. 
245 | """
246 | 
247 | 
248 | def use_vgg(model, model_path, trunk):
249 |     model_urls = {
250 |         'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth',
251 |         'ssd': 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth',
252 |         'vgg19': 'https://download.pytorch.org/models/vgg19-dcbb9e9d.pth'}
253 | 
254 |     number_weight = {
255 |         'vgg16': 18,
256 |         'ssd': 18,
257 |         'vgg19': 20}
258 | 
259 |     url = model_urls[trunk]
260 | 
261 |     if trunk == 'ssd':
262 |         # urllib.request must be imported explicitly under Python 3
263 |         from urllib.request import urlretrieve
264 |         urlretrieve('https://s3.amazonaws.com/amdegroot-models/ssd300_mAP_77.43_v2.pth',
265 |                     os.path.join(model_path, 'ssd.pth'))
266 |         vgg_state_dict = torch.load(os.path.join(model_path, 'ssd.pth'))
267 |         print('loading SSD')
268 |     else:
269 |         vgg_state_dict = model_zoo.load_url(url, model_dir=model_path)
270 |     vgg_keys = vgg_state_dict.keys()
271 | 
272 |     # copy the first `number_weight[trunk]` tensors (weight/bias pairs) of the
273 |     # pretrained backbone into the matching layers of model.model0
274 |     weights_load = {}
275 |     for i in range(number_weight[trunk]):
276 |         weights_load[list(model.state_dict().keys())[i]] = vgg_state_dict[list(vgg_keys)[i]]
277 | 
278 |     state = model.state_dict()
279 |     state.update(weights_load)
280 |     model.load_state_dict(state)
281 | 
--------------------------------------------------------------------------------
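To tie the pieces together, here is a condensed, illustrative sketch of the inference path that `demo.py` and `eval.py` implement: pad the image to a square, resize to the network input, decode corner/centroid peaks with `find_objects`, scale the peaks back to the original resolution, and recover the 6-DoF pose with `cv2.solvePnP`. The checkpoint and image paths, cuboid dimensions, and camera intrinsics below are placeholders, not values shipped with the repository (the scripts read the real ones from `_object_settings.json` and `_camera_settings.json`).

```python
import cv2
import numpy as np
import torch

from config import Config
from cuboid import Cuboid3d
from network.rtpose_vgg import get_model
from utils import crop, preprocess, find_objects

model = get_model(trunk='vgg19').cuda()
model.load_state_dict(torch.load('./logs/Jack_Daniels-checkpoint.pth'))   # hypothetical checkpoint
model.eval()

image = crop(cv2.imread('./data/Real_bottle_sequence/000001.jpg'))        # pad to square
ratio = max(image.shape[:2]) / Config.crop_size
tensor = preprocess(cv2.resize(image, (Config.crop_size, Config.crop_size))).float()
out, _ = model(tensor.unsqueeze(0).cuda())
pafs, beliefs = out[0].squeeze(), out[1].squeeze()      # 16 PAF maps, 9 belief maps

objects, _ = find_objects(beliefs, pafs)                # detections at grid resolution
cuboid3d = np.array(Cuboid3d([7.0, 22.0, 7.0]).get_vertices(), dtype=float)   # made-up size
K = np.array([[615.0, 0.0, 320.0], [0.0, 615.0, 320.0], [0.0, 0.0, 1.0]])     # made-up intrinsics

for obj in objects:
    pts_2d, pts_3d = [], []
    for i, p in enumerate(obj[1][:8]):                  # detected corner peaks (may be None)
        if p is None:
            continue
        pts_2d.append((p[0] * Config.stride * ratio, p[1] * Config.stride * ratio))
        pts_3d.append(cuboid3d[i])
    if len(pts_2d) >= 4:
        ok, rvec, tvec = cv2.solvePnP(np.array(pts_3d, dtype=np.float32),
                                      np.array(pts_2d, dtype=np.float32),
                                      K, np.zeros((4, 1)), flags=cv2.SOLVEPNP_ITERATIVE)
        if ok:
            print('translation:', tvec.ravel())
```

On top of this, `demo.py` flips the recovered pose when the solved translation has negative depth and draws the reprojected cuboid, while `eval.py` counts a detection as correct when the predicted location lies within 3 cm of the ground-truth location.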