├── README.md
├── process_BundleTrack_data.py
└── process_data.py

/README.md:
--------------------------------------------------------------------------------
# ObjTracking
Please check out the HOI4D Challenge on the project website www.hoi4d.top!
## Overview
This code base provides the benchmark for the HOI4D challenge object tracking task, along with scripts to preprocess the dataset for BundleTrack, a state-of-the-art category-level object pose tracking method.
## Challenge
For this challenge, you need to submit a pred.npy file (your predicted results) to the leaderboard. The file pred.npy is an ndarray containing your predictions for test_wolabel.h5. You can download an example here: [Link](https://1drv.ms/u/s!ApQF_e_bw-USgjQCKg9hGJIijeqs?e=eGfohd). A hedged sketch of loading test_wolabel.h5 and writing pred.npy appears at the end of this document.

## Usage
After you process the data using our scripts, you can easily run [BundleTrack](https://github.com/wenbowen123/BundleTrack).
## Citation
```
@InProceedings{Liu_2022_CVPR,
    author    = {Liu, Yunze and Liu, Yun and Jiang, Che and Lyu, Kangbo and Wan, Weikang and Shen, Hao and Liang, Boqiang and Fu, Zhoujie and Wang, He and Yi, Li},
    title     = {HOI4D: A 4D Egocentric Dataset for Category-Level Human-Object Interaction},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2022},
    pages     = {21013-21022}
}
```
```
@inproceedings{wen2021bundletrack,
  title={BundleTrack: 6D Pose Tracking for Novel Objects without Instance or Category-Level 3D Models},
  author={Wen, B and Bekris, Kostas E},
  booktitle={IEEE/RSJ International Conference on Intelligent Robots and Systems},
  year={2021}
}
```

--------------------------------------------------------------------------------
/process_BundleTrack_data.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import cv2
from process_data import shift_mask, get_specific_semantic_mask, read_pose_from_json


def get_video_names(dataset_dir, category):
    # Walk the HOI4D directory hierarchy ZY*/H*/<category>/N*/S*/s*/T*, keeping
    # videos that have frame 0 and a full set of 300 per-frame pose annotations.
    video_names = []
    for ZY in os.listdir(dataset_dir):
        p1 = os.path.join(dataset_dir, ZY)
        if "ZY" not in ZY:
            continue
        for H in os.listdir(p1):
            p2 = os.path.join(p1, H)
            if "H" not in H:
                continue
            p3 = os.path.join(p2, category)
            if not os.path.isdir(p3):
                continue
            for N in os.listdir(p3):
                p4 = os.path.join(p3, N)
                if "N" not in N:
                    continue
                for S in os.listdir(p4):
                    p5 = os.path.join(p4, S)
                    if "S" not in S:
                        continue
                    for s in os.listdir(p5):
                        p6 = os.path.join(p5, s)
                        if "s" not in s:
                            continue
                        for T in os.listdir(p6):
                            p7 = os.path.join(p6, T)
                            if "T" not in T:
                                continue
                            if ZY == "ZY20210800001":
                                if not os.path.isfile(os.path.join(p7, "align_image", "0.jpg")):
                                    continue
                            else:
                                if not os.path.isfile(os.path.join(p7, "shift_rgb", "0.jpg")):
                                    continue
                            flag = True
                            for i in range(300):
                                pose_path = os.path.join(p7, "objpose", str(i) + ".json")
                                if not os.path.isfile(pose_path):
                                    pose_path = os.path.join(p7, "objpose", str(i).zfill(5) + ".json")
                                    if not os.path.isfile(pose_path):
                                        flag = False
                                        break
                            if not flag:
                                continue
                            video_names.append(os.path.join(ZY, H, category, N, S, s, T))
    return video_names


def prepare_BundleTrack_data(dataset_dir, HOI4D_Sim_dir, intrinsics_dir, video_name, save_dir):
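    """Convert one HOI4D video into the per-video layout used for BundleTrack.

    Under save_dir/<video_name with "/" replaced by "_">, this writes:
        rgb/%04d.png             - color frames
        depth/%04d.png           - 16-bit depth frames
        masks/%04d.png           - binary object masks (semantic label 1)
        annotated_poses/%04d.txt - 4x4 annotated object pose per frame
        cam_K.txt                - 3x3 camera intrinsics
    Each video is assumed to have exactly 300 annotated frames.
    """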
    save_dir = os.path.join(save_dir, video_name.replace("/", "_"))
    rgb_dir = os.path.join(save_dir, "rgb")
    depth_dir = os.path.join(save_dir, "depth")
    masks_dir = os.path.join(save_dir, "masks")
    poses_dir = os.path.join(save_dir, "annotated_poses")
    os.makedirs(rgb_dir, exist_ok=True)
    os.makedirs(depth_dir, exist_ok=True)
    os.makedirs(masks_dir, exist_ok=True)
    os.makedirs(poses_dir, exist_ok=True)

    # extract the capture-device id (e.g. "ZY20210800001") from the video name
    p1 = video_name.find("ZY")
    p2 = video_name[p1:].find("/")
    ZY = video_name[p1:p1+p2]
    intrinsic = np.load(os.path.join(intrinsics_dir, ZY, "intrin.npy"))
    np.savetxt(os.path.join(save_dir, "cam_K.txt"), intrinsic)

    for i in range(300):
        '''
        # HOI4D_Sim -> HOI4D_BundleTrack
        rgb_path = os.path.join(HOI4D_Sim_dir, video_name, "rgb", str(i).zfill(5) + ".png")
        depth_path = os.path.join(HOI4D_Sim_dir, video_name, "depth", str(i).zfill(5) + ".png")
        mask_path = os.path.join(HOI4D_Sim_dir, video_name, "2Dseg", str(i).zfill(5) + ".png")
        rgb = cv2.imread(rgb_path, cv2.IMREAD_UNCHANGED)
        depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
        mask = cv2.imread(mask_path, cv2.IMREAD_UNCHANGED)
        '''
        # HOI4D_overall -> HOI4D_BundleTrack
        if ZY == "ZY20210800001":
            rgb_path = os.path.join(dataset_dir, video_name, "align_image", str(i) + ".jpg")
        else:
            rgb_path = os.path.join(dataset_dir, video_name, "shift_rgb", str(i) + ".jpg")
        depth_path = os.path.join(dataset_dir, video_name, "align_depth", str(i) + ".png")
        mask_path = os.path.join(dataset_dir, video_name, "refine_2Dseg", "mask", str(i).zfill(5) + ".png")  # unshifted
        rgb = cv2.imread(rgb_path, cv2.IMREAD_UNCHANGED)
        depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)  # keep 16-bit depth values
        mask = cv2.imread(mask_path, cv2.IMREAD_UNCHANGED)[:, :, ::-1]  # BGR -> RGB, unshifted
        mask = shift_mask(mask, ZY)  # shifted to align with the depth frames

        object_mask = get_specific_semantic_mask(mask, label=1).astype(np.uint8)

        cv2.imwrite(os.path.join(rgb_dir, str(i).zfill(4) + ".png"), rgb)
        cv2.imwrite(os.path.join(depth_dir, str(i).zfill(4) + ".png"), depth)
        cv2.imwrite(os.path.join(masks_dir, str(i).zfill(4) + ".png"), object_mask)

        pose_path = os.path.join(dataset_dir, video_name, "objpose", str(i) + ".json")
        if not os.path.isfile(pose_path):
            pose_path = os.path.join(dataset_dir, video_name, "objpose", str(i).zfill(5) + ".json")
            if not os.path.isfile(pose_path):
                raise FileNotFoundError(pose_path)
        gt_pose = read_pose_from_json(pose_path)
        T = np.eye(4)
        T[:3, :3] = gt_pose["rotation"]
        T[:3, 3:] = gt_pose["translation"].reshape(3, 1)
        np.savetxt(os.path.join(poses_dir, str(i).zfill(4) + ".txt"), T)


if __name__ == "__main__":
    # HOI4D_Sim_dir = "/localdata_hdd1/HOI4D_Sim"
    HOI4D_Sim_dir = None

    ############ CHANGE THIS ############
    category = "C13"
    dataset_dir = "/share/datasets/HOI4D_overall"
    save_dir = os.path.join("/localdata_hdd1/HOI4D_BundleTrack", category)
    intrinsics_dir = "/share/datasets/HOI4D_intrinsics"
    #####################################

    video_names = get_video_names(dataset_dir, category)

    with open(category + ".txt", "w") as wr:
        for vn in video_names:
            wr.write(vn + "\n")

    print("len(video_names) = ", len(video_names))
    for video_name in video_names:
        prepare_BundleTrack_data(dataset_dir, HOI4D_Sim_dir, intrinsics_dir, video_name, save_dir)
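
    # Optional sanity check: the first processed video should now contain the
    # 3x3 intrinsics and a 4x4 pose for frame 0, exactly as written above.
    if video_names:
        first = os.path.join(save_dir, video_names[0].replace("/", "_"))
        assert np.loadtxt(os.path.join(first, "cam_K.txt")).shape == (3, 3)
        assert np.loadtxt(os.path.join(first, "annotated_poses", "0000.txt")).shape == (4, 4)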

--------------------------------------------------------------------------------
/process_data.py:
--------------------------------------------------------------------------------
import os
import pickle
import json
import cv2
import numpy as np
from scipy.spatial.transform import Rotation as Rt
import open3d as o3d


def get_color_map(N=256):
    # standard PASCAL VOC color map: semantic label index -> RGB color
    def bitget(byteval, idx):
        return ((byteval & (1 << idx)) != 0)

    cmap = np.zeros((N, 3), dtype=np.uint8)
    for i in range(N):
        r = g = b = 0
        c = i
        for j in range(8):
            r = r | (bitget(c, 0) << 7-j)
            g = g | (bitget(c, 1) << 7-j)
            b = b | (bitget(c, 2) << 7-j)
            c = c >> 3

        cmap[i] = np.array([r, g, b])

    return cmap


def shift_mask(mask, ZY):
    # per-camera pixel offsets that align the 2D masks with the depth frames
    if ZY == "ZY20210800001":
        dx, dy = 0, 0
    elif ZY in ("ZY20210800002", "ZY20210800004"):
        dx, dy = 15, -3
    elif ZY == "ZY20210800003":
        dx, dy = 15, -30
    else:
        raise NotImplementedError

    rows, cols, _ = mask.shape
    MAT = np.float32([[1, 0, dx], [0, 1, dy]])
    dst = cv2.warpAffine(mask, MAT, (cols, rows))
    return dst


def read_pose_from_json(pose_path, num=0):
    raw_data = json.load(open(pose_path, "r"))
    if "dataList" in raw_data:
        raw_pose = raw_data["dataList"][num]
    else:
        raw_pose = raw_data["objects"][num]

    translation, rotation, scale = raw_pose["center"], raw_pose["rotation"], raw_pose["dimensions"]
    translation = np.float32([translation["x"], translation["y"], translation["z"]])
    rotation = np.float32([rotation["x"], rotation["y"], rotation["z"]])
    rotation = Rt.from_euler('XYZ', rotation).as_matrix()  # intrinsic XYZ Euler angles
    scale = np.float32([scale["length"], scale["width"], scale["height"]])

    pose = {
        "translation": translation.reshape(3, 1),
        "rotation": rotation,
        "scale": scale.reshape(3, 1),
    }
    return pose


def read_poses_from_pkl(poses_path):
    raw_poses = pickle.load(open(poses_path, "rb"))
    return raw_poses


def get_specific_semantic_mask(mask, label=0):
    # boolean mask of pixels whose RGB color matches the given semantic label
    color_map = get_color_map(N=10)
    c = color_map[label]
    specific_mask = (mask[..., 0] == c[0]) & (mask[..., 1] == c[1]) & (mask[..., 2] == c[2])
    return specific_mask


def read_object_pcd(video_dir, idx, sampling_ratio=0.1):
    # read data
    p1 = video_dir.find("ZY")
    p2 = video_dir[p1:].find("/")
    ZY = video_dir[p1:p1+p2]
    if ZY == "ZY20210800001":
        rgb_path = os.path.join(video_dir, "align_image", str(idx) + ".jpg")
    else:
        rgb_path = os.path.join(video_dir, "shift_rgb", str(idx) + ".jpg")
    depth_path = os.path.join(video_dir, "align_depth", str(idx) + ".png")
    mask_path = os.path.join(video_dir, "refine_2Dseg", "mask", str(idx).zfill(5) + ".png")  # unshifted
    rgb = o3d.io.read_image(rgb_path)
    depth = o3d.io.read_image(depth_path)
    mask = cv2.imread(mask_path, cv2.IMREAD_UNCHANGED)[:, :, ::-1]  # BGR -> RGB, unshifted
    mask = shift_mask(mask, ZY)  # shifted

    # read intrinsic
    intrinsic_dir = "/share/datasets/HOI4D_intrinsics"
    intrinsic = np.load(os.path.join(intrinsic_dir, ZY, "intrin.npy"))  # shape = (3, 3)
    intrinsic = o3d.camera.PinholeCameraIntrinsic(1920, 1080, intrinsic[0, 0], intrinsic[1, 1], intrinsic[0, 2], intrinsic[1, 2])

    # process data
    object_mask = get_specific_semantic_mask(mask, label=1)
    object_depth = np.float32(depth)
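    # zero out depth everywhere outside the object mask so the RGBD
    # back-projection below yields points on the object only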
    object_depth[~object_mask] = 0
    object_depth = o3d.geometry.Image(object_depth)
    object_rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(rgb, object_depth, convert_rgb_to_intensity=False)
    object_pcd = o3d.geometry.PointCloud.create_from_rgbd_image(object_rgbd, intrinsic, extrinsic=np.eye(4))
    object_pcd = object_pcd.voxel_down_sample(voxel_size=0.001)
    object_pcd, _ = object_pcd.remove_radius_outlier(nb_points=500, radius=0.03)
    object_pcd = object_pcd.random_down_sample(sampling_ratio=sampling_ratio)
    return object_pcd


def read_model_pcd(model_path, N_points=10000):
    model_mesh = o3d.io.read_triangle_mesh(model_path)
    model_pcd = model_mesh.sample_points_uniformly(number_of_points=N_points, seed=0)
    model_points = np.float32(model_pcd.points)
    # center the model at the midpoint of its axis-aligned bounding box
    center = (np.min(model_points, axis=0) + np.max(model_points, axis=0)) / 2
    model_points -= center
    return model_points


if __name__ == "__main__":
    video_dir = "/share/datasets/HOI4D_overall/ZY20210800001/H1/C12/N33/S165/s02/T1"
    idx = 0

    pose_path = os.path.join(video_dir, "objpose", str(idx) + ".json")
    if not os.path.isfile(pose_path):
        pose_path = os.path.join(video_dir, "objpose", str(idx).zfill(5) + ".json")
        if not os.path.isfile(pose_path):
            raise FileNotFoundError(pose_path)
    gt_pose = read_pose_from_json(pose_path)
    object_pcd = read_object_pcd(video_dir, idx)

    object_points = np.float32(object_pcd.points)
    object_colors = np.float32(object_pcd.colors)
    # transform camera-frame points into the annotated object frame:
    # p_obj = R^T (p_cam - t), written row-wise as (p - t) @ R
    object_points = (object_points - gt_pose["translation"].reshape(3)) @ gt_pose["rotation"]
    object_pcd = o3d.geometry.PointCloud()
    object_pcd.points = o3d.utility.Vector3dVector(object_points)
    object_pcd.colors = o3d.utility.Vector3dVector(object_colors)
    o3d.io.write_point_cloud("gt.ply", object_pcd)
--------------------------------------------------------------------------------
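
A minimal sketch, referenced from the README's Challenge section, of inspecting test_wolabel.h5 and writing a pred.npy file. The HDF5 key ("data") and the 4x4-pose-per-frame layout are assumptions rather than anything this repository specifies; match the actual structure of test_wolabel.h5 and the downloadable example before submitting.

```python
import h5py
import numpy as np

# The dataset key "data" is hypothetical -- list the keys first and adapt.
with h5py.File("test_wolabel.h5", "r") as f:
    print(list(f.keys()))
    num_frames = len(f["data"])  # assumed: one entry per test frame

# Placeholder predictions: one 4x4 identity pose per frame (assumed layout);
# replace these with the poses produced by your tracker.
pred = np.stack([np.eye(4, dtype=np.float32)] * num_frames)
np.save("pred.npy", pred)  # upload pred.npy to the leaderboard
```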