├── BlazeposeDepthai.py ├── BlazeposeDepthaiEdge.py ├── BlazeposeRenderer.py ├── FPS.py ├── LICENSE.txt ├── README.md ├── custom_models ├── DetectionBestCandidate.py ├── DetectionBestCandidate_sh1.blob ├── DivideBy255.py ├── DivideBy255_sh1.blob ├── README.md └── convert_model.sh ├── demo.py ├── docker_tflite2tensorflow.sh ├── examples └── semaphore_alphabet │ ├── README.md │ ├── demo.py │ └── medias │ ├── semaphore.gif │ ├── semaphore.mp4 │ └── video.mp4 ├── img ├── 3d_visualizations.gif ├── 3d_world_visualization.gif ├── full_body_landmarks.png ├── pipeline_edge_mode.png ├── pipeline_host_mode.png └── taichi.gif ├── mediapipe_utils.py ├── models ├── convert_models.sh ├── gen_blob_shave.sh ├── pose_detection_sh4.blob ├── pose_landmark_full_sh4.blob ├── pose_landmark_heavy_sh4.blob └── pose_landmark_lite_sh4.blob ├── o3d_utils.py ├── requirements.txt └── template_manager_script.py /BlazeposeDepthai.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mediapipe_utils as mpu 3 | import cv2 4 | from pathlib import Path 5 | from FPS import FPS, now 6 | from math import sin, cos 7 | import depthai as dai 8 | import time, sys 9 | 10 | SCRIPT_DIR = Path(__file__).resolve().parent 11 | POSE_DETECTION_MODEL = str(SCRIPT_DIR / "models/pose_detection_sh4.blob") 12 | LANDMARK_MODEL_FULL = str(SCRIPT_DIR / "models/pose_landmark_full_sh4.blob") 13 | LANDMARK_MODEL_HEAVY = str(SCRIPT_DIR / "models/pose_landmark_heavy_sh4.blob") 14 | LANDMARK_MODEL_LITE = str(SCRIPT_DIR / "models/pose_landmark_lite_sh4.blob") 15 | 16 | 17 | def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray: 18 | return cv2.resize(arr, shape).transpose(2,0,1).flatten() 19 | 20 | class BlazeposeDepthai: 21 | """ 22 | Blazepose body pose detector 23 | Arguments: 24 | - input_src: frame source, 25 | - "rgb" or None: OAK* internal color camera, 26 | - "rgb_laconic": same as "rgb" but without sending the frames to the host, 27 | - a file path of an image or a video, 28 | - an integer (eg 0) for a webcam id, 29 | - pd_model: Blazepose detection model blob file (if None, takes the default value POSE_DETECTION_MODEL), 30 | - pd_score: confidence score to determine whether a detection is reliable (a float between 0 and 1). 31 | - lm_model: Blazepose landmark model blob file 32 | - None or "full": the default blob file LANDMARK_MODEL_FULL, 33 | - "lite": the default blob file LANDMARK_MODEL_LITE, 34 | - "heavy": default blob file LANDMARK_MODEL_HEAVY, 35 | - a path of a blob file. 36 | - lm_score_thresh : confidence score to determine whether landmarks prediction is reliable (a float between 0 and 1). 37 | - xyz: boolean, when True get the (x, y, z) coords of the reference point (center of the hips) (if the device supports depth measures). 38 | - crop : boolean which indicates if square cropping is done or not 39 | - smoothing: boolean which indicates if smoothing filtering is applied 40 | - filter_window_size and filter_velocity_scale: 41 | The filter keeps track (on a window of specified size) of 42 | value changes over time, which as result gives velocity of how value 43 | changes over time. With higher velocity it weights new values higher. 44 | - higher filter_window_size adds to lag and to stability 45 | - lower filter_velocity_scale adds to lag and to stability 46 | 47 | - internal_fps : when using the internal color camera as input source, set its FPS to this value (calling setFps()). 
48 | - resolution : sensor resolution "full" (1920x1080) or "ultra" (3840x2160), 49 | - internal_frame_height : when using the internal color camera, set the frame height (calling setIspScale()). 50 | The width is calculated accordingly to height and depends on value of 'crop' 51 | - stats : boolean, when True, display some statistics when exiting. 52 | - trace: boolean, when True print some debug messages 53 | - force_detection: boolean, force person detection on every frame (never use landmarks from previous frame to determine ROI) 54 | """ 55 | def __init__(self, input_src="rgb", 56 | pd_model=None, 57 | pd_score_thresh=0.5, 58 | lm_model=None, 59 | lm_score_thresh=0.7, 60 | xyz=False, 61 | crop=False, 62 | smoothing= True, 63 | internal_fps=None, 64 | resolution="full", 65 | internal_frame_height=1080, 66 | stats=False, 67 | trace=False, 68 | force_detection=False 69 | ): 70 | 71 | self.pd_model = pd_model if pd_model else POSE_DETECTION_MODEL 72 | print(f"Pose detection blob file : {self.pd_model}") 73 | self.rect_transf_scale = 1.25 74 | if lm_model is None or lm_model == "full": 75 | self.lm_model = LANDMARK_MODEL_FULL 76 | elif lm_model == "lite": 77 | self.lm_model = LANDMARK_MODEL_LITE 78 | elif lm_model == "heavy": 79 | self.lm_model = LANDMARK_MODEL_HEAVY 80 | else: 81 | self.lm_model = lm_model 82 | print(f"Landmarks using blob file : {self.lm_model}") 83 | 84 | self.pd_score_thresh = pd_score_thresh 85 | self.lm_score_thresh = lm_score_thresh 86 | self.smoothing = smoothing 87 | self.crop = crop 88 | self.internal_fps = internal_fps 89 | self.stats = stats 90 | self.force_detection = force_detection 91 | self.presence_threshold = 0.5 92 | self.visibility_threshold = 0.5 93 | 94 | self.device = dai.Device() 95 | self.xyz = False 96 | 97 | if input_src == None or input_src == "rgb" or input_src == "rgb_laconic": 98 | # Note that here (in Host mode), specifying "rgb_laconic" has no effect 99 | # Color camera frame is systematically transferred to the host 100 | self.input_type = "rgb" # OAK* internal color camera 101 | if internal_fps is None: 102 | if "heavy" in str(lm_model): 103 | self.internal_fps = 10 104 | elif "full" in str(lm_model): 105 | self.internal_fps = 8 106 | else: # Light 107 | self.internal_fps = 13 108 | else: 109 | self.internal_fps = internal_fps 110 | print(f"Internal camera FPS set to: {self.internal_fps}") 111 | if resolution == "full": 112 | self.resolution = (1920, 1080) 113 | elif resolution == "ultra": 114 | self.resolution = (3840, 2160) 115 | else: 116 | print(f"Error: {resolution} is not a valid resolution !") 117 | sys.exit() 118 | print("Sensor resolution:", self.resolution) 119 | 120 | self.video_fps = self.internal_fps # Used when saving the output in a video file. 
Should be close to the real fps 121 | 122 | if xyz: 123 | # Check if the device supports stereo 124 | cameras = self.device.getConnectedCameras() 125 | if dai.CameraBoardSocket.LEFT in cameras and dai.CameraBoardSocket.RIGHT in cameras: 126 | self.xyz = True 127 | else: 128 | print("Warning: depth unavailable on this device, 'xyz' argument is ignored") 129 | 130 | if self.crop: 131 | self.frame_size, self.scale_nd = mpu.find_isp_scale_params(internal_frame_height) 132 | self.img_h = self.img_w = self.frame_size 133 | self.pad_w = self.pad_h = 0 134 | self.crop_w = (int(round(self.resolution[0] * self.scale_nd[0] / self.scale_nd[1])) - self.img_w) // 2 135 | 136 | else: 137 | width, self.scale_nd = mpu.find_isp_scale_params(internal_frame_height * 1920 / 1080, is_height=False) 138 | self.img_h = int(round(self.resolution[1] * self.scale_nd[0] / self.scale_nd[1])) 139 | self.img_w = int(round(self.resolution[0] * self.scale_nd[0] / self.scale_nd[1])) 140 | self.pad_h = (self.img_w - self.img_h) // 2 141 | self.pad_w = 0 142 | self.frame_size = self.img_w 143 | self.crop_w = 0 144 | 145 | print(f"Internal camera image size: {self.img_w} x {self.img_h} - crop_w:{self.crop_w} pad_h: {self.pad_h}") 146 | 147 | elif input_src.endswith('.jpg') or input_src.endswith('.png') : 148 | self.input_type= "image" 149 | self.img = cv2.imread(input_src) 150 | self.video_fps = 25 151 | self.img_h, self.img_w = self.img.shape[:2] 152 | else: 153 | self.input_type = "video" 154 | if input_src.isdigit(): 155 | input_type = "webcam" 156 | input_src = int(input_src) 157 | self.cap = cv2.VideoCapture(input_src) 158 | self.video_fps = int(self.cap.get(cv2.CAP_PROP_FPS)) 159 | self.img_w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 160 | self.img_h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 161 | print("Video FPS:", self.video_fps) 162 | 163 | if self.input_type != "rgb": 164 | print(f"Original frame size: {self.img_w}x{self.img_h}") 165 | if self.crop: 166 | self.frame_size = min(self.img_w, self.img_h) # // 16 * 16 167 | else: 168 | self.frame_size = max(self.img_w, self.img_h) #// 16 * 16 169 | self.crop_w = max((self.img_w - self.frame_size) // 2, 0) 170 | if self.crop_w: print("Cropping on width :", self.crop_w) 171 | self.crop_h = max((self.img_h - self.frame_size) // 2, 0) 172 | if self.crop_h: print("Cropping on height :", self.crop_h) 173 | 174 | self.pad_w = max((self.frame_size - self.img_w) // 2, 0) 175 | if self.pad_w: print("Padding on width :", self.pad_w) 176 | self.pad_h = max((self.frame_size - self.img_h) // 2, 0) 177 | if self.pad_h: print("Padding on height :", self.pad_h) 178 | 179 | 180 | print(f"Frame working size: {self.img_w}x{self.img_h}") 181 | 182 | self.nb_kps = 33 # Number of "viewable" keypoints 183 | 184 | if self.smoothing: 185 | 186 | self.filter_landmarks = mpu.LandmarksSmoothingFilter( 187 | frequency=self.video_fps, 188 | min_cutoff=0.05, 189 | beta=80, 190 | derivate_cutoff=1 191 | ) 192 | # landmarks_aux corresponds to the 2 landmarks used to compute the ROI in next frame 193 | self.filter_landmarks_aux = mpu.LandmarksSmoothingFilter( 194 | frequency=self.video_fps, 195 | min_cutoff=0.01, 196 | beta=10, 197 | derivate_cutoff=1 198 | ) 199 | self.filter_landmarks_world = mpu.LandmarksSmoothingFilter( 200 | frequency=self.video_fps, 201 | min_cutoff=0.1, 202 | beta=40, 203 | derivate_cutoff=1, 204 | disable_value_scaling=True 205 | ) 206 | if self.xyz: 207 | self.filter_xyz = mpu.LowPassFilter(alpha=0.25) 208 | 209 | # Create SSD anchors 210 | self.anchors = 
mpu.generate_blazepose_anchors() 211 | self.nb_anchors = self.anchors.shape[0] 212 | print(f"{self.nb_anchors} anchors have been created") 213 | 214 | # Define and start pipeline 215 | self.pd_input_length = 224 216 | self.lm_input_length = 256 217 | usb_speed = self.device.getUsbSpeed() 218 | self.device.startPipeline(self.create_pipeline()) 219 | print(f"Pipeline started - USB speed: {str(usb_speed).split('.')[-1]}") 220 | 221 | # Define data queues 222 | if self.input_type == "rgb": 223 | self.q_video = self.device.getOutputQueue(name="cam_out", maxSize=1, blocking=False) 224 | self.q_pre_pd_manip_cfg = self.device.getInputQueue(name="pre_pd_manip_cfg") 225 | if self.xyz: 226 | self.q_spatial_data = self.device.getOutputQueue(name="spatial_data_out", maxSize=1, blocking=False) 227 | self.q_spatial_config = self.device.getInputQueue("spatial_calc_config_in") 228 | 229 | else: 230 | self.q_pd_in = self.device.getInputQueue(name="pd_in") 231 | self.q_pd_out = self.device.getOutputQueue(name="pd_out", maxSize=4, blocking=True) 232 | self.q_lm_in = self.device.getInputQueue(name="lm_in") 233 | self.q_lm_out = self.device.getOutputQueue(name="lm_out", maxSize=4, blocking=True) 234 | 235 | 236 | self.fps = FPS() 237 | 238 | self.nb_frames = 0 239 | self.nb_pd_inferences = 0 240 | self.nb_lm_inferences = 0 241 | self.nb_lm_inferences_after_landmarks_ROI = 0 242 | self.nb_frames_no_body = 0 243 | 244 | self.glob_pd_rtrip_time = 0 245 | self.glob_lm_rtrip_time = 0 246 | 247 | self.use_previous_landmarks = False 248 | 249 | self.cfg_pre_pd = dai.ImageManipConfig() 250 | self.cfg_pre_pd.setResizeThumbnail(self.pd_input_length, self.pd_input_length) 251 | 252 | def create_pipeline(self): 253 | print("Creating pipeline...") 254 | # Start defining a pipeline 255 | pipeline = dai.Pipeline() 256 | # pipeline.setOpenVINOVersion(version = dai.OpenVINO.Version.VERSION_2021_4) 257 | 258 | 259 | if self.input_type == "rgb": 260 | # ColorCamera 261 | print("Creating Color Camera...") 262 | cam = pipeline.createColorCamera() 263 | if self.resolution[0] == 1920: 264 | cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P) 265 | else: 266 | cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_4_K) 267 | cam.setInterleaved(False) 268 | cam.setIspScale(self.scale_nd[0], self.scale_nd[1]) 269 | cam.setFps(self.internal_fps) 270 | cam.setBoardSocket(dai.CameraBoardSocket.RGB) 271 | 272 | if self.crop: 273 | cam.setVideoSize(self.frame_size, self.frame_size) 274 | cam.setPreviewSize(self.frame_size, self.frame_size) 275 | else: 276 | cam.setVideoSize(self.img_w, self.img_h) 277 | cam.setPreviewSize(self.img_w, self.img_h) 278 | 279 | cam_out = pipeline.createXLinkOut() 280 | cam_out.setStreamName("cam_out") 281 | cam_out.input.setQueueSize(1) 282 | cam_out.input.setBlocking(False) 283 | cam.video.link(cam_out.input) 284 | 285 | # Define pose detection pre processing (resize preview to (self.pd_input_length, self.pd_input_length)) 286 | print("Creating Pose Detection pre processing image manip...") 287 | pre_pd_manip = pipeline.create(dai.node.ImageManip) 288 | pre_pd_manip.setMaxOutputFrameSize(self.pd_input_length*self.pd_input_length*3) 289 | pre_pd_manip.setWaitForConfigInput(True) 290 | pre_pd_manip.inputImage.setQueueSize(1) 291 | pre_pd_manip.inputImage.setBlocking(False) 292 | cam.preview.link(pre_pd_manip.inputImage) 293 | 294 | pre_pd_manip_cfg_in = pipeline.create(dai.node.XLinkIn) 295 | pre_pd_manip_cfg_in.setStreamName("pre_pd_manip_cfg") 296 | 
pre_pd_manip_cfg_in.out.link(pre_pd_manip.inputConfig) 297 | 298 | if self.xyz: 299 | 300 | # For now, RGB needs fixed focus to properly align with depth. 301 | # The value used during calibration should be used here 302 | calib_data = self.device.readCalibration() 303 | calib_lens_pos = calib_data.getLensPosition(dai.CameraBoardSocket.RGB) 304 | print(f"RGB calibration lens position: {calib_lens_pos}") 305 | cam.initialControl.setManualFocus(calib_lens_pos) 306 | 307 | mono_resolution = dai.MonoCameraProperties.SensorResolution.THE_400_P 308 | left = pipeline.createMonoCamera() 309 | left.setBoardSocket(dai.CameraBoardSocket.LEFT) 310 | left.setResolution(mono_resolution) 311 | left.setFps(self.internal_fps) 312 | 313 | right = pipeline.createMonoCamera() 314 | right.setBoardSocket(dai.CameraBoardSocket.RIGHT) 315 | right.setResolution(mono_resolution) 316 | right.setFps(self.internal_fps) 317 | 318 | stereo = pipeline.createStereoDepth() 319 | stereo.setConfidenceThreshold(230) 320 | # LR-check is required for depth alignment 321 | stereo.setLeftRightCheck(True) 322 | stereo.setDepthAlign(dai.CameraBoardSocket.RGB) 323 | stereo.setSubpixel(False) # subpixel True -> latency 324 | 325 | spatial_location_calculator = pipeline.createSpatialLocationCalculator() 326 | spatial_location_calculator.setWaitForConfigInput(True) 327 | spatial_location_calculator.inputDepth.setBlocking(False) 328 | spatial_location_calculator.inputDepth.setQueueSize(1) 329 | 330 | spatial_data_out = pipeline.createXLinkOut() 331 | spatial_data_out.setStreamName("spatial_data_out") 332 | spatial_data_out.input.setQueueSize(1) 333 | spatial_data_out.input.setBlocking(False) 334 | 335 | spatial_calc_config_in = pipeline.createXLinkIn() 336 | spatial_calc_config_in.setStreamName("spatial_calc_config_in") 337 | 338 | left.out.link(stereo.left) 339 | right.out.link(stereo.right) 340 | 341 | stereo.depth.link(spatial_location_calculator.inputDepth) 342 | 343 | spatial_location_calculator.out.link(spatial_data_out.input) 344 | spatial_calc_config_in.out.link(spatial_location_calculator.inputConfig) 345 | 346 | 347 | # Define pose detection model 348 | print("Creating Pose Detection Neural Network...") 349 | pd_nn = pipeline.createNeuralNetwork() 350 | pd_nn.setBlobPath(str(Path(self.pd_model).resolve().absolute())) 351 | # Increase threads for detection 352 | # pd_nn.setNumInferenceThreads(2) 353 | # Specify that network takes latest arriving frame in non-blocking manner 354 | # Pose detection input 355 | if self.input_type == "rgb": 356 | pre_pd_manip.out.link(pd_nn.input) 357 | else: 358 | pd_in = pipeline.createXLinkIn() 359 | pd_in.setStreamName("pd_in") 360 | pd_in.out.link(pd_nn.input) 361 | 362 | # Pose detection output 363 | pd_out = pipeline.createXLinkOut() 364 | pd_out.setStreamName("pd_out") 365 | pd_nn.out.link(pd_out.input) 366 | 367 | 368 | # Define landmark model 369 | print("Creating Landmark Neural Network...") 370 | lm_nn = pipeline.createNeuralNetwork() 371 | lm_nn.setBlobPath(str(Path(self.lm_model).resolve().absolute())) 372 | lm_nn.setNumInferenceThreads(1) 373 | # Landmark input 374 | lm_in = pipeline.createXLinkIn() 375 | lm_in.setStreamName("lm_in") 376 | lm_in.out.link(lm_nn.input) 377 | # Landmark output 378 | lm_out = pipeline.createXLinkOut() 379 | lm_out.setStreamName("lm_out") 380 | lm_nn.out.link(lm_out.input) 381 | 382 | print("Pipeline created.") 383 | return pipeline 384 | 385 | def is_present(self, body, lm_id): 386 | return body.presence[lm_id] > self.presence_threshold 387 | 388 | def 
is_visible(self, body, lm_id):
    389 |         if body.visibility[lm_id] > self.visibility_threshold and \
    390 |                 0 <= body.landmarks[lm_id][0] < self.img_w and \
    391 |                 0 <= body.landmarks[lm_id][1] < self.img_h :
    392 |             return True
    393 |         else:
    394 |             return False
    395 | 
    396 |     def query_body_xyz(self, body):
    397 |         # We want the 3d position (x,y,z) in meters of the body reference keypoint
    398 |         # in the camera coord system.
    399 |         # The reference point is either :
    400 |         # - the middle of the hips if both hips are present (presence of right and left hips > threshold),
    401 |         # - the middle of the shoulders in case at least one hip is not present and
    402 |         #   both shoulders are present,
    403 |         # - None otherwise
    404 |         if self.is_visible(body, mpu.KEYPOINT_DICT['right_hip']) and self.is_visible(body, mpu.KEYPOINT_DICT['left_hip']):
    405 |             body.xyz_ref = "mid_hips"
    406 |             body.xyz_ref_coords_pixel = np.mean([
    407 |                 body.landmarks[mpu.KEYPOINT_DICT['right_hip']][:2],
    408 |                 body.landmarks[mpu.KEYPOINT_DICT['left_hip']][:2]],
    409 |                 axis=0)
    410 |         elif self.is_visible(body, mpu.KEYPOINT_DICT['right_shoulder']) and self.is_visible(body, mpu.KEYPOINT_DICT['left_shoulder']):
    411 |             body.xyz_ref = "mid_shoulders"
    412 |             body.xyz_ref_coords_pixel = np.mean([
    413 |                 body.landmarks[mpu.KEYPOINT_DICT['right_shoulder']][:2],
    414 |                 body.landmarks[mpu.KEYPOINT_DICT['left_shoulder']][:2]],
    415 |                 axis=0)
    416 |         else:
    417 |             body.xyz_ref = None
    418 |             return
    419 |         # Prepare the request to SpatialLocationCalculator
    420 |         # ROI : small rectangular zone around the reference keypoint
    421 |         half_zone_size = int(max(body.rect_w_a / 90, 4))
    422 |         xc = int(body.xyz_ref_coords_pixel[0] + self.crop_w)
    423 |         yc = int(body.xyz_ref_coords_pixel[1])
    424 |         roi_left = max(0, xc - half_zone_size)
    425 |         roi_right = min(self.img_w-1, xc + half_zone_size)
    426 |         roi_top = max(0, yc - half_zone_size)
    427 |         roi_bottom = min(self.img_h-1, yc + half_zone_size)
    428 |         roi_topleft = dai.Point2f(roi_left, roi_top)
    429 |         roi_bottomright = dai.Point2f(roi_right, roi_bottom)
    430 |         # Config
    431 |         conf_data = dai.SpatialLocationCalculatorConfigData()
    432 |         conf_data.depthThresholds.lowerThreshold = 100
    433 |         conf_data.depthThresholds.upperThreshold = 10000
    434 |         # conf_data.roi = dai.Rect(roi_center, roi_size)
    435 |         conf_data.roi = dai.Rect(roi_topleft, roi_bottomright)
    436 |         cfg = dai.SpatialLocationCalculatorConfig()
    437 |         cfg.setROIs([conf_data])
    438 |         # spatial_rtrip_time = now()
    439 |         self.q_spatial_config.send(cfg)
    440 | 
    441 |         # Receives spatial locations
    442 |         spatial_data = self.q_spatial_data.get().getSpatialLocations()
    443 |         # self.glob_spatial_rtrip_time += now() - spatial_rtrip_time
    444 |         # self.nb_spatial_requests += 1
    445 |         sd = spatial_data[0]
    446 |         body.xyz_zone = [
    447 |             int(sd.config.roi.topLeft().x) - self.crop_w,
    448 |             int(sd.config.roi.topLeft().y),
    449 |             int(sd.config.roi.bottomRight().x) - self.crop_w,
    450 |             int(sd.config.roi.bottomRight().y)
    451 |             ]
    452 |         body.xyz = np.array([
    453 |             sd.spatialCoordinates.x,
    454 |             sd.spatialCoordinates.y,
    455 |             sd.spatialCoordinates.z
    456 |             ])
    457 |         if self.smoothing:
    458 |             body.xyz = self.filter_xyz.apply(body.xyz)
    459 | 
    460 |     def pd_postprocess(self, inference):
    461 |         scores = np.array(inference.getLayerFp16("Identity_1"), dtype=np.float16) # 2254
    462 |         bboxes = np.array(inference.getLayerFp16("Identity"), dtype=np.float16).reshape((self.nb_anchors,12)) # 2254x12
    463 |         # Decode bboxes
    464 |         bodies = mpu.decode_bboxes(self.pd_score_thresh, scores, bboxes, self.anchors, best_only=True)
    465 |         if bodies:
    466 |             body = 
bodies[0] 467 | else: 468 | return None 469 | mpu.detections_to_rect(body) 470 | mpu.rect_transformation(body, self.frame_size, self.frame_size, self.rect_transf_scale) 471 | return body 472 | 473 | def lm_postprocess(self, body, inference): 474 | # The output names of the landmarks model are : 475 | # Identity_1 (1x1) : score (previously output_poseflag) 476 | # Identity_2 (1x128x128x1) (previously output_segmentation) 477 | # Identity_3 (1x64x64x39) (previously output_heatmap) 478 | # Identity_4 (1x117) world 3D landmarks (previously world_3d) 479 | # Identity (1x195) image 3D landmarks (previously ld_3d) 480 | body.lm_score = inference.getLayerFp16("Identity_1")[0] 481 | if body.lm_score > self.lm_score_thresh: 482 | 483 | lm_raw = np.array(inference.getLayerFp16("Identity")).reshape(-1,5) 484 | # Each keypoint have 5 information: 485 | # - X,Y coordinates are local to the body of 486 | # interest and range from [0.0, 255.0]. 487 | # - Z coordinate is measured in "image pixels" like 488 | # the X and Y coordinates and represents the 489 | # distance relative to the plane of the subject's 490 | # hips, which is the origin of the Z axis. Negative 491 | # values are between the hips and the camera; 492 | # positive values are behind the hips. Z coordinate 493 | # scale is similar with X, Y scales but has different 494 | # nature as obtained not via human annotation, by 495 | # fitting synthetic data (GHUM model) to the 2D 496 | # annotation. 497 | # - Visibility, after user-applied sigmoid denotes the 498 | # probability that a keypoint is located within the 499 | # frame and not occluded by another bigger body 500 | # part or another object. 501 | # - Presence, after user-applied sigmoid denotes the 502 | # probability that a keypoint is located within the 503 | # frame. 504 | 505 | # Normalize x,y,z. Scaling in z = scaling in x = 1/self.lm_input_length 506 | lm_raw[:,:3] /= self.lm_input_length 507 | # Apply sigmoid on visibility and presence (if used later) 508 | body.visibility = 1 / (1 + np.exp(-lm_raw[:,3])) 509 | body.presence = 1 / (1 + np.exp(-lm_raw[:,4])) 510 | 511 | # body.norm_landmarks contains the normalized ([0:1]) 3D coordinates of landmarks in the square rotated body bounding box 512 | body.norm_landmarks = lm_raw[:,:3] 513 | # Now calculate body.landmarks = the landmarks in the image coordinate system (in pixel) (body.landmarks) 514 | src = np.array([(0, 0), (1, 0), (1, 1)], dtype=np.float32) 515 | dst = np.array([ (x, y) for x,y in body.rect_points[1:]], dtype=np.float32) # body.rect_points[0] is left bottom point and points going clockwise! 516 | mat = cv2.getAffineTransform(src, dst) 517 | lm_xy = np.expand_dims(body.norm_landmarks[:self.nb_kps+2,:2], axis=0) 518 | lm_xy = np.squeeze(cv2.transform(lm_xy, mat)) 519 | # A segment of length 1 in the coordinates system of body bounding box takes body.rect_w_a pixels in the 520 | # original image. Then we arbitrarily divide by 4 for a more realistic appearance. 521 | lm_z = body.norm_landmarks[:self.nb_kps+2,2:3] * body.rect_w_a / 4 522 | lm_xyz = np.hstack((lm_xy, lm_z)) 523 | 524 | # World landmarks are predicted in meters rather than in pixels of the image 525 | # and have origin in the middle of the hips rather than in the corner of the 526 | # pose image (cropped with given rectangle). Thus only rotation (but not scale 527 | # and translation) is applied to the landmarks to transform them back to 528 | # original coordinates. 
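            # As a minimal illustration of that rotation-only mapping (the angle and the sample
            # point below are made-up values, not model outputs): with r = body.rotation,
            #     rot_m = [[cos(r), sin(r)], [-sin(r), cos(r)]]
            # a world point (x, y) is mapped to (x, y) @ rot_m = (x*cos(r) - y*sin(r), x*sin(r) + y*cos(r)).
            # For example, with r = pi/2 the point (0.2, 0.0) m maps to (0.0, 0.2) m; z is left unchanged.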
529 | body.landmarks_world = np.array(inference.getLayerFp16("Identity_4")).reshape(-1,3)[:self.nb_kps] 530 | sin_rot = sin(body.rotation) 531 | cos_rot = cos(body.rotation) 532 | rot_m = np.array([[cos_rot, sin_rot], [-sin_rot, cos_rot]]) 533 | body.landmarks_world[:,:2] = np.dot(body.landmarks_world[:,:2], rot_m) 534 | 535 | if self.smoothing: 536 | timestamp = now() 537 | object_scale = body.rect_w_a 538 | lm_xyz[:self.nb_kps] = self.filter_landmarks.apply(lm_xyz[:self.nb_kps], timestamp, object_scale) 539 | lm_xyz[self.nb_kps:] = self.filter_landmarks_aux.apply(lm_xyz[self.nb_kps:], timestamp, object_scale) 540 | body.landmarks_world = self.filter_landmarks_world.apply(body.landmarks_world, timestamp) 541 | 542 | body.landmarks = lm_xyz.astype(np.int32) 543 | 544 | # body_from_landmarks will be used to initialize the bounding rotated rectangle in the next frame 545 | # The only information we need are the 2 landmarks 33 and 34 546 | self.body_from_landmarks = mpu.Body(pd_kps=body.landmarks[self.nb_kps:self.nb_kps+2,:2]/self.frame_size) 547 | 548 | # If we added padding to make the image square, we need to remove this padding from landmark coordinates and from rect_points 549 | if self.pad_h > 0: 550 | body.landmarks[:,1] -= self.pad_h 551 | for i in range(len(body.rect_points)): 552 | body.rect_points[i][1] -= self.pad_h 553 | if self.pad_w > 0: 554 | body.landmarks[:,0] -= self.pad_w 555 | for i in range(len(body.rect_points)): 556 | body.rect_points[i][0] -= self.pad_w 557 | 558 | 559 | def next_frame(self): 560 | 561 | self.fps.update() 562 | 563 | if self.input_type == "rgb": 564 | in_video = self.q_video.get() 565 | video_frame = in_video.getCvFrame() 566 | if self.pad_h: 567 | square_frame = cv2.copyMakeBorder(video_frame, self.pad_h, self.pad_h, self.pad_w, self.pad_w, cv2.BORDER_CONSTANT) 568 | else: 569 | square_frame = video_frame 570 | # For debugging 571 | # if not self.crop: 572 | # lb = self.q_lb_out.get() 573 | # if lb: 574 | # lb = lb.getCvFrame() 575 | # cv2.imshow("letterbox", lb) 576 | else: 577 | if self.input_type == "image": 578 | frame = self.img.copy() 579 | else: 580 | ok, frame = self.cap.read() 581 | if not ok: 582 | return None, None 583 | # Cropping and/or padding of the video frame 584 | video_frame = frame[self.crop_h:self.crop_h+self.frame_size, self.crop_w:self.crop_w+self.frame_size] 585 | if self.pad_h or self.pad_w: 586 | square_frame = cv2.copyMakeBorder(video_frame, self.pad_h, self.pad_h, self.pad_w, self.pad_w, cv2.BORDER_CONSTANT) 587 | else: 588 | square_frame = video_frame 589 | 590 | if self.force_detection or not self.use_previous_landmarks: 591 | if self.input_type == "rgb": 592 | self.q_pre_pd_manip_cfg.send(self.cfg_pre_pd) 593 | else: 594 | frame_nn = dai.ImgFrame() 595 | frame_nn.setTimestamp(time.monotonic()) 596 | frame_nn.setWidth(self.pd_input_length) 597 | frame_nn.setHeight(self.pd_input_length) 598 | frame_nn.setData(to_planar(square_frame, (self.pd_input_length, self.pd_input_length))) 599 | pd_rtrip_time = now() 600 | self.q_pd_in.send(frame_nn) 601 | 602 | # Get pose detection 603 | inference = self.q_pd_out.get() 604 | if self.input_type != "rgb" and (self.force_detection or not self.use_previous_landmarks): 605 | pd_rtrip_time = now() - pd_rtrip_time 606 | self.glob_pd_rtrip_time += pd_rtrip_time 607 | body = self.pd_postprocess(inference) 608 | self.nb_pd_inferences += 1 609 | else: 610 | body = self.body_from_landmarks 611 | mpu.detections_to_rect(body) # self.regions.pd_kps are initialized from landmarks on previous frame 
612 | mpu.rect_transformation(body, self.frame_size, self.frame_size, self.rect_transf_scale) 613 | 614 | 615 | # Landmarks 616 | if body: 617 | frame_nn = mpu.warp_rect_img(body.rect_points, square_frame, self.lm_input_length, self.lm_input_length) 618 | frame_nn = frame_nn / 255. 619 | nn_data = dai.NNData() 620 | nn_data.setLayer("input_1", to_planar(frame_nn, (self.lm_input_length, self.lm_input_length))) 621 | lm_rtrip_time = now() 622 | self.q_lm_in.send(nn_data) 623 | 624 | # Get landmarks 625 | inference = self.q_lm_out.get() 626 | lm_rtrip_time = now() - lm_rtrip_time 627 | self.glob_lm_rtrip_time += lm_rtrip_time 628 | self.nb_lm_inferences += 1 629 | self.lm_postprocess(body, inference) 630 | if body.lm_score < self.lm_score_thresh: 631 | body = None 632 | self.use_previous_landmarks = False 633 | if self.smoothing: 634 | self.filter_landmarks.reset() 635 | self.filter_landmarks_aux.reset() 636 | self.filter_landmarks_world.reset() 637 | else: 638 | self.use_previous_landmarks = True 639 | if self.xyz: 640 | self.query_body_xyz(body) 641 | 642 | else: 643 | self.use_previous_landmarks = False 644 | if self.smoothing: 645 | self.filter_landmarks.reset() 646 | self.filter_landmarks_aux.reset() 647 | self.filter_landmarks_world.reset() 648 | if self.xyz: self.filter_xyz.reset() 649 | 650 | return video_frame, body 651 | 652 | 653 | def exit(self): 654 | self.device.close() 655 | # Print some stats 656 | if self.stats: 657 | print(f"FPS : {self.fps.get_global():.1f} f/s (# frames = {self.fps.nbf})") 658 | print(f"# pose detection inferences : {self.nb_pd_inferences}") 659 | print(f"# landmark inferences : {self.nb_lm_inferences}") 660 | if self.input_type != "rgb" and self.nb_pd_inferences != 0: print(f"Pose detection round trip : {self.glob_pd_rtrip_time/self.nb_pd_inferences*1000:.1f} ms") 661 | if self.nb_lm_inferences != 0: print(f"Landmark round trip : {self.glob_lm_rtrip_time/self.nb_lm_inferences*1000:.1f} ms") 662 | 663 | 664 | -------------------------------------------------------------------------------- /BlazeposeDepthaiEdge.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from numpy.core.fromnumeric import trace 4 | import mediapipe_utils as mpu 5 | from pathlib import Path 6 | from FPS import FPS, now 7 | import depthai as dai 8 | import marshal 9 | import sys 10 | from string import Template 11 | from math import sin, cos 12 | 13 | SCRIPT_DIR = Path(__file__).resolve().parent 14 | POSE_DETECTION_MODEL = str(SCRIPT_DIR / "models/pose_detection_sh4.blob") 15 | LANDMARK_MODEL_FULL = str(SCRIPT_DIR / "models/pose_landmark_full_sh4.blob") 16 | LANDMARK_MODEL_HEAVY = str(SCRIPT_DIR / "models/pose_landmark_heavy_sh4.blob") 17 | LANDMARK_MODEL_LITE = str(SCRIPT_DIR / "models/pose_landmark_lite_sh4.blob") 18 | DETECTION_POSTPROCESSING_MODEL = str(SCRIPT_DIR / "custom_models/DetectionBestCandidate_sh1.blob") 19 | DIVIDE_BY_255_MODEL = str(SCRIPT_DIR / "custom_models/DivideBy255_sh1.blob") 20 | TEMPLATE_MANAGER_SCRIPT = str(SCRIPT_DIR / "template_manager_script.py") 21 | 22 | 23 | def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray: 24 | return cv2.resize(arr, shape).transpose(2,0,1).flatten() 25 | 26 | class BlazeposeDepthai: 27 | """ 28 | Blazepose body pose detector 29 | Arguments: 30 | - input_src: frame source, 31 | - "rgb" or None: OAK* internal color camera, 32 | - "rgb_laconic": same as "rgb" but without sending the frames to the host, 33 | Note that as we are in Edge mode, input sources 
coming from the host, like an image or a video, are not supported
    34 |     - pd_model: Blazepose detection model blob file (if None, takes the default value POSE_DETECTION_MODEL),
    35 |     - pd_score_thresh: confidence score to determine whether a detection is reliable (a float between 0 and 1).
    36 |     - pp_model: detection postprocessing model blob file (if None, takes the default value DETECTION_POSTPROCESSING_MODEL),
    37 |     - lm_model: Blazepose landmark model blob file
    38 |             - None or "full": the default blob file LANDMARK_MODEL_FULL,
    39 |             - "lite": the default blob file LANDMARK_MODEL_LITE,
    40 |             - "heavy": the default blob file LANDMARK_MODEL_HEAVY,
    41 |             - a path of a blob file.
    42 |     - lm_score_thresh : confidence score to determine whether landmarks prediction is reliable (a float between 0 and 1).
    43 |     - xyz: boolean, when True get the (x, y, z) coords of the reference point (center of the hips) (if the device supports depth measures).
    44 |     - crop : boolean which indicates if square cropping is done or not
    45 |     - smoothing: boolean which indicates if smoothing filtering is applied
    46 |     - filter_window_size and filter_velocity_scale:
    47 |             The filter keeps track (on a window of specified size) of
    48 |             value changes over time, which as a result gives the velocity of how the value
    49 |             changes over time. With higher velocity it weights new values higher.
    50 |             - higher filter_window_size adds to lag and to stability
    51 |             - lower filter_velocity_scale adds to lag and to stability
    52 | 
    53 |     - internal_fps : when using the internal color camera as input source, set its FPS to this value (calling setFps()).
    54 |     - internal_frame_height : when using the internal color camera, set the frame height (calling setIspScale()).
    55 |             The width is calculated according to the height and depends on the value of 'crop'
    56 |     - stats : boolean, when True, display some statistics when exiting.
    57 |     - trace: boolean, when True print some debug messages
    58 |     - force_detection: boolean, force person detection on every frame (never use landmarks from previous frame to determine ROI)
    59 |     """
    60 |     def __init__(self, input_src="rgb",
    61 |                 pd_model=None,
    62 |                 pd_score_thresh=0.5,
    63 |                 pp_model=None,
    64 |                 lm_model=None,
    65 |                 lm_score_thresh=0.7,
    66 |                 xyz = False,
    67 |                 crop=False,
    68 |                 smoothing= True,
    69 |                 filter_window_size=5,
    70 |                 filter_velocity_scale=10,
    71 |                 stats=False,
    72 |                 internal_fps=None,
    73 |                 internal_frame_height=1080,
    74 |                 trace=False,
    75 |                 force_detection=False):
    76 | 
    77 |         self.pd_model = pd_model if pd_model else POSE_DETECTION_MODEL
    78 |         self.pp_model = pp_model if pp_model else DETECTION_POSTPROCESSING_MODEL
    79 |         self.divide_by_255_model = DIVIDE_BY_255_MODEL
    80 |         print(f"Pose detection blob file : {self.pd_model}")
    81 |         self.rect_transf_scale = 1.25
    82 |         if lm_model is None or lm_model == "full":
    83 |             self.lm_model = LANDMARK_MODEL_FULL
    84 |         elif lm_model == "lite":
    85 |             self.lm_model = LANDMARK_MODEL_LITE
    86 |         elif lm_model == "heavy":
    87 |             self.lm_model = LANDMARK_MODEL_HEAVY
    88 |         else:
    89 |             self.lm_model = lm_model
    90 |         print(f"Landmarks using blob file : {self.lm_model}")
    91 | 
    92 |         self.pd_score_thresh = pd_score_thresh
    93 |         self.lm_score_thresh = lm_score_thresh
    94 |         self.smoothing = smoothing
    95 |         self.crop = crop
    96 |         self.internal_fps = internal_fps
    97 |         self.stats = stats
    98 |         self.presence_threshold = 0.5
    99 |         self.visibility_threshold = 0.5
    100 | 
    101 |         self.trace = trace
    102 |         self.force_detection = force_detection
    103 | 
    104 |         self.device = dai.Device()
    105 |         self.xyz = False
    106 | 
    107 |         if input_src == None or input_src == "rgb" or input_src == "rgb_laconic":
    108 |             self.input_type = "rgb" # OAK* internal color camera
    109 |             self.laconic = input_src == "rgb_laconic" # Camera frames are not sent to the host
    110 |             if xyz:
    111 |                 # Check if the device supports stereo
    112 |                 cameras = self.device.getConnectedCameras()
    113 |                 if dai.CameraBoardSocket.LEFT in cameras and dai.CameraBoardSocket.RIGHT in cameras:
    114 |                     self.xyz = True
    115 |                 else:
    116 |                     print("Warning: depth unavailable on this device, 'xyz' argument is ignored")
    117 | 
    118 |             if internal_fps is None:
    119 |                 if "full" in str(self.lm_model):
    120 |                     self.internal_fps = 18 if self.xyz else 20
    121 |                 elif "heavy" in str(lm_model):
    122 |                     self.internal_fps = 7 if self.xyz else 8
    123 |                 else: # "lite"
    124 |                     self.internal_fps = 22 if self.xyz else 26
    125 |             else:
    126 |                 self.internal_fps = internal_fps
    127 |             print(f"Internal camera FPS set to: {self.internal_fps}")
    128 | 
    129 |             self.video_fps = self.internal_fps # Used when saving the output in a video file.
Should be close to the real fps 130 | 131 | if self.crop: 132 | self.frame_size, self.scale_nd = mpu.find_isp_scale_params(internal_frame_height) 133 | self.img_h = self.img_w = self.frame_size 134 | self.pad_w = self.pad_h = 0 135 | self.crop_w = (int(round(1920 * self.scale_nd[0] / self.scale_nd[1])) - self.img_w) // 2 136 | 137 | else: 138 | width, self.scale_nd = mpu.find_isp_scale_params(internal_frame_height * 1920 / 1080, is_height=False) 139 | self.img_h = int(round(1080 * self.scale_nd[0] / self.scale_nd[1])) 140 | self.img_w = int(round(1920 * self.scale_nd[0] / self.scale_nd[1])) 141 | self.pad_h = (self.img_w - self.img_h) // 2 142 | self.pad_w = 0 143 | self.frame_size = self.img_w 144 | self.crop_w = 0 145 | 146 | print(f"Internal camera image size: {self.img_w} x {self.img_h} - pad_h: {self.pad_h}") 147 | 148 | else: 149 | print("Invalid input source:", input_src) 150 | sys.exit() 151 | 152 | self.nb_kps = 33 153 | 154 | if self.smoothing: 155 | self.filter_landmarks = mpu.LandmarksSmoothingFilter( 156 | frequency=self.video_fps, 157 | min_cutoff=0.05, 158 | beta=80, 159 | derivate_cutoff=1 160 | ) 161 | # landmarks_aux corresponds to the 2 landmarks used to compute the ROI in next frame 162 | self.filter_landmarks_aux = mpu.LandmarksSmoothingFilter( 163 | frequency=self.video_fps, 164 | min_cutoff=0.01, 165 | beta=10, 166 | derivate_cutoff=1 167 | ) 168 | self.filter_landmarks_world = mpu.LandmarksSmoothingFilter( 169 | frequency=self.video_fps, 170 | min_cutoff=0.1, 171 | beta=40, 172 | derivate_cutoff=1, 173 | disable_value_scaling=True 174 | ) 175 | if self.xyz: 176 | self.filter_xyz = mpu.LowPassFilter(alpha=0.25) 177 | 178 | # Define and start pipeline 179 | usb_speed = self.device.getUsbSpeed() 180 | self.device.startPipeline(self.create_pipeline()) 181 | print(f"Pipeline started - USB speed: {str(usb_speed).split('.')[-1]}") 182 | 183 | # Define data queues 184 | if not self.laconic: 185 | self.q_video = self.device.getOutputQueue(name="cam_out", maxSize=1, blocking=False) 186 | self.q_manager_out = self.device.getOutputQueue(name="manager_out", maxSize=1, blocking=False) 187 | # For debugging 188 | # self.q_pre_pd_manip_out = self.device.getOutputQueue(name="pre_pd_manip_out", maxSize=1, blocking=False) 189 | # self.q_pre_lm_manip_out = self.device.getOutputQueue(name="pre_lm_manip_out", maxSize=1, blocking=False) 190 | 191 | self.fps = FPS() 192 | 193 | self.nb_pd_inferences = 0 194 | self.nb_lm_inferences = 0 195 | self.nb_lm_inferences_after_landmarks_ROI = 0 196 | self.nb_frames_no_body = 0 197 | 198 | def create_pipeline(self): 199 | print("Creating pipeline...") 200 | # Start defining a pipeline 201 | pipeline = dai.Pipeline() 202 | pipeline.setOpenVINOVersion(dai.OpenVINO.Version.VERSION_2021_4) 203 | self.pd_input_length = 224 204 | self.lm_input_length = 256 205 | 206 | # ColorCamera 207 | print("Creating Color Camera...") 208 | cam = pipeline.create(dai.node.ColorCamera) 209 | cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P) 210 | cam.setInterleaved(False) 211 | cam.setIspScale(self.scale_nd[0], self.scale_nd[1]) 212 | cam.setFps(self.internal_fps) 213 | cam.setBoardSocket(dai.CameraBoardSocket.RGB) 214 | 215 | if self.crop: 216 | cam.setVideoSize(self.frame_size, self.frame_size) 217 | cam.setPreviewSize(self.frame_size, self.frame_size) 218 | else: 219 | cam.setVideoSize(self.img_w, self.img_h) 220 | cam.setPreviewSize(self.img_w, self.img_h) 221 | 222 | if not self.laconic: 223 | cam_out = pipeline.create(dai.node.XLinkOut) 224 | 
cam_out.setStreamName("cam_out") 225 | cam_out.input.setQueueSize(1) 226 | cam_out.input.setBlocking(False) 227 | cam.video.link(cam_out.input) 228 | 229 | 230 | # Define manager script node 231 | manager_script = pipeline.create(dai.node.Script) 232 | manager_script.setScript(self.build_manager_script()) 233 | 234 | if self.xyz: 235 | print("Creating MonoCameras, Stereo and SpatialLocationCalculator nodes...") 236 | # For now, RGB needs fixed focus to properly align with depth. 237 | # The value used during calibration should be used here 238 | calib_data = self.device.readCalibration() 239 | calib_lens_pos = calib_data.getLensPosition(dai.CameraBoardSocket.RGB) 240 | print(f"RGB calibration lens position: {calib_lens_pos}") 241 | cam.initialControl.setManualFocus(calib_lens_pos) 242 | 243 | mono_resolution = dai.MonoCameraProperties.SensorResolution.THE_400_P 244 | left = pipeline.createMonoCamera() 245 | left.setBoardSocket(dai.CameraBoardSocket.LEFT) 246 | left.setResolution(mono_resolution) 247 | left.setFps(self.internal_fps) 248 | 249 | right = pipeline.createMonoCamera() 250 | right.setBoardSocket(dai.CameraBoardSocket.RIGHT) 251 | right.setResolution(mono_resolution) 252 | right.setFps(self.internal_fps) 253 | 254 | stereo = pipeline.createStereoDepth() 255 | stereo.setConfidenceThreshold(230) 256 | # LR-check is required for depth alignment 257 | stereo.setLeftRightCheck(True) 258 | stereo.setDepthAlign(dai.CameraBoardSocket.RGB) 259 | stereo.setSubpixel(False) # subpixel True -> latency 260 | # MEDIAN_OFF necessary in depthai 2.7.2. 261 | # Otherwise : [critical] Fatal error. Please report to developers. Log: 'StereoSipp' '533' 262 | # stereo.setMedianFilter(dai.StereoDepthProperties.MedianFilter.MEDIAN_OFF) 263 | 264 | spatial_location_calculator = pipeline.createSpatialLocationCalculator() 265 | spatial_location_calculator.setWaitForConfigInput(True) 266 | spatial_location_calculator.inputDepth.setBlocking(False) 267 | spatial_location_calculator.inputDepth.setQueueSize(1) 268 | 269 | left.out.link(stereo.left) 270 | right.out.link(stereo.right) 271 | 272 | stereo.depth.link(spatial_location_calculator.inputDepth) 273 | 274 | manager_script.outputs['spatial_location_config'].link(spatial_location_calculator.inputConfig) 275 | spatial_location_calculator.out.link(manager_script.inputs['spatial_data']) 276 | 277 | # Define pose detection pre processing (resize preview to (self.pd_input_length, self.pd_input_length)) 278 | print("Creating Pose Detection pre processing image manip...") 279 | pre_pd_manip = pipeline.create(dai.node.ImageManip) 280 | pre_pd_manip.setMaxOutputFrameSize(self.pd_input_length*self.pd_input_length*3) 281 | pre_pd_manip.setWaitForConfigInput(True) 282 | pre_pd_manip.inputImage.setQueueSize(1) 283 | pre_pd_manip.inputImage.setBlocking(False) 284 | cam.preview.link(pre_pd_manip.inputImage) 285 | manager_script.outputs['pre_pd_manip_cfg'].link(pre_pd_manip.inputConfig) 286 | 287 | # For debugging 288 | # pre_pd_manip_out = pipeline.createXLinkOut() 289 | # pre_pd_manip_out.setStreamName("pre_pd_manip_out") 290 | # pre_pd_manip.out.link(pre_pd_manip_out.input) 291 | 292 | # Define pose detection model 293 | print("Creating Pose Detection Neural Network...") 294 | pd_nn = pipeline.create(dai.node.NeuralNetwork) 295 | pd_nn.setBlobPath(self.pd_model) 296 | # Increase threads for detection 297 | # pd_nn.setNumInferenceThreads(2) 298 | pre_pd_manip.out.link(pd_nn.input) 299 | 300 | # Define pose detection post processing "model" 301 | print("Creating Pose 
Detection post processing Neural Network...") 302 | post_pd_nn = pipeline.create(dai.node.NeuralNetwork) 303 | post_pd_nn.setBlobPath(self.pp_model) 304 | pd_nn.out.link(post_pd_nn.input) 305 | post_pd_nn.out.link(manager_script.inputs['from_post_pd_nn']) 306 | 307 | # Define link to send result to host 308 | manager_out = pipeline.create(dai.node.XLinkOut) 309 | manager_out.setStreamName("manager_out") 310 | manager_script.outputs['host'].link(manager_out.input) 311 | 312 | # Define landmark pre processing image manip 313 | print("Creating Landmark pre processing image manip...") 314 | pre_lm_manip = pipeline.create(dai.node.ImageManip) 315 | pre_lm_manip.setMaxOutputFrameSize(self.lm_input_length*self.lm_input_length*3) 316 | pre_lm_manip.setWaitForConfigInput(True) 317 | pre_lm_manip.inputImage.setQueueSize(1) 318 | pre_lm_manip.inputImage.setBlocking(False) 319 | cam.preview.link(pre_lm_manip.inputImage) 320 | 321 | # For debugging 322 | # pre_lm_manip_out = pipeline.createXLinkOut() 323 | # pre_lm_manip_out.setStreamName("pre_lm_manip_out") 324 | # pre_lm_manip.out.link(pre_lm_manip_out.input) 325 | 326 | manager_script.outputs['pre_lm_manip_cfg'].link(pre_lm_manip.inputConfig) 327 | 328 | # Define normalization model between ImageManip and landmark model 329 | # This is a temporary step. Could be removed when support of setFrameType(RGBF16F16F16p) in ImageManip node 330 | print("Creating DiveideBy255 Neural Network...") 331 | divide_nn = pipeline.create(dai.node.NeuralNetwork) 332 | divide_nn.setBlobPath(self.divide_by_255_model) 333 | pre_lm_manip.out.link(divide_nn.input) 334 | 335 | # Define landmark model 336 | print("Creating Landmark Neural Network...") 337 | lm_nn = pipeline.create(dai.node.NeuralNetwork) 338 | lm_nn.setBlobPath(self.lm_model) 339 | # lm_nn.setNumInferenceThreads(1) 340 | 341 | divide_nn.out.link(lm_nn.input) 342 | lm_nn.out.link(manager_script.inputs['from_lm_nn']) 343 | 344 | print("Pipeline created.") 345 | 346 | return pipeline 347 | 348 | def build_manager_script(self): 349 | ''' 350 | The code of the scripting node 'manager_script' depends on : 351 | - the NN model (full, lite, 831), 352 | - the score threshold, 353 | - the video frame shape 354 | So we build this code from the content of the file template_manager_script.py which is a python template 355 | ''' 356 | # Read the template 357 | with open(TEMPLATE_MANAGER_SCRIPT, 'r') as file: 358 | template = Template(file.read()) 359 | 360 | # Perform the substitution 361 | code = template.substitute( 362 | _TRACE = "node.warn" if self.trace else "#", 363 | _pd_score_thresh = self.pd_score_thresh, 364 | _lm_score_thresh = self.lm_score_thresh, 365 | _force_detection = self.force_detection, 366 | _pad_h = self.pad_h, 367 | _img_h = self.img_h, 368 | _img_w = self.img_w, 369 | _frame_size = self.frame_size, 370 | _crop_w = self.crop_w, 371 | _rect_transf_scale = self.rect_transf_scale, 372 | _IF_XYZ = "" if self.xyz else '"""', 373 | _buffer_size = 2910 if self.xyz else 2863, 374 | _visibility_threshold = self.visibility_threshold 375 | ) 376 | # Remove comments and empty lines 377 | import re 378 | code = re.sub(r'"{3}.*?"{3}', '', code, flags=re.DOTALL) 379 | code = re.sub(r'#.*', '', code) 380 | code = re.sub('\n\s*\n', '\n', code) 381 | # For debugging 382 | if self.trace: 383 | with open("tmp_code.py", "w") as file: 384 | file.write(code) 385 | 386 | return code 387 | 388 | def is_present(self, body, lm_id): 389 | return body.presence[lm_id] > self.presence_threshold 390 | 391 | def lm_postprocess(self, 
body, lms, lms_world): 392 | # lms : landmarks sent by Manager script node to host (list of 39*5 elements for full body or 31*5 for upper body) 393 | lm_raw = np.array(lms).reshape(-1,5) 394 | # Each keypoint have 5 information: 395 | # - X,Y coordinates are local to the body of 396 | # interest and range from [0.0, 255.0]. 397 | # - Z coordinate is measured in "image pixels" like 398 | # the X and Y coordinates and represents the 399 | # distance relative to the plane of the subject's 400 | # hips, which is the origin of the Z axis. Negative 401 | # values are between the hips and the camera; 402 | # positive values are behind the hips. Z coordinate 403 | # scale is similar with X, Y scales but has different 404 | # nature as obtained not via human annotation, by 405 | # fitting synthetic data (GHUM model) to the 2D 406 | # annotation. 407 | # - Visibility, after user-applied sigmoid denotes the 408 | # probability that a keypoint is located within the 409 | # frame and not occluded by another bigger body 410 | # part or another object. 411 | # - Presence, after user-applied sigmoid denotes the 412 | # probability that a keypoint is located within the 413 | # frame. 414 | 415 | # Normalize x,y,z. Scaling in z = scaling in x = 1/self.lm_input_length 416 | lm_raw[:,:3] /= self.lm_input_length 417 | # Apply sigmoid on visibility and presence (if used later) 418 | body.visibility = 1 / (1 + np.exp(-lm_raw[:,3])) 419 | body.presence = 1 / (1 + np.exp(-lm_raw[:,4])) 420 | 421 | # body.norm_landmarks contains the normalized ([0:1]) 3D coordinates of landmarks in the square rotated body bounding box 422 | body.norm_landmarks = lm_raw[:,:3] 423 | # Now calculate body.landmarks = the landmarks in the image coordinate system (in pixel) (body.landmarks) 424 | src = np.array([(0, 0), (1, 0), (1, 1)], dtype=np.float32) 425 | dst = np.array([ (x, y) for x,y in body.rect_points[1:]], dtype=np.float32) # body.rect_points[0] is left bottom point and points going clockwise! 426 | mat = cv2.getAffineTransform(src, dst) 427 | lm_xy = np.expand_dims(body.norm_landmarks[:self.nb_kps,:2], axis=0) 428 | lm_xy = np.squeeze(cv2.transform(lm_xy, mat)) 429 | 430 | # A segment of length 1 in the coordinates system of body bounding box takes body.rect_w_a pixels in the 431 | # original image. Then we arbitrarily divide by 4 for a more realistic appearance. 432 | lm_z = body.norm_landmarks[:self.nb_kps,2:3] * body.rect_w_a / 4 433 | lm_xyz = np.hstack((lm_xy, lm_z)) 434 | 435 | # World landmarks are predicted in meters rather than in pixels of the image 436 | # and have origin in the middle of the hips rather than in the corner of the 437 | # pose image (cropped with given rectangle). Thus only rotation (but not scale 438 | # and translation) is applied to the landmarks to transform them back to 439 | # original coordinates. 
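        # Since these landmarks are expressed in meters, metric quantities can be read off
        # directly once the rotation below has been applied. A small usage sketch (illustrative
        # only; it relies on mpu.KEYPOINT_DICT from mediapipe_utils.py):
        #     l_sh = body.landmarks_world[mpu.KEYPOINT_DICT['left_shoulder']]
        #     r_sh = body.landmarks_world[mpu.KEYPOINT_DICT['right_shoulder']]
        #     shoulder_width_m = np.linalg.norm(l_sh - r_sh)  # approximate shoulder width in meters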
440 | body.landmarks_world = np.array(lms_world).reshape(-1,3) 441 | sin_rot = sin(body.rotation) 442 | cos_rot = cos(body.rotation) 443 | rot_m = np.array([[cos_rot, sin_rot], [-sin_rot, cos_rot]]) 444 | body.landmarks_world[:,:2] = np.dot(body.landmarks_world[:,:2], rot_m) 445 | 446 | if self.smoothing: 447 | timestamp = now() 448 | object_scale = body.rect_w_a 449 | lm_xyz[:self.nb_kps] = self.filter_landmarks.apply(lm_xyz[:self.nb_kps], timestamp, object_scale) 450 | lm_xyz[self.nb_kps:] = self.filter_landmarks_aux.apply(lm_xyz[self.nb_kps:], timestamp, object_scale) 451 | body.landmarks_world = self.filter_landmarks_world.apply(body.landmarks_world, timestamp) 452 | 453 | body.landmarks = lm_xyz.astype(np.int32) 454 | # If we added padding to make the image square, we need to remove this padding from landmark coordinates and from rect_points 455 | if self.pad_h > 0: 456 | body.landmarks[:,1] -= self.pad_h 457 | for i in range(len(body.rect_points)): 458 | body.rect_points[i][1] -= self.pad_h 459 | # if self.pad_w > 0: 460 | # body.landmarks[:,0] -= self.pad_w 461 | # for i in range(len(body.rect_points)): 462 | # body.rect_points[i][0] -= self.pad_w 463 | 464 | 465 | def next_frame(self): 466 | 467 | self.fps.update() 468 | 469 | if self.laconic: 470 | video_frame = np.zeros((self.frame_size, self.frame_size, 3), dtype=np.uint8) 471 | else: 472 | in_video = self.q_video.get() 473 | video_frame = in_video.getCvFrame() 474 | 475 | # For debugging 476 | # pre_pd_manip = self.q_pre_pd_manip_out.tryGet() 477 | # if pre_pd_manip: 478 | # pre_pd_manip = pre_pd_manip.getCvFrame() 479 | # cv2.imshow("pre_pd_manip", pre_pd_manip) 480 | # pre_lm_manip = self.q_pre_lm_manip_out.tryGet() 481 | # if pre_lm_manip: 482 | # pre_lm_manip = pre_lm_manip.getCvFrame() 483 | # cv2.imshow("pre_lm_manip", pre_lm_manip) 484 | 485 | # Get result from device 486 | res = marshal.loads(self.q_manager_out.get().getData()) 487 | if res["type"] != 0 and res["lm_score"] > self.lm_score_thresh: 488 | body = mpu.Body() 489 | body.rect_x_center_a = res["rect_center_x"] * self.frame_size 490 | body.rect_y_center_a = res["rect_center_y"] * self.frame_size 491 | body.rect_w_a = body.rect_h_a = res["rect_size"] * self.frame_size 492 | body.rotation = res["rotation"] 493 | body.rect_points = mpu.rotated_rect_to_points(body.rect_x_center_a, body.rect_y_center_a, body.rect_w_a, body.rect_h_a, body.rotation) 494 | body.lm_score = res["lm_score"] 495 | self.lm_postprocess(body, res['lms'], res['lms_world']) 496 | if self.xyz: 497 | if res['xyz_ref'] == 0: 498 | body.xyz_ref = None 499 | else: 500 | if res['xyz_ref'] == 1: 501 | body.xyz_ref = "mid_hips" 502 | else: # res['xyz_ref'] == 2: 503 | body.xyz_ref = "mid_shoulders" 504 | body.xyz = np.array(res["xyz"]) 505 | if self.smoothing: 506 | body.xyz = self.filter_xyz.apply(body.xyz) 507 | body.xyz_zone = np.array(res["xyz_zone"]) 508 | body.xyz_ref_coords_pixel = np.mean(body.xyz_zone.reshape((2,2)), axis=0) 509 | 510 | 511 | else: 512 | body = None 513 | if self.smoothing: 514 | self.filter_landmarks.reset() 515 | self.filter_landmarks_aux.reset() 516 | self.filter_landmarks_world.reset() 517 | if self.xyz: self.filter_xyz.reset() 518 | 519 | # Statistics 520 | if self.stats: 521 | if res["type"] == 0: 522 | self.nb_pd_inferences += 1 523 | self.nb_frames_no_body += 1 524 | else: 525 | self.nb_lm_inferences += 1 526 | if res["type"] == 1: 527 | self.nb_pd_inferences += 1 528 | else: # res["type"] == 2 529 | self.nb_lm_inferences_after_landmarks_ROI += 1 530 | if 
res["lm_score"] < self.lm_score_thresh: self.nb_frames_no_body += 1 531 | 532 | return video_frame, body 533 | 534 | 535 | def exit(self): 536 | self.device.close() 537 | # Print some stats 538 | if self.stats: 539 | print(f"FPS : {self.fps.get_global():.1f} f/s (# frames = {self.fps.nbf})") 540 | print(f"# frames without body : {self.nb_frames_no_body}") 541 | print(f"# pose detection inferences : {self.nb_pd_inferences}") 542 | print(f"# landmark inferences : {self.nb_lm_inferences} - # after pose detection: {self.nb_lm_inferences - self.nb_lm_inferences_after_landmarks_ROI} - # after landmarks ROI prediction: {self.nb_lm_inferences_after_landmarks_ROI}") 543 | 544 | 545 | 546 | 547 | -------------------------------------------------------------------------------- /BlazeposeRenderer.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from o3d_utils import Visu3D 4 | import mediapipe_utils as mpu 5 | 6 | 7 | 8 | # LINE_BODY and COLORS_BODY are used when drawing the skeleton in 3D. 9 | rgb = {"right":(0,1,0), "left":(1,0,0), "middle":(1,1,0)} 10 | LINES_BODY = [[9,10],[4,6],[1,3], 11 | [12,14],[14,16],[16,20],[20,18],[18,16], 12 | [12,11],[11,23],[23,24],[24,12], 13 | [11,13],[13,15],[15,19],[19,17],[17,15], 14 | [24,26],[26,28],[32,30], 15 | [23,25],[25,27],[29,31]] 16 | 17 | COLORS_BODY = ["middle","right","left", 18 | "right","right","right","right","right", 19 | "middle","middle","middle","middle", 20 | "left","left","left","left","left", 21 | "right","right","right","left","left","left"] 22 | COLORS_BODY = [rgb[x] for x in COLORS_BODY] 23 | 24 | 25 | 26 | 27 | class BlazeposeRenderer: 28 | def __init__(self, 29 | tracker, 30 | show_3d=None, 31 | output=None): 32 | self.tracker = tracker 33 | self.show_3d = show_3d 34 | self.fram = None 35 | self.pause = False 36 | 37 | # Rendering flags 38 | self.show_rot_rect = False 39 | self.show_landmarks = True 40 | self.show_score = False 41 | self.show_fps = True 42 | 43 | self.show_xyz_zone = self.show_xyz = self.tracker.xyz 44 | 45 | if self.show_3d == "mixed" and not self.tracker.xyz: 46 | print("'mixed' 3d visualization needs the tracker to be in 'xyz' mode !") 47 | print("3d visualization falling back to 'world' mode.") 48 | self.show_3d = 'world' 49 | if self.show_3d == "image": 50 | self.vis3d = Visu3D(zoom=0.7, segment_radius=3) 51 | z = min(tracker.img_h, tracker.img_w)/3 52 | self.vis3d.create_grid([0,tracker.img_h,-z],[tracker.img_w,tracker.img_h,-z],[tracker.img_w,tracker.img_h,z],[0,tracker.img_h,z],5,2) # Floor 53 | self.vis3d.create_grid([0,0,z],[tracker.img_w,0,z],[tracker.img_w,tracker.img_h,z],[0,tracker.img_h,z],5,2) # Wall 54 | self.vis3d.init_view() 55 | elif self.show_3d == "world": 56 | self.vis3d = Visu3D(bg_color=(0.2, 0.2, 0.2), zoom=1.1, segment_radius=0.01) 57 | self.vis3d.create_grid([-1,1,-1],[1,1,-1],[1,1,1],[-1,1,1],2,2) # Floor 58 | self.vis3d.create_grid([-1,1,1],[1,1,1],[1,-1,1],[-1,-1,1],2,2) # Wall 59 | self.vis3d.init_view() 60 | elif self.show_3d == "mixed": 61 | self.vis3d = Visu3D(bg_color=(0.4, 0.4, 0.4), zoom=0.7, segment_radius=0.01) 62 | half_length = 3 63 | grid_depth = 5 64 | self.vis3d.create_grid([-half_length,1,0],[half_length,1,0],[half_length,1,grid_depth],[-half_length,1,grid_depth],2*half_length,grid_depth) # Floor 65 | self.vis3d.create_grid([-half_length,1,grid_depth],[half_length,1,grid_depth],[half_length,-1,grid_depth],[-half_length,-1,grid_depth],2*half_length,2) # Wall 66 | self.vis3d.create_camera() 67 | 
self.vis3d.init_view() 68 | 69 | if output is None: 70 | self.output = None 71 | else: 72 | fourcc = cv2.VideoWriter_fourcc(*"MJPG") 73 | self.output = cv2.VideoWriter(output,fourcc,tracker.video_fps,(tracker.img_w, tracker.img_h)) 74 | 75 | def is_present(self, body, lm_id): 76 | return body.presence[lm_id] > self.tracker.presence_threshold 77 | 78 | def draw_landmarks(self, body): 79 | if self.show_rot_rect: 80 | cv2.polylines(self.frame, [np.array(body.rect_points)], True, (0,255,255), 2, cv2.LINE_AA) 81 | if self.show_landmarks: 82 | list_connections = LINES_BODY 83 | lines = [np.array([body.landmarks[point,:2] for point in line]) for line in list_connections if self.is_present(body, line[0]) and self.is_present(body, line[1])] 84 | cv2.polylines(self.frame, lines, False, (255, 180, 90), 2, cv2.LINE_AA) 85 | 86 | # for i,x_y in enumerate(body.landmarks_padded[:,:2]): 87 | for i,x_y in enumerate(body.landmarks[:self.tracker.nb_kps,:2]): 88 | if self.is_present(body, i): 89 | if i > 10: 90 | color = (0,255,0) if i%2==0 else (0,0,255) 91 | elif i == 0: 92 | color = (0,255,255) 93 | elif i in [4,5,6,8,10]: 94 | color = (0,255,0) 95 | else: 96 | color = (0,0,255) 97 | cv2.circle(self.frame, (x_y[0], x_y[1]), 4, color, -11) 98 | if self.show_score: 99 | h, w = self.frame.shape[:2] 100 | cv2.putText(self.frame, f"Landmark score: {body.lm_score:.2f}", 101 | (20, h-60), 102 | cv2.FONT_HERSHEY_PLAIN, 2, (255,255,0), 2) 103 | 104 | if self.show_xyz and body.xyz_ref: 105 | x0, y0 = body.xyz_ref_coords_pixel.astype(np.int32) 106 | x0 -= 50 107 | y0 += 40 108 | cv2.rectangle(self.frame, (x0,y0), (x0+100, y0+85), (220,220,240), -1) 109 | cv2.putText(self.frame, f"X:{body.xyz[0]/10:3.0f} cm", (x0+10, y0+20), cv2.FONT_HERSHEY_PLAIN, 1, (20,180,0), 2) 110 | cv2.putText(self.frame, f"Y:{body.xyz[1]/10:3.0f} cm", (x0+10, y0+45), cv2.FONT_HERSHEY_PLAIN, 1, (255,0,0), 2) 111 | cv2.putText(self.frame, f"Z:{body.xyz[2]/10:3.0f} cm", (x0+10, y0+70), cv2.FONT_HERSHEY_PLAIN, 1, (0,0,255), 2) 112 | if self.show_xyz_zone and body.xyz_ref: 113 | # Show zone on which the spatial data were calculated 114 | cv2.rectangle(self.frame, tuple(body.xyz_zone[0:2]), tuple(body.xyz_zone[2:4]), (180,0,180), 2) 115 | 116 | def draw_3d(self, body): 117 | self.vis3d.clear() 118 | self.vis3d.try_move() 119 | self.vis3d.add_geometries() 120 | if body is not None: 121 | points = body.landmarks if self.show_3d == "image" else body.landmarks_world 122 | draw_skeleton = True 123 | if self.show_3d == "mixed": 124 | if body.xyz_ref: 125 | """ 126 | Beware, the y value of landmarks_world coordinates is negative for landmarks 127 | above the mid hips (like shoulders) and negative for landmarks below (like feet). 128 | The y value of (x,y,z) coordinates given by depth sensor is negative in the lower part 129 | of the image and positive in the upper part. 
130 | """ 131 | translation = body.xyz / 1000 132 | translation[1] = -translation[1] 133 | if body.xyz_ref == "mid_hips": 134 | points = points + translation 135 | elif body.xyz_ref == "mid_shoulders": 136 | mid_hips_to_mid_shoulders = np.mean([ 137 | points[mpu.KEYPOINT_DICT['right_shoulder']], 138 | points[mpu.KEYPOINT_DICT['left_shoulder']]], 139 | axis=0) 140 | points = points + translation - mid_hips_to_mid_shoulders 141 | else: 142 | draw_skeleton = False 143 | if draw_skeleton: 144 | lines = LINES_BODY 145 | colors = COLORS_BODY 146 | for i,a_b in enumerate(lines): 147 | a, b = a_b 148 | if self.is_present(body, a) and self.is_present(body, b): 149 | self.vis3d.add_segment(points[a], points[b], color=colors[i]) 150 | self.vis3d.render() 151 | 152 | 153 | def draw(self, frame, body): 154 | if not self.pause: 155 | self.frame = frame 156 | if body: 157 | self.draw_landmarks(body) 158 | self.body = body 159 | elif self.frame is None: 160 | self.frame = frame 161 | self.body = None 162 | # else: self.frame points to previous frame 163 | if self.show_3d: 164 | self.draw_3d(self.body) 165 | return self.frame 166 | 167 | def exit(self): 168 | if self.output: 169 | self.output.release() 170 | 171 | def waitKey(self, delay=1): 172 | if self.show_fps: 173 | self.tracker.fps.draw(self.frame, orig=(50,50), size=1, color=(240,180,100)) 174 | cv2.imshow("Blazepose", self.frame) 175 | if self.output: 176 | self.output.write(self.frame) 177 | key = cv2.waitKey(delay) 178 | if key == 32: 179 | # Pause on space bar 180 | self.pause = not self.pause 181 | elif key == ord('r'): 182 | self.show_rot_rect = not self.show_rot_rect 183 | elif key == ord('l'): 184 | self.show_landmarks = not self.show_landmarks 185 | elif key == ord('s'): 186 | self.show_score = not self.show_score 187 | elif key == ord('f'): 188 | self.show_fps = not self.show_fps 189 | elif key == ord('x'): 190 | if self.tracker.xyz: 191 | self.show_xyz = not self.show_xyz 192 | elif key == ord('z'): 193 | if self.tracker.xyz: 194 | self.show_xyz_zone = not self.show_xyz_zone 195 | return key 196 | 197 | -------------------------------------------------------------------------------- /FPS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Sep 22 15:29:32 2017 4 | 5 | @author: geaxx 6 | """ 7 | import time 8 | import cv2 9 | 10 | def now(): 11 | return time.perf_counter() 12 | 13 | class FPS: # To measure the number of frame per second 14 | def __init__(self, mean_nb_frames=10): 15 | self.nbf = -1 16 | self.fps = 0 17 | self.start = 0 18 | self.stop = 0 19 | self.local_start = 0 20 | self.mean_nb_frames = mean_nb_frames 21 | 22 | def update(self): 23 | if self.nbf%self.mean_nb_frames == 0: 24 | if self.start != 0: 25 | self.stop = now() 26 | self.fps = self.mean_nb_frames/(self.stop-self.local_start) 27 | self.local_start = self.stop 28 | else : 29 | self.start = self.local_start = now() 30 | self.nbf+=1 31 | 32 | def get(self): 33 | return self.fps 34 | 35 | def get_global(self): 36 | if self.stop == 0: self.stop = now() 37 | return self.nbf/(self.stop-self.start) 38 | 39 | def draw(self, win, orig=(10,30), font=cv2.FONT_HERSHEY_SIMPLEX, size=2, color=(0,255,0), thickness=2): 40 | cv2.putText(win,f"FPS={self.get():.2f}",orig,font,size,color,thickness) 41 | 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 
| 3 | Copyright (c) [2021] [geaxgx] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blazepose tracking with DepthAI 2 | 3 | Running Google Mediapipe single body pose tracking models on [DepthAI](https://docs.luxonis.com/en/gen2/) hardware (OAK-1, OAK-D, ...). 4 | 5 | The Blazepose landmark models available in this repository are the version "full", "lite" and "heavy" of mediapipe 0.8.6 (2021/07), 6 | 7 | The pose detection model comes from mediapipe 0.8.4 and is compatible with the 3 landmark models (the 0.8.6 version currently cannot be converted into a Myriad blob). 8 | 9 |

10 | 11 | For the challenger Movenet on DepthAI, please visit : [depthai_movenet](https://github.com/geaxgx/depthai_movenet) 12 | 13 | For an OpenVINO version of Blazepose, please visit : [openvino_blazepose](https://github.com/geaxgx/openvino_blazepose) 14 | 15 | 16 | - [Blazepose tracking with DepthAI](#blazepose-tracking-with-depthai) 17 | - [Architecture: Host mode vs Edge mode](#architecture-host-mode-vs-edge-mode) 18 | - [Inferred 3D vs Measured 3D](#inferred-3d-vs-measured-3d) 19 | - [Install](#install) 20 | - [Run](#run) 21 | - [Mediapipe models](#mediapipe-models) 22 | - [Custom models](#custom-models) 23 | - [Landmarks](#landmarks) 24 | - [Code](#code) 25 | - [Examples](#examples) 26 | - [Credits](#credits) 27 | 28 | ## Architecture: Host mode vs Edge mode 29 | Two modes are available: 30 | - **Host mode :** aside from the neural networks that run on the device, almost all the processing is run on the host (the only processing done on the device is the letterboxing operation before the pose detection network when using the device camera as video source). **Use this mode when you want to infer on an external input source (videos, images).** 31 | - **Edge mode :** most of the processing (neural networks, post-processing, image manipulations) is run on the device thanks to the depthai scripting node feature. It works only with the device camera but is **definitely the best option when working with the internal camera** (much faster than in Host mode). The data exchanged between the host and the device is minimal: the landmarks of the detected body (~3kB/frame). In Edge mode, you can choose not to send the camera video frame to the host (by specifying 'rgb_laconic' as input). A minimal usage sketch is given below the FPS table. 32 | 33 | |Landmark model (Edge mode)| FPS (FPS with 'xyz' option)| 34 | |-|-| 35 | |Full|20 (18)| 36 | |Lite|26 (22)| 37 | |Heavy|8 (7)| 38 |
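The snippet below is a minimal usage sketch, modeled on `demo.py` (see the [Code](#code) section): the class name is identical in both modes, only the module it is imported from changes.

```
# Minimal sketch, modeled on demo.py (rendering and keyboard handling omitted)
# Edge mode: detection, landmark inference and post-processing run on the device
from BlazeposeDepthaiEdge import BlazeposeDepthai
# Host mode would instead use: from BlazeposeDepthai import BlazeposeDepthai

tracker = BlazeposeDepthai(input_src="rgb",  # or "rgb_laconic" in Edge mode, or a video file path in Host mode
                           lm_model="lite")  # "full" (default), "lite" or "heavy"
while True:
    frame, body = tracker.next_frame()
    if frame is None: break
    if body:
        print(f"Landmark score: {body.lm_score:.2f}")  # see the Body class in mediapipe_utils.py
tracker.exit()
```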
39 | 40 |

![Host mode](img/pipeline_host_mode.png) 41 | ![Edge mode](img/pipeline_edge_mode.png)
42 | 43 | 44 | For depth-capable devices, when measuring the 3D location of a reference point, more nodes are used but are not represented here (2 mono cameras, stereo node, spatial location calculator). 45 | 46 | *Note : the Edge mode schema is missing a custom NeuralNetwork node between the ImageManip node on the right and the landmark NeuralNetwork. This custom NeuralNetwork runs a very simple model that normalizes (divides by 255) the output image from the ImageManip node. This is a temporary fix that should be removed when the depthai ImageManip node supports setFrameType(RGBF16F16F16p).* 47 | 48 | ## Inferred 3D vs Measured 3D 49 | 50 | * **Inferred 3D** : the Landmark model is able to infer 3D (x,y,z) landmarks. Actually, the 0.8.6 model yields 2 outputs with 3D landmarks : Identity (accessed via *body.landmarks* with the API) and Identity_4 (accessed via *body.world_landmarks*). It may sound like redundant information but there is a difference: *world landmarks* are real-world 3D coordinates in meters with the origin at the center between the hips. *world landmarks* share the same landmark topology as *landmarks*. However, *landmarks* provide coordinates (in pixels) of a 3D object projected onto the 2D image surface, while *world landmarks* provide coordinates (in meters) of the 3D object itself. 51 | * **Measured 3D** : for devices able to measure depth (like the OAK-D), we can determine the real 3D location of a point of the image in the camera coordinate system. So one idea would be to measure the 3D location of each inferred 2D body landmark. It turns out this is not a good idea in practice, for at least 2 reasons. First, an inferred keypoint may stand "outside" of its counterpart in the image and therefore in the aligned depth frame. This probably happens more frequently with extremities, and it can be explained by the inaccuracy of the model, which is never 100% perfect. Secondly, we can't get depth for hidden keypoints. An alternative solution, implemented here, is to combine the inferred 3D *world landmarks* with the measured 3D location of one reference point, the center between the hips. Compared to extremities, this reference point can be measured more robustly. 52 | 53 | The image below demonstrates the 3 modes of 3D visualization: 54 | 1) **Image mode** (top-right), based on *body.landmarks*. Note that the size of the drawn skeleton depends on the camera-body distance, but that the mid hips reference point is restricted and can only move inside a plane parallel to the wall grid; 55 | 2) **World mode** (bottom-left), based on *body.world_landmarks*. Note that the mid hips reference point is fixed and the size of the skeleton does not change; 56 | 3) **Mixed mode** (bottom-right), mixing *body.world_landmarks* with the measured 3D location of the reference point. Like in World mode, the size of the skeleton does not change, but the mid hips reference point is not restricted any more (see the sketch below). 57 | 58 |
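For reference, the combination performed in Mixed mode boils down to a few lines. The sketch below mirrors what `BlazeposeRenderer.draw_3d()` does; it is illustrative (not an additional API) and assumes a `body` returned by the tracker with `xyz=True`, using the `landmarks_world` attribute accessed by the renderer.

```
import numpy as np
import mediapipe_utils as mpu

def mixed_mode_points(body):
    # Inferred world landmarks: meters, origin at the center between hips
    points = body.landmarks_world
    # Measured 3D location of the reference point: millimeters -> meters
    translation = body.xyz / 1000
    # The depth sensor y axis points the opposite way to the world landmarks y axis
    translation[1] = -translation[1]
    if body.xyz_ref == "mid_hips":
        return points + translation
    elif body.xyz_ref == "mid_shoulders":
        # The world landmarks origin is the mid hips point; re-center on the shoulders
        mid_shoulders = np.mean([points[mpu.KEYPOINT_DICT['right_shoulder']],
                                 points[mpu.KEYPOINT_DICT['left_shoulder']]], axis=0)
        return points + translation - mid_shoulders
    return None  # no reliable depth measure for this frame
```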

![3D visualizations](img/3d_visualizations.gif)
59 | 60 | 61 | ## Install 62 | 63 | 64 | Install the python packages (depthai, opencv, open3d) with the following command: 65 | 66 | ``` 67 | python3 -m pip install -r requirements.txt 68 | ``` 69 | 70 | ## Run 71 | 72 | **Usage:** 73 | 74 | ``` 75 | -> python3 demo.py -h 76 | usage: demo.py [-h] [-e] [-i INPUT] [--pd_m PD_M] [--lm_m LM_M] [-xyz] [-c] 77 | [--no_smoothing] [-f INTERNAL_FPS] 78 | [--internal_frame_height INTERNAL_FRAME_HEIGHT] [-s] [-t] 79 | [--force_detection] [-3 {None,image,mixed,world}] 80 | [-o OUTPUT] 81 | 82 | optional arguments: 83 | -h, --help show this help message and exit 84 | -e, --edge Use Edge mode (postprocessing runs on the device) 85 | 86 | Tracker arguments: 87 | -i INPUT, --input INPUT 88 | 'rgb' or 'rgb_laconic' or path to video/image file to 89 | use as input (default=rgb) 90 | --pd_m PD_M Path to an .blob file for pose detection model 91 | --lm_m LM_M Landmark model ('full' or 'lite' or 'heavy') or path 92 | to an .blob file 93 | -xyz, --xyz Get (x,y,z) coords of reference body keypoint in 94 | camera coord system (only for compatible devices) 95 | -c, --crop Center crop frames to a square shape before feeding 96 | pose detection model 97 | --no_smoothing Disable smoothing filter 98 | -f INTERNAL_FPS, --internal_fps INTERNAL_FPS 99 | Fps of internal color camera. Too high value lower NN 100 | fps (default= depends on the model) 101 | --internal_frame_height INTERNAL_FRAME_HEIGHT 102 | Internal color camera frame height in pixels 103 | (default=640) 104 | -s, --stats Print some statistics at exit 105 | -t, --trace Print some debug messages 106 | --force_detection Force person detection on every frame (never use 107 | landmarks from previous frame to determine ROI) 108 | 109 | Renderer arguments: 110 | -3 {None,image,mixed,world}, --show_3d {None,image,mixed,world} 111 | Display skeleton in 3d in a separate window. See 112 | README for description. 113 | -o OUTPUT, --output OUTPUT 114 | Path to output video file 115 | ``` 116 | **Examples :** 117 | 118 | - To use default internal color camera as input with the model "full" in Host mode: 119 | 120 | ```python3 demo.py``` 121 | 122 | - To use default internal color camera as input with the model "full" in Edge mode [**preferred**]: 123 | 124 | ```python3 demo.py -e``` 125 | 126 | - To use a file (video or image) as input : 127 | 128 | ```python3 demo.py -i filename``` 129 | 130 | - To use the model "lite" : 131 | 132 | ```python3 demo.py -lm_m lite``` 133 | 134 | - To measure body spatial location in camera coordinate system (only for depth-capable device like OAK-D): 135 | ```python3 demo.py -e -xyz``` 136 | 137 | The measure is made only on one reference point: 138 | - the middle of the hips if both hips are visible; 139 | - the middle of the shoulders if hips are not visible and both shoulders are visible. 140 | 141 | - To show the skeleton in 3D 'world' mode (-xyz flag needed): 142 | 143 | ```python3 demo.py -e -xyz -3 world``` 144 | 145 |
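Relating to the `-xyz` option above, the measured reference point is exposed on the returned `body` object. A minimal sketch (assuming a `tracker` created with `xyz=True` as in `demo.py`):

```
frame, body = tracker.next_frame()
if body and body.xyz_ref:          # "mid_hips" or "mid_shoulders", empty if no valid measure
    x, y, z = body.xyz / 10        # body.xyz is in mm; convert to cm
    print(f"{body.xyz_ref}: X={x:.0f} cm  Y={y:.0f} cm  Z={z:.0f} cm")
```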

![World mode](img/3d_world_visualization.gif)
146 | 147 | Note that the floor and wall grids do not correspond to a real floor and wall. Each grid square size is 1m x 1m. 148 | 149 | - When using the internal camera, to change its FPS to 15 : 150 | 151 | ```python3 demo.py --internal_fps 15``` 152 | 153 | Note: the default internal camera FPS depends on the model, the mode (Edge vs Host) and the use of depth ("-xyz"). These default values are based on my own observations. **Please, don't hesitate to play with this parameter to find the optimal value.** If you observe that your FPS is well below the default value, you should lower the FPS with this option until the set FPS is just above the observed FPS. 154 | 155 | - When using the internal camera, you probably don't need to work with the full resolution. You can set a lower resolution (and gain a bit of FPS) by using this option: 156 | 157 | ```python3 demo.py --internal_frame_height 450``` 158 | 159 | Note: currently, depthai supports only some possible values for this argument. The value you specify will be replaced by the closest possible value (here 432 instead of 450). 160 | 161 | - By default, temporal filters smooth the landmark positions. Use *--no_smoothing* to disable the filter. 162 | 163 | |Keypress in OpenCV window|Function| 164 | |-|-| 165 | |*Esc*|Exit| 166 | |*space*|Pause| 167 | |r|Show/hide the rotated bounding rectangle around the body| 168 | |l|Show/hide landmarks| 169 | |s|Show/hide landmark score| 170 | |f|Show/hide FPS| 171 | |x|Show/hide (x,y,z) coordinates (only on depth-capable devices and if using "-xyz" flag)| 172 | |z|Show/hide the square zone used to measure depth (only on depth-capable devices and if using "-xyz" flag)| 173 | 174 | If using a 3D visualization mode ("-3" or "--show_3d"): 175 | |Keypress in Open3d window|Function| 176 | |-|-| 177 | |o|Oscillate (rotate back and forth) the view| 178 | |r|Rotate the view continuously| 179 | |s|Stop oscillating or rotating| 180 | |*Up*|Increase the rotating or oscillating speed| 181 | |*Down*|Decrease the rotating or oscillating speed| 182 | |*Right* or *Left*|Change the point of view to a predefined position| 183 | |*Mouse*|Freely change the point of view| 184 | 185 | ## Mediapipe models 186 | You can directly find the model files (.blob) under the 'models' directory. Below I describe how to get the files in case you need to regenerate the models. 187 | 188 | 1) Clone this github repository in a local directory (DEST_DIR) 189 | 2) In the DEST_DIR/models directory, download the tflite models from [this archive](https://drive.google.com/file/d/1bEL4zmh2PEFsRfmFOofP0rbGNo-O1p5_/view?usp=sharing). The archive contains: 190 | * the Pose detection model from Mediapipe 0.8.4, 191 | * the Full, Lite and Heavy pose landmark models from Mediapipe 0.8.6. 192 | 193 | *Note: the Pose detection model from Mediapipe 0.8.6 can't currently be converted (more info [here](https://github.com/PINTO0309/tflite2tensorflow/issues/11)).* 194 | 195 | 3) Install the amazing [PINTO's tflite2tensorflow tool](https://github.com/PINTO0309/tflite2tensorflow). Use the docker installation, which includes many packages including a recent version of Openvino.
196 | 3) From DEST_DIR, run the tflite2tensorflow container: ```./docker_tflite2tensorflow.sh``` 197 | 4) From the running container: 198 | ``` 199 | cd workdir/models 200 | ./convert_models.sh 201 | ``` 202 | The *convert_models.sh* converts the tflite models in tensorflow (.pb), then converts the pb file into Openvino IR format (.xml and .bin), and finally converts the IR files in MyriadX format (.blob). 203 | 204 | 5) By default, the number of SHAVES associated with the blob files is 4. In case you want to generate new blobs with different number of shaves, you can use the script *gen_blob_shave.sh*: 205 | ``` 206 | # Example: to generate blobs for 6 shaves 207 | ./gen_blob_shave.sh -m pd -n 6 # will generate pose_detection_sh6.blob 208 | ./gen_blob_shave.sh -m full -n 6 # will generate pose_landmark_full_sh6.blob 209 | ``` 210 | 211 | **Explanation about the Model Optimizer params :** 212 | - The preview of the OAK-* color camera outputs BGR [0, 255] frames . The original tflite pose detection model is expecting RGB [-1, 1] frames. ```--reverse_input_channels``` converts BGR to RGB. ```--mean_values [127.5,127.5,127.5] --scale_values [127.5,127.5,127.5]``` normalizes the frames between [-1, 1]. 213 | - The original landmark model is expecting RGB [0, 1] frames. Therefore, the following arguments are used ```--reverse_input_channels```, but unlike the detection model, we choose to do the normalization in the python code and not in the models (via ```--scale_values```). Indeed, we have observed a better accuracy with FP16 models when doing the normalization of the inputs outside of the models ([a possible explanation](https://github.com/PINTO0309/tflite2tensorflow/issues/9#issuecomment-842460014)). 214 | 215 | ## Custom models 216 | 217 | The `custom_models` directory contains the code to build the following custom models: 218 | - DetectionBestCandidate: this model processes the outputs of the pose detection network (a 1x2254x1 tensor for the scores and a 1x2254x12 for the regressors) and yields the regressor with the highest score. 219 | - DivideBy255: this model transforms an 256x256 RGB888p ([0, 255]) image to a 256x256 RGBF16F16F16p image ([0., 1.]). 220 | 221 | The method used to build these models is well explained on the [rahulrav's blog](https://rahulrav.com/blog/depthai_camera.html). 222 | 223 | 224 | ## Landmarks 225 |

226 | ![Full body landmarks](img/full_body_landmarks.png) 227 | 228 | 229 | 230 | Source 231 |
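To access an individual landmark by name in your own code, you can index `body.landmarks` with `KEYPOINT_DICT` from `mediapipe_utils.py`. The sketch below is illustrative, assumes a `tracker` created as in `demo.py`, and mirrors the presence test of `BlazeposeRenderer.is_present()`:

```
import mediapipe_utils as mpu

frame, body = tracker.next_frame()
if body:
    i = mpu.KEYPOINT_DICT["right_wrist"]
    if body.presence[i] > tracker.presence_threshold:   # same test as the renderer
        x, y = body.landmarks[i, :2]                     # pixel coordinates in the original image
        print(f"right wrist at ({x}, {y})")
```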

232 | 233 | 234 | ## Code 235 | 236 | To facilitate reusability, the code is split into 2 classes: 237 | - **BlazeposeDepthai**, which is responsible for computing the body landmarks. The import of this class depends on the mode: 238 | ``` 239 | # For Host mode: 240 | from BlazeposeDepthai import BlazeposeDepthai 241 | ``` 242 | ``` 243 | # For Edge mode: 244 | from BlazeposeDepthaiEdge import BlazeposeDepthai 245 | ``` 246 | - **BlazeposeRenderer**, which is responsible for rendering the landmarks and the skeleton on the video frame. 247 | 248 | This way, you can replace the renderer from this repository and write your own personalized renderer (for some projects, you may not even need a renderer). 249 | 250 | The file ```demo.py``` is a representative example of how to use these classes: 251 | ``` 252 | from BlazeposeDepthaiEdge import BlazeposeDepthai 253 | from BlazeposeRenderer import BlazeposeRenderer 254 | 255 | # The argparse stuff has been removed to keep only the important code 256 | 257 | tracker = BlazeposeDepthai(input_src=args.input, 258 | pd_model=args.pd_m, 259 | lm_model=args.lm_m, 260 | smoothing=not args.no_smoothing, 261 | xyz=args.xyz, 262 | crop=args.crop, 263 | internal_fps=args.internal_fps, 264 | internal_frame_height=args.internal_frame_height, 265 | force_detection=args.force_detection, 266 | stats=args.stats, 267 | trace=args.trace) 268 | 269 | renderer = BlazeposeRenderer( 270 | tracker, 271 | show_3d=args.show_3d, 272 | output=args.output) 273 | 274 | while True: 275 | # Run blazepose on next frame 276 | frame, body = tracker.next_frame() 277 | if frame is None: break 278 | # Draw 2d skeleton 279 | frame = renderer.draw(frame, body) 280 | key = renderer.waitKey(delay=1) 281 | if key == 27 or key == ord('q'): 282 | break 283 | renderer.exit() 284 | tracker.exit() 285 | ``` 286 | 287 | For more information on: 288 | - the arguments of the tracker, please refer to the docstring of class `BlazeposeDepthai` or `BlazeposeDepthaiEdge` in `BlazeposeDepthai.py` or `BlazeposeDepthaiEdge.py`; 289 | - the attributes of the 'body' element you can exploit in your program, please refer to the docstring of class `Body` in `mediapipe_utils.py`. 290 | 291 | ## Examples 292 | 293 | ||| 294 | |-|-| 295 | |[Semaphore alphabet](examples/semaphore_alphabet) |Semaphore alphabet| 296 | 297 | ## Credits 298 | * [Google Mediapipe](https://github.com/google/mediapipe) 299 | * Katsuya Hyodo a.k.a [Pinto](https://github.com/PINTO0309), the Wizard of Model Conversion !
300 | * [Tai Chi Step by Step For Beginners Training Session 4](https://www.youtube.com/watch?v=oawZ_7wNWrU&ab_channel=MasterSongKungFu) 301 | * [Semaphore with The RCR Museum](https://www.youtube.com/watch?v=DezaTjQYPh0&ab_channel=TheRoyalCanadianRegimentMuseum) 302 | -------------------------------------------------------------------------------- /custom_models/DetectionBestCandidate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import sys, os 4 | sys.path.insert(1, os.path.realpath(os.path.pardir)) 5 | from mediapipe_utils import generate_blazepose_anchors 6 | import numpy as np 7 | from math import pi 8 | 9 | anchors = generate_blazepose_anchors() 10 | print(f"Nb anchors: {len(anchors)}") 11 | 12 | detection_input_length = 224 13 | 14 | class DetectionBestCandidate(nn.Module): 15 | def __init__(self): 16 | super(DetectionBestCandidate, self).__init__() 17 | # anchors shape is nb_anchorsx4 [x_center, y_center, width, height] 18 | # Here: width and height is always 1, so we keep just [x_center, y_center] 19 | self.anchors = torch.from_numpy(anchors[:,:2]) 20 | self.plus_anchor_center = np.array([[1,0,0,0,1,0,1,0,1,0,1,0], [0,1,0,0,0,1,0,1,0,1,0,1]], dtype=np.float) 21 | self.plus_anchor_center = torch.from_numpy(self.plus_anchor_center) 22 | 23 | def forward(self, x, y): 24 | # x.shape: 1xnb_anchorsx1 25 | # y.shape: 1xnb_anchorsx12 26 | 27 | # decode_bboxes 28 | best_id = torch.argmax(x) 29 | score = x[0][best_id] 30 | score = torch.sigmoid(score) 31 | bbox = y[0,best_id] 32 | bbox = bbox/detection_input_length + torch.mm(self.anchors[best_id].unsqueeze(0), self.plus_anchor_center)[0] 33 | 34 | # bbox : the 4 first elements describe the square bounding box around the head (not used for determining rotated rectangle around the body) 35 | # The 8 following corresponds to 4 (x,y) normalized keypoints coordinates (useful for determining rotated rectangle) 36 | # For full body, keypoints 1 and 2 are used. 37 | # For upper body, keypoints 3 and 4 are used. 38 | # Here we are only interested in full body 39 | 40 | # detections_to_rect (1st part) 41 | sqn_rr_center_xy = bbox[4:6] 42 | sqn_scale_xy = bbox[6:8] 43 | 44 | return torch.cat((score, sqn_rr_center_xy, sqn_scale_xy)) 45 | 46 | def test(): 47 | 48 | model = DetectionBestCandidate() 49 | X = torch.randn(1, len(anchors), 1, dtype=torch.float) 50 | Y = torch.randn(1, len(anchors), 12, dtype=torch.float) 51 | result = model(X, Y) 52 | print(result) 53 | 54 | def export_onnx(): 55 | """ 56 | Exports the model to an ONNX file. 
57 | """ 58 | model = DetectionBestCandidate() 59 | X = torch.randn(1, len(anchors), 1, dtype=torch.float) 60 | Y = torch.randn(1, len(anchors), 12, dtype=torch.float) 61 | onnx_name = "DetectionBestCandidate.onnx" 62 | 63 | print(f"Generating {onnx_name}") 64 | torch.onnx.export( 65 | model, 66 | (X, Y), 67 | onnx_name, 68 | opset_version=10, 69 | do_constant_folding=True, 70 | # verbose=True, 71 | input_names=['Identity_1', 'Identity'], 72 | output_names=['result'] 73 | ) 74 | 75 | if __name__ == "__main__": 76 | 77 | test() 78 | export_onnx() -------------------------------------------------------------------------------- /custom_models/DetectionBestCandidate_sh1.blob: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/custom_models/DetectionBestCandidate_sh1.blob -------------------------------------------------------------------------------- /custom_models/DivideBy255.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import sys, os 4 | sys.path.insert(1, os.path.realpath(os.path.pardir)) 5 | import numpy as np 6 | from math import pi 7 | 8 | 9 | 10 | detection_input_length = 224 11 | 12 | class DivideBy255(nn.Module): 13 | def __init__(self): 14 | super(DivideBy255, self).__init__() 15 | 16 | def forward(self, x): 17 | return x / 255.0 18 | 19 | def test(): 20 | 21 | model = DivideBy255() 22 | 23 | X = torch.ones(1, 3, 256, 256, dtype=torch.float) 24 | result = model(X) 25 | print(result) 26 | 27 | def export_onnx(): 28 | """ 29 | Exports the model to an ONNX file. 30 | """ 31 | model = DivideBy255() 32 | X = torch.randn(1, 3, 256, 256, dtype=torch.float) 33 | onnx_name = "DivideBy255.onnx" 34 | 35 | print(f"Generating {onnx_name}") 36 | torch.onnx.export( 37 | model, 38 | (X), 39 | onnx_name, 40 | opset_version=10, 41 | do_constant_folding=True, 42 | # verbose=True, 43 | # input_names=['Identity_1', 'Identity'], 44 | output_names=['input_1'] 45 | ) 46 | 47 | if __name__ == "__main__": 48 | 49 | test() 50 | export_onnx() -------------------------------------------------------------------------------- /custom_models/DivideBy255_sh1.blob: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/custom_models/DivideBy255_sh1.blob -------------------------------------------------------------------------------- /custom_models/README.md: -------------------------------------------------------------------------------- 1 | ## Generation of DetectionBestCandidate.blob 2 | 1) pytorch is needed 3 | 2) Generate the ONNX model *DetectionBestCandidate.onnx* : 4 | ``` 5 | python DetectionBestCandidate.py 6 | ``` 7 | 8 | 3) Start the tflite2tensorflow docker container: 9 | ``` 10 | ../docker_tflite2tensorflow.sh 11 | ``` 12 | 4) From the container shell: 13 | ``` 14 | cd workdir 15 | ./convert_model.sh -m DetectionBestCandidate # Use `-n #` if you want to change the number of shaves 16 | ``` 17 | 18 | ## Generation of DivideBy255.blob 19 | Same as above with 'DivideBy255' instead of 'DetectionBestCandidate'. 
20 | 21 | -------------------------------------------------------------------------------- /custom_models/convert_model.sh: -------------------------------------------------------------------------------- 1 | # This script has to be run from the docker container started by ./docker_tflite2tensorflow.sh 2 | 3 | usage () 4 | { 5 | echo "Generate a blob from an ONNX model with a specified number of shaves and cmx (nb cmx = nb shaves)" 6 | echo 7 | echo "Usage: ${0} [-m model] [-n nb_shaves]" 8 | echo 9 | echo "model: DetectionBestCandidate or DivideBy255 or other" 10 | echo "nb_shaves must be between 1 and 13 (default=1)" 11 | } 12 | 13 | while getopts ":hm:n:" opt; do 14 | case ${opt} in 15 | h ) 16 | usage 17 | exit 0 18 | ;; 19 | m ) 20 | model=$OPTARG 21 | ;; 22 | n ) 23 | nb_shaves=$OPTARG 24 | ;; 25 | : ) 26 | echo "Error: -$OPTARG requires an argument." 27 | usage 28 | exit 1 29 | ;; 30 | \? ) 31 | echo "Invalid option: -$OPTARG" 32 | usage 33 | exit 1 34 | ;; 35 | esac 36 | done 37 | 38 | if [ -z "$model" ] 39 | then 40 | usage 41 | exit 1 42 | fi 43 | if [ ! -f $model_onnx ] 44 | then 45 | echo "The model ${model_onnx} does not exist" 46 | echo "Have you run 'python DetectionBestCandidate.py' ?" 47 | exit 1 48 | fi 49 | 50 | if [ -z "$nb_shaves" ] 51 | then 52 | nb_shaves=1 53 | fi 54 | if [ $nb_shaves -lt 1 -o $nb_shaves -gt 13 ] 55 | then 56 | echo "Invalid number of shaves !" 57 | usage 58 | exit 1 59 | fi 60 | 61 | if [ $model = DivideBy255 ] 62 | then 63 | precision=U8 64 | else 65 | precision=FP16 66 | fi 67 | 68 | 69 | model_onnx="${model}.onnx" 70 | model_xml="${model}.xml" 71 | model_blob="${model}_sh${nb_shaves}.blob" 72 | 73 | echo Model: $model_xml $model_blob 74 | echo Shaves $nb_shaves 75 | 76 | ARG=$1 77 | source /opt/intel/openvino_2021/bin/setupvars.sh 78 | 79 | # Use FP16 and make the batch_size explicit. 
80 | python /opt/intel/openvino_2021/deployment_tools/model_optimizer/mo_onnx.py \ 81 | --input_model $model_onnx --data_type half 82 | /opt/intel/openvino_2021/deployment_tools/tools/compile_tool/compile_tool -d MYRIAD \ 83 | -m $model_xml \ 84 | -ip $precision \ 85 | -VPU_NUMBER_OF_SHAVES $nb_shaves \ 86 | -VPU_NUMBER_OF_CMX_SLICES $nb_shaves \ 87 | -o $model_blob 88 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from BlazeposeRenderer import BlazeposeRenderer 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('-e', '--edge', action="store_true", 8 | help="Use Edge mode (postprocessing runs on the device)") 9 | parser_tracker = parser.add_argument_group("Tracker arguments") 10 | parser_tracker.add_argument('-i', '--input', type=str, default="rgb", 11 | help="'rgb' or 'rgb_laconic' or path to video/image file to use as input (default=%(default)s)") 12 | parser_tracker.add_argument("--pd_m", type=str, 13 | help="Path to an .blob file for pose detection model") 14 | parser_tracker.add_argument("--lm_m", type=str, 15 | help="Landmark model ('full' or 'lite' or 'heavy') or path to an .blob file") 16 | parser_tracker.add_argument('-xyz', '--xyz', action="store_true", 17 | help="Get (x,y,z) coords of reference body keypoint in camera coord system (only for compatible devices)") 18 | parser_tracker.add_argument('-c', '--crop', action="store_true", 19 | help="Center crop frames to a square shape before feeding pose detection model") 20 | parser_tracker.add_argument('--no_smoothing', action="store_true", 21 | help="Disable smoothing filter") 22 | parser_tracker.add_argument('-f', '--internal_fps', type=int, 23 | help="Fps of internal color camera. Too high value lower NN fps (default= depends on the model)") 24 | parser_tracker.add_argument('--internal_frame_height', type=int, default=640, 25 | help="Internal color camera frame height in pixels (default=%(default)i)") 26 | parser_tracker.add_argument('-s', '--stats', action="store_true", 27 | help="Print some statistics at exit") 28 | parser_tracker.add_argument('-t', '--trace', action="store_true", 29 | help="Print some debug messages") 30 | parser_tracker.add_argument('--force_detection', action="store_true", 31 | help="Force person detection on every frame (never use landmarks from previous frame to determine ROI)") 32 | 33 | parser_renderer = parser.add_argument_group("Renderer arguments") 34 | parser_renderer.add_argument('-3', '--show_3d', choices=[None, "image", "world", "mixed"], default=None, 35 | help="Display skeleton in 3d in a separate window. 
See README for description.") 36 | parser_renderer.add_argument("-o","--output", 37 | help="Path to output video file") 38 | 39 | 40 | args = parser.parse_args() 41 | 42 | if args.edge: 43 | from BlazeposeDepthaiEdge import BlazeposeDepthai 44 | else: 45 | from BlazeposeDepthai import BlazeposeDepthai 46 | tracker = BlazeposeDepthai(input_src=args.input, 47 | pd_model=args.pd_m, 48 | lm_model=args.lm_m, 49 | smoothing=not args.no_smoothing, 50 | xyz=args.xyz, 51 | crop=args.crop, 52 | internal_fps=args.internal_fps, 53 | internal_frame_height=args.internal_frame_height, 54 | force_detection=args.force_detection, 55 | stats=True, 56 | trace=args.trace) 57 | 58 | renderer = BlazeposeRenderer( 59 | tracker, 60 | show_3d=args.show_3d, 61 | output=args.output) 62 | 63 | while True: 64 | # Run blazepose on next frame 65 | frame, body = tracker.next_frame() 66 | if frame is None: break 67 | # Draw 2d skeleton 68 | frame = renderer.draw(frame, body) 69 | key = renderer.waitKey(delay=1) 70 | if key == 27 or key == ord('q'): 71 | break 72 | renderer.exit() 73 | tracker.exit() -------------------------------------------------------------------------------- /docker_tflite2tensorflow.sh: -------------------------------------------------------------------------------- 1 | # Used to convert original Mediapipe tflite models to Openvino IR and MyriadX blob 2 | # https://github.com/PINTO0309/tflite2tensorflow 3 | # 4 | # First run: docker pull pinto0309/tflite2tensorflow 5 | docker run -it --rm \ 6 | -v `pwd`:/home/user/workdir \ 7 | -v /tmp/.X11-unix/:/tmp/.X11-unix:rw \ 8 | --device /dev/video0:/dev/video0:mwr \ 9 | --net=host \ 10 | -e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \ 11 | -e DISPLAY=$DISPLAY \ 12 | --privileged \ 13 | pinto0309/tflite2tensorflow:latest bash 14 | 15 | -------------------------------------------------------------------------------- /examples/semaphore_alphabet/README.md: -------------------------------------------------------------------------------- 1 | # Semaphore alphabet with Blazepose on DepthAI 2 | 3 | This demo demonstrates the recognition of semaphore alphabet. 4 | 5 | For each arm (segment shoulder-elbow), its angle with the vertical axe is calculated. The pair of left and right arm angles gives the letter. 
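To make this concrete, here is a small worked example of the quantization used in the demo code below: each arm angle (in degrees, relative to the vertical axis) is mapped to one of 8 sectors of 45°, and the pair (right sector, left sector) is looked up in the `semaphore_flag` table.

```
# Worked example of the angle -> sector quantization used by recognize_gesture():
#   sector = int((angle + 202.5) / 45) % 8
# e.g. an arm hanging straight down (0 deg) falls in sector 4,
# an arm raised straight up (180 deg or -180 deg) falls in sector 0.
for angle in (0, 45, 90, 135, 180, -45, -90, -135):
    sector = int((angle + 202.5) / 45) % 8
    print(f"{angle:>5} deg -> sector {sector}")
# For instance, (right, left) = (3, 4) maps to the letter 'A' in semaphore_flag.
```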
6 | 7 | 8 | ![Semaphore alphabet](medias/semaphore.gif) 9 | 10 | ## Usage 11 | 12 | ``` 13 | -> python3 demo.py -h 14 | usage: demo.py [-h] [-m {full,lite,831}] [-i INPUT] [-o OUTPUT] 15 | 16 | optional arguments: 17 | -h, --help show this help message and exit 18 | -m {full,lite,831}, --model {full,lite,831} 19 | Landmark model to use (default=full 20 | -i INPUT, --input INPUT 21 | 'rgb' or 'rgb_laconic' or path to video/image file to 22 | use as input (default: rgb) 23 | -o OUTPUT, --output OUTPUT 24 | Path to output video file 25 | ``` 26 | 27 | ## Credits 28 | 29 | * [Semaphore with The RCR Museum](https://www.youtube.com/watch?v=DezaTjQYPh0&ab_channel=TheRoyalCanadianRegimentMuseum) -------------------------------------------------------------------------------- /examples/semaphore_alphabet/demo.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from math import atan2, degrees 3 | import sys 4 | sys.path.append("../..") 5 | from BlazeposeDepthai import BlazeposeDepthai 6 | from BlazeposeRenderer import BlazeposeRenderer 7 | from mediapipe_utils import KEYPOINT_DICT 8 | import argparse 9 | 10 | 11 | # For gesture demo 12 | semaphore_flag = { 13 | (3,4):'A', (2,4):'B', (1,4):'C', (0,4):'D', 14 | (4,7):'E', (4,6):'F', (4,5):'G', (2,3):'H', 15 | (0,3):'I', (0,6):'J', (3,0):'K', (3,7):'L', 16 | (3,6):'M', (3,5):'N', (2,1):'O', (2,0):'P', 17 | (2,7):'Q', (2,6):'R', (2,5):'S', (1,0):'T', 18 | (1,7):'U', (0,5):'V', (7,6):'W', (7,5):'X', 19 | (1,6):'Y', (5,6):'Z', 20 | } 21 | 22 | def recognize_gesture(b): 23 | # b: body 24 | 25 | def angle_with_y(v): 26 | # v: 2d vector (x,y) 27 | # Returns angle in degree of v with y-axis of image plane 28 | if v[1] == 0: 29 | return 90 30 | angle = atan2(v[0], v[1]) 31 | return degrees(angle) 32 | 33 | # For the demo, we want to recognize the flag semaphore alphabet 34 | # For this task, we just need to measure the angles of both arms with vertical 35 | right_arm_angle = angle_with_y(b.landmarks[KEYPOINT_DICT['right_elbow'],:2] - b.landmarks[KEYPOINT_DICT['right_shoulder'],:2]) 36 | left_arm_angle = angle_with_y(b.landmarks[KEYPOINT_DICT['left_elbow'],:2] - b.landmarks[KEYPOINT_DICT['left_shoulder'],:2]) 37 | right_pose = int((right_arm_angle +202.5) / 45) % 8 38 | left_pose = int((left_arm_angle +202.5) / 45) % 8 39 | letter = semaphore_flag.get((right_pose, left_pose), None) 40 | return letter 41 | 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-m", "--model", type=str, choices=['full', 'lite', '831'], default='full', 44 | help="Landmark model to use (default=%(default)s") 45 | parser.add_argument('-i', '--input', type=str, default='rgb', 46 | help="'rgb' or 'rgb_laconic' or path to video/image file to use as input (default: %(default)s)") 47 | parser.add_argument("-o","--output", 48 | help="Path to output video file") 49 | args = parser.parse_args() 50 | 51 | pose = BlazeposeDepthai(input_src=args.input, lm_model=args.model) 52 | renderer = BlazeposeRenderer(pose, output=args.output) 53 | 54 | while True: 55 | # Run blazepose on next frame 56 | frame, body = pose.next_frame() 57 | if frame is None: break 58 | # Draw 2d skeleton 59 | frame = renderer.draw(frame, body) 60 | # Gesture recognition 61 | if body: 62 | letter = recognize_gesture(body) 63 | if letter: 64 | cv2.putText(frame, letter, (frame.shape[1] // 2, 100), cv2.FONT_HERSHEY_PLAIN, 5, (0,190,255), 3) 65 | key = renderer.waitKey(delay=1) 66 | if key == 27 or key == ord('q'): 67 | break 68 | renderer.exit() 69 | pose.exit() 
70 | 71 | -------------------------------------------------------------------------------- /examples/semaphore_alphabet/medias/semaphore.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/examples/semaphore_alphabet/medias/semaphore.gif -------------------------------------------------------------------------------- /examples/semaphore_alphabet/medias/semaphore.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/examples/semaphore_alphabet/medias/semaphore.mp4 -------------------------------------------------------------------------------- /examples/semaphore_alphabet/medias/video.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/examples/semaphore_alphabet/medias/video.mp4 -------------------------------------------------------------------------------- /img/3d_visualizations.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/3d_visualizations.gif -------------------------------------------------------------------------------- /img/3d_world_visualization.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/3d_world_visualization.gif -------------------------------------------------------------------------------- /img/full_body_landmarks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/full_body_landmarks.png -------------------------------------------------------------------------------- /img/pipeline_edge_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/pipeline_edge_mode.png -------------------------------------------------------------------------------- /img/pipeline_host_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/pipeline_host_mode.png -------------------------------------------------------------------------------- /img/taichi.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/taichi.gif -------------------------------------------------------------------------------- /mediapipe_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import namedtuple 4 | from math import ceil, sqrt, pi, floor, sin, cos, atan2, gcd 5 | from collections import namedtuple 6 | 7 | # To not display: RuntimeWarning: overflow encountered in exp 8 | # in line: scores = 1 / (1 + np.exp(-scores)) 9 | np.seterr(over='ignore') 10 | 11 | # Dictionary that maps from joint names to keypoint 
indices. 12 | KEYPOINT_DICT = { 13 | "nose": 0, 14 | "left_eye_inner": 1, 15 | "left_eye": 2, 16 | "left_eye_outer": 3, 17 | "right_eye_inner": 4, 18 | "right_eye": 5, 19 | "right_eye_outer": 6, 20 | "left_ear": 7, 21 | "right_ear": 8, 22 | "mouth_left": 9, 23 | "mouth_right": 10, 24 | "left_shoulder": 11, 25 | "right_shoulder": 12, 26 | "left_elbow": 13, 27 | "right_elbow": 14, 28 | "left_wrist": 15, 29 | "right_wrist": 16, 30 | "left_pinky": 17, 31 | "right_pinky": 18, 32 | "left_index": 19, 33 | "right_index": 20, 34 | "left_thumb": 21, 35 | "right_thumb": 22, 36 | "left_hip": 23, 37 | "right_hip": 24, 38 | "left_knee": 25, 39 | "right_knee": 26, 40 | "left_ankle": 27, 41 | "right_ankle": 28, 42 | "left_heel": 29, 43 | "right_heel": 30, 44 | "left_foot_index": 31, 45 | "right_foot_index": 32 46 | } 47 | 48 | class Body: 49 | def __init__(self, pd_score=None, pd_box=None, pd_kps=None): 50 | """ 51 | Attributes: 52 | pd_score : detection score 53 | pd_box : detection box [x, y, w, h], normalized [0,1] in the squared image 54 | pd_kps : detection keypoints coordinates [x, y], normalized [0,1] in the squared image 55 | rect_x_center, rect_y_center : center coordinates of the rotated bounding rectangle, normalized [0,1] in the squared image 56 | rect_w, rect_h : width and height of the rotated bounding rectangle, normalized in the squared image (may be > 1) 57 | rotation : rotation angle of rotated bounding rectangle with y-axis in radian 58 | rect_x_center_a, rect_y_center_a : center coordinates of the rotated bounding rectangle, in pixels in the squared image 59 | rect_w, rect_h : width and height of the rotated bounding rectangle, in pixels in the squared image 60 | rect_points : list of the 4 points coordinates of the rotated bounding rectangle, in pixels 61 | expressed in the squared image during processing, 62 | expressed in the original rectangular image when returned to the user 63 | lm_score: global landmark score 64 | norm_landmarks : 3D landmarks coordinates in the rotated bounding rectangle, normalized [0,1] 65 | landmarks : 3D landmarks coordinates in the rotated bounding rectangle, in pixel in the original rectangular image 66 | world_landmarks : 3D landmarks coordinates in meter with mid hips point being the origin. 67 | The y value of landmarks_world coordinates is negative for landmarks 68 | above the mid hips (like shoulders) and negative for landmarks below (like feet) 69 | xyz: (optionally) 3D location in camera coordinate system of reference point (mid hips or mid shoulders) 70 | xyz_ref: (optionally) name of the reference point ("mid_hips" or "mid_shoulders"), 71 | xyz_zone: (optionally) 4 int array of zone (in the source image) on which is measured depth. 
72 | xyz_zone[0:2] is top-left zone corner in pixels, xyz_zone[2:4] is bottom-right zone corner 73 | """ 74 | self.pd_score = pd_score 75 | self.pd_box = pd_box 76 | self.pd_kps = pd_kps 77 | def print(self): 78 | attrs = vars(self) 79 | print('\n'.join("%s: %s" % item for item in attrs.items())) 80 | 81 | 82 | SSDAnchorOptions = namedtuple('SSDAnchorOptions',[ 83 | 'num_layers', 84 | 'min_scale', 85 | 'max_scale', 86 | 'input_size_height', 87 | 'input_size_width', 88 | 'anchor_offset_x', 89 | 'anchor_offset_y', 90 | 'strides', 91 | 'aspect_ratios', 92 | 'reduce_boxes_in_lowest_layer', 93 | 'interpolated_scale_aspect_ratio', 94 | 'fixed_anchor_size']) 95 | 96 | def calculate_scale(min_scale, max_scale, stride_index, num_strides): 97 | if num_strides == 1: 98 | return (min_scale + max_scale) / 2 99 | else: 100 | return min_scale + (max_scale - min_scale) * stride_index / (num_strides - 1) 101 | 102 | def generate_anchors(options): 103 | """ 104 | option : SSDAnchorOptions 105 | # https://github.com/google/mediapipe/blob/master/mediapipe/calculators/tflite/ssd_anchors_calculator.cc 106 | """ 107 | anchors = [] 108 | layer_id = 0 109 | n_strides = len(options.strides) 110 | while layer_id < n_strides: 111 | anchor_height = [] 112 | anchor_width = [] 113 | aspect_ratios = [] 114 | scales = [] 115 | # For same strides, we merge the anchors in the same order. 116 | last_same_stride_layer = layer_id 117 | while last_same_stride_layer < n_strides and \ 118 | options.strides[last_same_stride_layer] == options.strides[layer_id]: 119 | scale = calculate_scale(options.min_scale, options.max_scale, last_same_stride_layer, n_strides) 120 | if last_same_stride_layer == 0 and options.reduce_boxes_in_lowest_layer: 121 | # For first layer, it can be specified to use predefined anchors. 
122 | aspect_ratios += [1.0, 2.0, 0.5] 123 | scales += [0.1, scale, scale] 124 | else: 125 | aspect_ratios += options.aspect_ratios 126 | scales += [scale] * len(options.aspect_ratios) 127 | if options.interpolated_scale_aspect_ratio > 0: 128 | if last_same_stride_layer == n_strides -1: 129 | scale_next = 1.0 130 | else: 131 | scale_next = calculate_scale(options.min_scale, options.max_scale, last_same_stride_layer+1, n_strides) 132 | scales.append(sqrt(scale * scale_next)) 133 | aspect_ratios.append(options.interpolated_scale_aspect_ratio) 134 | last_same_stride_layer += 1 135 | 136 | for i,r in enumerate(aspect_ratios): 137 | ratio_sqrts = sqrt(r) 138 | anchor_height.append(scales[i] / ratio_sqrts) 139 | anchor_width.append(scales[i] * ratio_sqrts) 140 | 141 | stride = options.strides[layer_id] 142 | feature_map_height = ceil(options.input_size_height / stride) 143 | feature_map_width = ceil(options.input_size_width / stride) 144 | 145 | for y in range(feature_map_height): 146 | for x in range(feature_map_width): 147 | for anchor_id in range(len(anchor_height)): 148 | x_center = (x + options.anchor_offset_x) / feature_map_width 149 | y_center = (y + options.anchor_offset_y) / feature_map_height 150 | # new_anchor = Anchor(x_center=x_center, y_center=y_center) 151 | if options.fixed_anchor_size: 152 | new_anchor = [x_center, y_center, 1.0, 1.0] 153 | # new_anchor.w = 1.0 154 | # new_anchor.h = 1.0 155 | else: 156 | new_anchor = [x_center, y_center, anchor_width[anchor_id], anchor_height[anchor_id]] 157 | # new_anchor.w = anchor_width[anchor_id] 158 | # new_anchor.h = anchor_height[anchor_id] 159 | anchors.append(new_anchor) 160 | 161 | layer_id = last_same_stride_layer 162 | return np.array(anchors) 163 | 164 | def generate_blazepose_anchors(): 165 | # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt 166 | anchor_options = SSDAnchorOptions( 167 | num_layers=5, 168 | min_scale=0.1484375, 169 | max_scale=0.75, 170 | input_size_height=224, 171 | input_size_width=224, 172 | anchor_offset_x=0.5, 173 | anchor_offset_y=0.5, 174 | strides=[8, 16, 32, 32, 32], 175 | aspect_ratios= [1.0], 176 | reduce_boxes_in_lowest_layer=False, 177 | interpolated_scale_aspect_ratio=1.0, 178 | fixed_anchor_size=True) 179 | return generate_anchors(anchor_options) 180 | 181 | def decode_bboxes(score_thresh, scores, bboxes, anchors, best_only=False): 182 | """ 183 | wi, hi : NN input shape 184 | https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt 185 | # Decodes the detection tensors generated by the TensorFlow Lite model, based on 186 | # the SSD anchors and the specification in the options, into a vector of 187 | # detections. Each detection describes a detected object. 
188 | Version 0.8.3.1: 189 | node { 190 | calculator: "TensorsToDetectionsCalculator" 191 | input_stream: "TENSORS:detection_tensors" 192 | input_side_packet: "ANCHORS:anchors" 193 | output_stream: "DETECTIONS:unfiltered_detections" 194 | options: { 195 | [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { 196 | num_classes: 1 197 | num_boxes: 896 198 | num_coords: 12 199 | box_coord_offset: 0 200 | keypoint_coord_offset: 4 201 | num_keypoints: 4 202 | num_values_per_keypoint: 2 203 | sigmoid_score: true 204 | score_clipping_thresh: 100.0 205 | reverse_output_order: true 206 | x_scale: 128.0 207 | y_scale: 128.0 208 | h_scale: 128.0 209 | w_scale: 128.0 210 | min_score_thresh: 0.5 211 | } 212 | } 213 | 214 | Version 0.8.4: 215 | [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { 216 | num_classes: 1 217 | num_boxes: 2254 218 | num_coords: 12 219 | box_coord_offset: 0 220 | keypoint_coord_offset: 4 221 | num_keypoints: 4 222 | num_values_per_keypoint: 2 223 | sigmoid_score: true 224 | score_clipping_thresh: 100.0 225 | reverse_output_order: true 226 | x_scale: 224.0 227 | y_scale: 224.0 228 | h_scale: 224.0 229 | w_scale: 224.0 230 | min_score_thresh: 0.5 231 | } 232 | 233 | # Bounding box in each pose detection is currently set to the bounding box of 234 | # the detected face. However, 4 additional key points are available in each 235 | # detection, which are used to further calculate a (rotated) bounding box that 236 | # encloses the body region of interest. Among the 4 key points, the first two 237 | # are for identifying the full-body region, and the second two for upper body 238 | # only: 239 | # 240 | # Key point 0 - mid hip center 241 | # Key point 1 - point that encodes size & rotation (for full body) 242 | # Key point 2 - mid shoulder center 243 | # Key point 3 - point that encodes size & rotation (for upper body) 244 | # 245 | 246 | scores: shape = [number of anchors 896] 247 | bboxes: shape = [ number of anchors x 12], 12 = 4 (bounding box : (cx,cy,w,h) + 8 (4 palm keypoints) 248 | """ 249 | bodies = [] 250 | scores = 1 / (1 + np.exp(-scores)) 251 | if best_only: 252 | best_id = np.argmax(scores) 253 | if scores[best_id] < score_thresh: return bodies 254 | det_scores = scores[best_id:best_id+1] 255 | det_bboxes = bboxes[best_id:best_id+1] 256 | det_anchors = anchors[best_id:best_id+1] 257 | else: 258 | detection_mask = scores > score_thresh 259 | det_scores = scores[detection_mask] 260 | if det_scores.size == 0: return bodies 261 | det_bboxes = bboxes[detection_mask] 262 | det_anchors = anchors[detection_mask] 263 | 264 | scale = 224 # x_scale, y_scale, w_scale, h_scale 265 | 266 | # cx, cy, w, h = bboxes[i,:4] 267 | # cx = cx * anchor.w / wi + anchor.x_center 268 | # cy = cy * anchor.h / hi + anchor.y_center 269 | # lx = lx * anchor.w / wi + anchor.x_center 270 | # ly = ly * anchor.h / hi + anchor.y_center 271 | det_bboxes = det_bboxes* np.tile(det_anchors[:,2:4], 6) / scale + np.tile(det_anchors[:,0:2],6) 272 | # w = w * anchor.w / wi (in the prvious line, we add anchor.x_center and anchor.y_center to w and h, we need to substract them now) 273 | # h = h * anchor.h / hi 274 | det_bboxes[:,2:4] = det_bboxes[:,2:4] - det_anchors[:,0:2] 275 | # box = [cx - w*0.5, cy - h*0.5, w, h] 276 | det_bboxes[:,0:2] = det_bboxes[:,0:2] - det_bboxes[:,3:4] * 0.5 277 | 278 | for i in range(det_bboxes.shape[0]): 279 | score = det_scores[i] 280 | box = det_bboxes[i,0:4] 281 | kps = [] 282 | for kp in range(4): 283 | kps.append(det_bboxes[i,4+kp*2:6+kp*2]) 284 | 
bodies.append(Body(float(score), box, kps)) 285 | return bodies 286 | 287 | 288 | def non_max_suppression(bodies, nms_thresh): 289 | 290 | # cv2.dnn.NMSBoxes(boxes, scores, 0, nms_thresh) needs: 291 | # boxes = [ [x, y, w, h], ...] with x, y, w, h of type int 292 | # Currently, x, y, w, h are float between 0 and 1, so we arbitrarily multiply by 1000 and cast to int 293 | # boxes = [r.box for r in bodies] 294 | boxes = [ [int(x*1000) for x in r.pd_box] for r in bodies] 295 | scores = [r.pd_score for r in bodies] 296 | indices = cv2.dnn.NMSBoxes(boxes, scores, 0, nms_thresh) 297 | return [bodies[i[0]] for i in indices] 298 | 299 | def normalize_radians(angle): 300 | return angle - 2 * pi * floor((angle + pi) / (2 * pi)) 301 | 302 | def rot_vec(vec, rotation): 303 | vx, vy = vec 304 | return [vx * cos(rotation) - vy * sin(rotation), vx * sin(rotation) + vy * cos(rotation)] 305 | 306 | def detections_to_rect(body, kp_pair=[0,1]): 307 | # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_landmark/pose_detection_to_roi.pbtxt 308 | # # Converts pose detection into a rectangle based on center and scale alignment 309 | # # points. Pose detection contains four key points: first two for full-body pose 310 | # # and two more for upper-body pose. 311 | # node { 312 | # calculator: "SwitchContainer" 313 | # input_side_packet: "ENABLE:upper_body_only" 314 | # input_stream: "DETECTION:detection" 315 | # input_stream: "IMAGE_SIZE:image_size" 316 | # output_stream: "NORM_RECT:raw_roi" 317 | # options { 318 | # [mediapipe.SwitchContainerOptions.ext] { 319 | # contained_node: { 320 | # calculator: "AlignmentPointsRectsCalculator" 321 | # options: { 322 | # [mediapipe.DetectionsToRectsCalculatorOptions.ext] { 323 | # rotation_vector_start_keypoint_index: 0 324 | # rotation_vector_end_keypoint_index: 1 325 | # rotation_vector_target_angle_degrees: 90 326 | # } 327 | # } 328 | # } 329 | # contained_node: { 330 | # calculator: "AlignmentPointsRectsCalculator" 331 | # options: { 332 | # [mediapipe.DetectionsToRectsCalculatorOptions.ext] { 333 | # rotation_vector_start_keypoint_index: 2 334 | # rotation_vector_end_keypoint_index: 3 335 | # rotation_vector_target_angle_degrees: 90 336 | # } 337 | # } 338 | # } 339 | # } 340 | # } 341 | # } 342 | 343 | target_angle = pi * 0.5 # 90 = pi/2 344 | 345 | # AlignmentPointsRectsCalculator : https://github.com/google/mediapipe/blob/master/mediapipe/calculators/util/alignment_points_to_rects_calculator.cc 346 | x_center, y_center = body.pd_kps[kp_pair[0]] 347 | x_scale, y_scale = body.pd_kps[kp_pair[1]] 348 | # Bounding box size as double distance from center to scale point. 
349 | box_size = sqrt((x_scale-x_center)**2 + (y_scale-y_center)**2) * 2 350 | body.rect_w = box_size 351 | body.rect_h = box_size 352 | body.rect_x_center = x_center 353 | body.rect_y_center = y_center 354 | 355 | rotation = target_angle - atan2(-(y_scale - y_center), x_scale - x_center) 356 | body.rotation = normalize_radians(rotation) 357 | 358 | def rotated_rect_to_points(cx, cy, w, h, rotation): 359 | b = cos(rotation) * 0.5 360 | a = sin(rotation) * 0.5 361 | points = [] 362 | p0x = cx - a*h - b*w 363 | p0y = cy + b*h - a*w 364 | p1x = cx + a*h - b*w 365 | p1y = cy - b*h - a*w 366 | p2x = int(2*cx - p0x) 367 | p2y = int(2*cy - p0y) 368 | p3x = int(2*cx - p1x) 369 | p3y = int(2*cy - p1y) 370 | p0x, p0y, p1x, p1y = int(p0x), int(p0y), int(p1x), int(p1y) 371 | return [[p0x,p0y], [p1x,p1y], [p2x,p2y], [p3x,p3y]] 372 | 373 | def rect_transformation(body, w, h, scale = 1.25): 374 | """ 375 | w, h : image input shape 376 | """ 377 | # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_landmark/pose_detection_to_roi.pbtxt 378 | # # Expands pose rect with marging used during training. 379 | # node { 380 | # calculator: "RectTransformationCalculator" 381 | # input_stream: "NORM_RECT:raw_roi" 382 | # input_stream: "IMAGE_SIZE:image_size" 383 | # output_stream: "roi" 384 | # options: { 385 | # [mediapipe.RectTransformationCalculatorOptions.ext] { 386 | # Version 0831: 387 | # scale_x: 1.5 388 | # scale_y: 1.5 389 | # Version 084: 390 | # scale_x: 1.25 391 | # scale_y: 1.25 392 | # square_long: true 393 | # } 394 | # } 395 | # } 396 | scale_x = scale 397 | scale_y = scale 398 | shift_x = 0 399 | shift_y = 0 400 | 401 | width = body.rect_w 402 | height = body.rect_h 403 | rotation = body.rotation 404 | if rotation == 0: 405 | body.rect_x_center_a = (body.rect_x_center + width * shift_x) * w 406 | body.rect_y_center_a = (body.rect_y_center + height * shift_y) * h 407 | else: 408 | x_shift = (w * width * shift_x * cos(rotation) - h * height * shift_y * sin(rotation)) 409 | y_shift = (w * width * shift_x * sin(rotation) + h * height * shift_y * cos(rotation)) 410 | body.rect_x_center_a = body.rect_x_center*w + x_shift 411 | body.rect_y_center_a = body.rect_y_center*h + y_shift 412 | 413 | # square_long: true 414 | long_side = max(width * w, height * h) 415 | body.rect_w_a = long_side * scale_x 416 | body.rect_h_a = long_side * scale_y 417 | body.rect_points = rotated_rect_to_points(body.rect_x_center_a, body.rect_y_center_a, body.rect_w_a, body.rect_h_a, body.rotation) 418 | 419 | def warp_rect_img(rect_points, img, w, h): 420 | src = np.array(rect_points[1:], dtype=np.float32) # rect_points[0] is left bottom point ! 
421 | dst = np.array([(0, 0), (w, 0), (w, h)], dtype=np.float32) 422 | mat = cv2.getAffineTransform(src, dst) 423 | return cv2.warpAffine(img, mat, (w, h)) 424 | 425 | def distance(a, b): 426 | """ 427 | a, b: 2 points in 3D (x,y,z) 428 | """ 429 | return np.linalg.norm(a-b) 430 | 431 | def angle(a, b, c): 432 | # https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates 433 | # a, b and c : points as np.array([x, y, z]) 434 | ba = a - b 435 | bc = c - b 436 | cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc)) 437 | angle = np.arccos(cosine_angle) 438 | 439 | return np.degrees(angle) 440 | 441 | # 442 | def find_isp_scale_params(size, is_height=True): 443 | """ 444 | Find closest valid size close to 'size' and and the corresponding parameters to setIspScale() 445 | This function is useful to work around a bug in depthai where ImageManip is scrambling images that have an invalid size 446 | is_height : boolean that indicates if the value is the height or the width of the image 447 | Returns: valid size, (numerator, denominator) 448 | """ 449 | # We want size >= 288 450 | if size < 288: 451 | size = 288 452 | 453 | # We are looking for the list on integers that are divisible by 16 and 454 | # that can be written like n/d where n <= 16 and d <= 63 455 | if is_height: 456 | reference = 1080 457 | other = 1920 458 | else: 459 | reference = 1920 460 | other = 1080 461 | size_candidates = {} 462 | for s in range(288,reference,16): 463 | f = gcd(reference, s) 464 | n = s//f 465 | d = reference//f 466 | if n <= 16 and d <= 63 and int(round(other * n / d) % 2 == 0): 467 | size_candidates[s] = (n, d) 468 | 469 | # What is the candidate size closer to 'size' ? 470 | min_dist = -1 471 | for s in size_candidates: 472 | dist = abs(size - s) 473 | if min_dist == -1: 474 | min_dist = dist 475 | candidate = s 476 | else: 477 | if dist > min_dist: break 478 | candidate = s 479 | min_dist = dist 480 | return candidate, size_candidates[candidate] 481 | 482 | # 483 | # Filtering 484 | # 485 | 486 | class LandmarksSmoothingFilter: 487 | ''' 488 | Adapted from: https://github.com/google/mediapipe/blob/master/mediapipe/calculators/util/landmarks_smoothing_calculator.cc 489 | 490 | frequency, min_cutoff, beta, derivate_cutoff: 491 | See class OneEuroFilter description. 492 | min_allowed_object_scale: 493 | If calculated object scale is less than given value smoothing will be 494 | disabled and landmarks will be returned as is. Default=1e-6 495 | disable_value_scaling: 496 | Disable value scaling based on object size and use `1.0` instead. 497 | If not disabled, value scale is calculated as inverse value of object 498 | size. Object size is calculated as maximum side of rectangular bounding 499 | box of the object in XY plane. Default=False 500 | ''' 501 | def __init__(self, 502 | frequency=30, 503 | min_cutoff=1, 504 | beta=0, 505 | derivate_cutoff=1, 506 | min_allowed_object_scale=1e-6, 507 | disable_value_scaling=False 508 | ): 509 | self.frequency = frequency 510 | self.min_cutoff = min_cutoff 511 | self.beta = beta 512 | self.derivate_cutoff = derivate_cutoff 513 | self.min_allowed_object_scale = min_allowed_object_scale 514 | self.disable_value_scaling = disable_value_scaling 515 | self.init = True 516 | 517 | @staticmethod 518 | def get_object_scale(landmarks): 519 | # Estimate object scale to use its inverse value as velocity scale for 520 | # RelativeVelocityFilter. 
If value will be too small (less than 521 | # `options_.min_allowed_object_scale`) smoothing will be disabled and 522 | # landmarks will be returned as is. 523 | # Object scale is calculated as average between bounding box width and height 524 | # with sides parallel to axis. 525 | min_xy = np.min(landmarks[:,:2], axis=0) 526 | max_xy = np.max(landmarks[:,:2], axis=0) 527 | return np.mean(max_xy - min_xy) 528 | 529 | def apply(self, landmarks, timestamp, object_scale=0): 530 | # object_scale: in practice, we use the size of the rotated rectangle region.rect_w_a=region.rect_h_a 531 | 532 | # Initialize filters 533 | if self.init: 534 | self.filters = OneEuroFilter(self.frequency, self.min_cutoff, self.beta, self.derivate_cutoff) 535 | self.init = False 536 | 537 | # Get value scale as inverse value of the object scale. 538 | # If value is too small smoothing will be disabled and landmarks will be 539 | # returned as is. 540 | if self.disable_value_scaling: 541 | value_scale = 1 542 | else: 543 | object_scale = object_scale if object_scale else self.get_object_scale(landmarks) 544 | if object_scale < self.min_allowed_object_scale: 545 | return landmarks 546 | value_scale = 1 / object_scale 547 | 548 | return self.filters.apply(landmarks, value_scale, timestamp) 549 | 550 | def get_alpha(self, cutoff): 551 | ''' 552 | te = 1.0 / self.frequency 553 | tau = 1.0 / (2 * Math.PI * cutoff) 554 | result = 1 / (1.0 + (tau / te)) 555 | ''' 556 | return 1.0 / (1.0 + (self.frequency / (2 * pi * cutoff))) 557 | 558 | def reset(self): 559 | self.init = True 560 | 561 | class OneEuroFilter: 562 | ''' 563 | Adapted from: https://github.com/google/mediapipe/blob/master/mediapipe/util/filtering/one_euro_filter.cc 564 | Paper: https://cristal.univ-lille.fr/~casiez/1euro/ 565 | 566 | frequency: 567 | Frequency of incoming frames defined in seconds. Used 568 | only if can't be calculated from provided events (e.g. 569 | on the very first frame). Default=30 570 | min_cutoff: 571 | Minimum cutoff frequency. Start by tuning this parameter while 572 | keeping `beta=0` to reduce jittering to the desired level. 1Hz 573 | (the default value) is a a good starting point. 574 | beta: 575 | Cutoff slope. After `min_cutoff` is configured, start 576 | increasing `beta` value to reduce the lag introduced by the 577 | `min_cutoff`. Find the desired balance between jittering and lag. Default=0 578 | derivate_cutoff: 579 | Cutoff frequency for derivate. It is set to 1Hz in the 580 | original algorithm, but can be turned to further smooth the 581 | speed (i.e. derivate) on the object. Default=1 582 | ''' 583 | def __init__(self, 584 | frequency=30, 585 | min_cutoff=1, 586 | beta=0, 587 | derivate_cutoff=1, 588 | ): 589 | self.frequency = frequency 590 | self.min_cutoff = min_cutoff 591 | self.beta = beta 592 | self.derivate_cutoff = derivate_cutoff 593 | self.x = LowPassFilter(self.get_alpha(min_cutoff)) 594 | self.dx = LowPassFilter(self.get_alpha(derivate_cutoff)) 595 | self.last_timestamp = 0 596 | 597 | def get_alpha(self, cutoff): 598 | ''' 599 | te = 1.0 / self.frequency 600 | tau = 1.0 / (2 * Math.PI * cutoff) 601 | result = 1 / (1.0 + (tau / te)) 602 | ''' 603 | return 1.0 / (1.0 + (self.frequency / (2 * pi * cutoff))) 604 | 605 | def apply(self, value, value_scale, timestamp): 606 | ''' 607 | Applies filter to the value. 608 | timestamp in s associated with the value (for instance, 609 | timestamp of the frame where you got value from). 
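        Note: 'value' may be a scalar or a numpy array of landmarks (as passed by
        LandmarksSmoothingFilter); the underlying LowPassFilter handles both.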
610 | ''' 611 | if self.last_timestamp >= timestamp: 612 | # Results are unpreditable in this case, so nothing to do but return same value. 613 | return value 614 | 615 | # Update the sampling frequency based on timestamps. 616 | if self.last_timestamp != 0 and timestamp != 0: 617 | self.frequency = 1 / (timestamp - self.last_timestamp) 618 | self.last_timestamp = timestamp 619 | 620 | # Estimate the current variation per second. 621 | if self.x.has_last_raw_value(): 622 | dvalue = (value - self.x.last_raw_value()) * value_scale * self.frequency 623 | else: 624 | dvalue = 0 625 | edvalue = self.dx.apply_with_alpha(dvalue, self.get_alpha(self.derivate_cutoff)) 626 | 627 | # Use it to update the cutoff frequency 628 | cutoff = self.min_cutoff + self.beta * np.abs(edvalue) 629 | 630 | # filter the given value. 631 | return self.x.apply_with_alpha(value, self.get_alpha(cutoff)) 632 | 633 | class LowPassFilter: 634 | ''' 635 | Adapted from: https://github.com/google/mediapipe/blob/master/mediapipe/util/filtering/low_pass_filter.cc 636 | Note that 'value' can be a numpy array 637 | ''' 638 | def __init__(self, alpha=0.9): 639 | self.alpha = alpha 640 | self.initialized = False 641 | 642 | def apply(self, value): 643 | if self.initialized: 644 | # Regular lowpass filter. 645 | # result = alpha * value + (1 - alpha) * stored_value; 646 | result = self.alpha * value + (1 - self.alpha) * self.stored_value 647 | else: 648 | result = value 649 | self.initialized = True 650 | self.raw_value = value 651 | self.stored_value = result 652 | return result 653 | 654 | def apply_with_alpha(self, value, alpha): 655 | self.alpha = alpha 656 | return self.apply(value) 657 | 658 | def has_last_raw_value(self): 659 | return self.initialized 660 | 661 | def last_raw_value(self): 662 | return self.raw_value 663 | 664 | def last_value(self): 665 | return self.stored_value 666 | 667 | def reset(self): 668 | self.initialized = False 669 | 670 | 671 | -------------------------------------------------------------------------------- /models/convert_models.sh: -------------------------------------------------------------------------------- 1 | # This script has to be run from the docker container started by ./docker_tflite2tensorflow.sh 2 | 3 | source /opt/intel/openvino_2021/bin/setupvars.sh 4 | 5 | replace () { # Replace in file $1 occurences of the string $2 by $3 6 | sed "s/${2}/${3}/" $1 > tmpf 7 | mv tmpf $1 8 | } 9 | 10 | convert_model () { 11 | model_name=$1 12 | if [ -z "$2" ] 13 | then 14 | arg_mean_values="" 15 | else 16 | arg_mean_values="--mean_values ${2}" 17 | fi 18 | if [ -z "$3" ] 19 | then 20 | arg_scale_values="" 21 | else 22 | arg_scale_values="--scale_values ${3}" 23 | fi 24 | mean_values=$2 25 | scale_values=$3 26 | input_precision=$4 27 | tflite2tensorflow \ 28 | --model_path ${model_name}.tflite \ 29 | --model_output_path ${model_name} \ 30 | --flatc_path ~/flatc \ 31 | --schema_path ~/schema.fbs \ 32 | --output_pb \ 33 | --optimizing_for_openvino_and_myriad 34 | # For generating Openvino "non normalized input" models (the normalization would need to be made explictly in the code): 35 | #tflite2tensorflow \ 36 | # --model_path ${model_name}.tflite \ 37 | # --model_output_path ${model_name} \ 38 | # --flatc_path ~/flatc \ 39 | # --schema_path ~/schema.fbs \ 40 | # --output_openvino_and_myriad 41 | # Generate Openvino "normalized input" models 42 | /opt/intel/openvino_2021/deployment_tools/model_optimizer/mo_tf.py \ 43 | --saved_model_dir ${model_name} \ 44 | --model_name ${model_name} \ 45 | 
--data_type FP16 \
46 |     ${arg_mean_values} \
47 |     ${arg_scale_values} \
48 |     --reverse_input_channels
49 |   # For Interpolate layers, replace "half_pixel" by "align_corners" in coordinate_transformation_mode (works around a Model Optimizer bug)
50 |   if [ $model_name != "pose_detection" ]
51 |   then
52 |     replace ${model_name}.xml half_pixel align_corners
53 |   fi
54 | 
55 |   /opt/intel/openvino_2021/deployment_tools/inference_engine/lib/intel64/myriad_compile \
56 |     -m ${model_name}.xml \
57 |     -ip $input_precision \
58 |     -VPU_NUMBER_OF_SHAVES 4 \
59 |     -VPU_NUMBER_OF_CMX_SLICES 4 \
60 |     -o ${model_name}_sh4.blob
61 | }
62 | 
63 | convert_model pose_detection "[127.5,127.5,127.5]" "[127.5,127.5,127.5]" "u8"
64 | convert_model pose_landmark_full "" "" "fp16"
65 | convert_model pose_landmark_lite "" "" "fp16"
66 | convert_model pose_landmark_heavy "" "" "fp16"
67 | 
68 | 
69 | 
--------------------------------------------------------------------------------
/models/gen_blob_shave.sh:
--------------------------------------------------------------------------------
1 | # This script has to be run from the docker container started by ./docker_tflite2tensorflow.sh
2 | 
3 | usage ()
4 | {
5 |     echo "Generate a new blob with a specified number of shaves and cmx (nb cmx = nb shaves)"
6 |     echo "Usage: ${0} -m model -n nb_shaves"
7 |     echo "model = 'pd' for pose detection, 'full' or 'lite' or 'heavy' for landmarks"
8 |     echo "nb_shaves must be between 1 and 13"
9 | }
10 | 
11 | while getopts ":hm:n:" opt; do
12 |     case ${opt} in
13 |         h )
14 |             usage
15 |             exit 0
16 |             ;;
17 |         m )
18 |             model=$OPTARG
19 |             ;;
20 |         n )
21 |             nb_shaves=$OPTARG
22 |             ;;
23 |         : )
24 |             echo "Error: -$OPTARG requires an argument."
25 |             usage
26 |             exit 1
27 |             ;;
28 |         \? )
29 |             echo "Invalid option: -$OPTARG"
30 |             usage
31 |             exit 1
32 |             ;;
33 |     esac
34 | done
35 | if [ -z "$model" ]
36 | then
37 |     echo "Model not specified !"
38 |     usage
39 |     exit 1
40 | fi
41 | if [ $model == "pd" ]
42 | then
43 |     model="pose_detection"
44 |     input_precision="u8"
45 | elif [ $model == "full" ]
46 | then
47 |     model="pose_landmark_full"
48 |     input_precision="fp16"
49 | elif [ $model == "lite" ]
50 | then
51 |     model="pose_landmark_lite"
52 |     input_precision="fp16"
53 | elif [ $model == "heavy" ]
54 | then
55 |     model="pose_landmark_heavy"
56 |     input_precision="fp16"
57 | else
58 |     echo "Invalid model !"
59 |     usage
60 |     exit 1
61 | fi
62 | model_xml="${model}.xml"
63 | if [ ! -f $model_xml ]
64 | then
65 |     echo "The model ${model_xml} does not exist"
66 |     echo "Have you run convert_models.sh ?"
67 |     exit 1
68 | fi
69 | if [ -z "$nb_shaves" ]
70 | then
71 |     echo "The number of shaves must be specified with -n"
72 |     usage
73 |     exit 1
74 | fi
75 | if [ $nb_shaves -lt 1 -o $nb_shaves -gt 13 ]
76 | then
77 |     echo "Invalid number of shaves !"
78 | usage 79 | exit 1 80 | fi 81 | 82 | model_blob="${model}_sh${nb_shaves}.blob" 83 | 84 | echo Model: $model_xml $model_blob 85 | echo Shaves $nb_shaves 86 | 87 | source /opt/intel/openvino_2021/bin/setupvars.sh 88 | 89 | 90 | /opt/intel/openvino_2021/deployment_tools/inference_engine/lib/intel64/myriad_compile -m $model_xml -ip $input_precision -VPU_NUMBER_OF_SHAVES $nb_shaves -VPU_NUMBER_OF_CMX_SLICES $nb_shaves -o $model_blob 91 | 92 | -------------------------------------------------------------------------------- /models/pose_detection_sh4.blob: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/models/pose_detection_sh4.blob -------------------------------------------------------------------------------- /models/pose_landmark_full_sh4.blob: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/models/pose_landmark_full_sh4.blob -------------------------------------------------------------------------------- /models/pose_landmark_heavy_sh4.blob: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/models/pose_landmark_heavy_sh4.blob -------------------------------------------------------------------------------- /models/pose_landmark_lite_sh4.blob: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/models/pose_landmark_lite_sh4.blob -------------------------------------------------------------------------------- /o3d_utils.py: -------------------------------------------------------------------------------- 1 | import open3d as o3d 2 | import numpy as np 3 | 4 | 5 | # From : https://stackoverflow.com/a/59026582/8574085 6 | 7 | def calculate_zy_rotation_for_arrow(v): 8 | """ 9 | Calculates the rotations required to go from the vector v to the 10 | z axis vector. The first rotation that is 11 | calculated is over the z axis. This will leave the vector v on the 12 | XZ plane. Then, the rotation over the y axis. 13 | 14 | Returns the angles of rotation over axis z and y required to 15 | get the vector v into the same orientation as axis z 16 | 17 | Args: 18 | - v (): 19 | """ 20 | # Rotation over z axis 21 | gamma = np.arctan(v[1]/v[0]) 22 | Rz = np.array([[np.cos(gamma),-np.sin(gamma),0], 23 | [np.sin(gamma),np.cos(gamma),0], 24 | [0,0,1]]) 25 | # Rotate v to calculate next rotation 26 | v = Rz.T@v.reshape(-1,1) 27 | v = v.reshape(-1) 28 | # Rotation over y axis 29 | beta = np.arctan(v[0]/v[2]) 30 | Ry = np.array([[np.cos(beta),0,np.sin(beta)], 31 | [0,1,0], 32 | [-np.sin(beta),0,np.cos(beta)]]) 33 | return Rz @ Ry 34 | 35 | def create_cylinder(height=1, radius=None, resolution=20): 36 | """ 37 | Create an cylinder in Open3D 38 | """ 39 | radius = height/20 if radius is None else radius 40 | mesh_frame = o3d.geometry.TriangleMesh.create_cylinder( 41 | radius=radius, 42 | height=height, 43 | resolution=resolution) 44 | return(mesh_frame) 45 | 46 | def create_segment(a, b, radius=0.05, color=(1,1,0), resolution=20): 47 | """ 48 | Creates an line(cylinder) from an pointa to point b, 49 | or create an line from a vector v starting from origin. 
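    The cylinder is first created along the z axis, then rotated with the matrix
    returned by calculate_zy_rotation_for_arrow() so that it points from a to b,
    and finally translated to the midpoint of the segment.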
50 | Args: 51 | - a, b: End points [x,y,z] 52 | - radius: radius cylinder 53 | """ 54 | a = np.array(a) 55 | b = np.array(b) 56 | T = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) 57 | T[:3, -1] = a 58 | v = b-a 59 | 60 | height = np.linalg.norm(v) 61 | if height == 0: return None 62 | R = calculate_zy_rotation_for_arrow(v) 63 | mesh = create_cylinder(height, radius) 64 | mesh.rotate(R, center=np.array([0, 0, 0])) 65 | mesh.translate((a+b)/2) 66 | mesh.paint_uniform_color(color) 67 | mesh.compute_vertex_normals() 68 | return mesh 69 | 70 | def create_tetra(p1, p2, p3, p4, color=(1,1,0)): 71 | vertices = o3d.utility.Vector3dVector([p1, p2, p3, p4]) 72 | tetras = o3d.utility.Vector4iVector([[0, 1, 2, 3]]) 73 | mesh = o3d.geometry.TetraMesh(vertices, tetras) 74 | mesh.paint_uniform_color(color) 75 | return mesh 76 | 77 | def create_grid(p0, p1, p2, p3, ni1, ni2, color=(0,0,0)): 78 | ''' 79 | p0, p1, p2, p3 : points defining a quadrilateral 80 | ni1: nb of equidistant intervals on segments p0p1 and p3p2 81 | ni2: nb of equidistant intervals on segments p1p2 and p0p3 82 | ''' 83 | p0 = np.array(p0) 84 | p1 = np.array(p1) 85 | p2 = np.array(p2) 86 | p3 = np.array(p3) 87 | 88 | vertices = [p0, p1, p2, p3] 89 | lines = [[0,1],[0,3],[1,2],[2,3]] 90 | for i in range(1,ni1): 91 | l = len(vertices) 92 | vertices.append((p0*(ni1-i)+p1*i)/ni1) 93 | vertices.append((p3*(ni1-i)+p2*i)/ni1) 94 | lines.append([l,l+1]) 95 | for i in range(1,ni2): 96 | l = len(vertices) 97 | vertices.append((p1*(ni2-i)+p2*i)/ni2) 98 | vertices.append((p0*(ni2-i)+p3*i)/ni2) 99 | lines.append([l,l+1]) 100 | vertices = o3d.utility.Vector3dVector(vertices) 101 | lines = o3d.utility.Vector2iVector(lines) 102 | mesh = o3d.geometry.LineSet(vertices, lines) 103 | mesh.paint_uniform_color(color) 104 | return mesh 105 | 106 | 107 | def create_coord_frame(origin=[0, 0, 0],size=1): 108 | mesh = o3d.geometry.TriangleMesh.create_coordinate_frame(size=size) 109 | mesh.translate(origin) 110 | return mesh 111 | 112 | 113 | # Visu3d : custom class used to visualize 3d skeleton 114 | class Visu3D: 115 | def __init__(self, bg_color=[0,0,0], zoom=1, segment_radius=1): 116 | self.vis = o3d.visualization.VisualizerWithKeyCallback() 117 | self.vis.create_window() 118 | opt = self.vis.get_render_option() 119 | opt.background_color = np.asarray(bg_color) 120 | # Defining callbacks - Key codes: https://www.glfw.org/docs/latest/group__keys.html 121 | self.vis.register_key_callback(ord("R"), self.start_rotating) 122 | self.vis.register_key_callback(ord("O"), self.start_oscillating) 123 | self.vis.register_key_callback(ord("S"), self.stop_moving) 124 | self.vis.register_key_callback(262, self.turn_view_right) # Right arrow 125 | self.vis.register_key_callback(263, self.turn_view_left) # Left arrow 126 | self.vis.register_key_callback(265, self.incr_rot_speed) # Up arrow 127 | self.vis.register_key_callback(264, self.decr_rot_speed) # Down arrow 128 | self.view_control = self.vis.get_view_control() 129 | self.zoom = zoom 130 | self.segment_radius = segment_radius 131 | self.move = "oscillate" 132 | self.angle = 0 133 | self.direction = 1 134 | self.oscillate_angle = 200 135 | self.geometries = [] 136 | 137 | def set_view(self): 138 | if self.angle_view % 4 == 0: 139 | ax = 0 140 | elif self.angle_view <= 3: 141 | ax = 1 142 | else: 143 | ax = -1 144 | if self.angle_view == 2 or self.angle_view == 6: 145 | az = 0 146 | elif 3 <= self.angle_view <= 5: 147 | az = 1 148 | else: 149 | az = -1 150 | 
self.view_control.set_front(np.array([ax,0,az])) 151 | self.view_control.set_up(np.array([0,-1,0])) 152 | 153 | def init_view(self): 154 | self.angle_view = 0 155 | self.rot_speed = 2 156 | self.set_view() 157 | self.view_control.set_zoom(self.zoom) 158 | 159 | def create_grid(self, p0, p1, p2, p3, ni1, ni2, color=(1,1,1)): 160 | ''' 161 | p0, p1, p2, p3 : points defining a quadrilateral 162 | ni1: nb of equidistant intervals on segments p0p1 and p3p2 163 | ni2: nb of equidistant intervals on segments p1p2 and p0p3 164 | ''' 165 | grid = create_grid(p0, p1, p2, p3, ni1, ni2, color) 166 | self.vis.add_geometry(grid) 167 | self.geometries.append(grid) 168 | 169 | def create_camera(self): 170 | cam = o3d.geometry.TriangleMesh.create_arrow(cylinder_radius=0.02, cone_radius=0.03, cylinder_height=0.1, cone_height=0.08) 171 | cam.paint_uniform_color([0.2,0.7,1]) 172 | cam.compute_vertex_normals() 173 | self.geometries.append(cam) 174 | 175 | def add_geometries(self): 176 | for geo in self.geometries: 177 | self.vis.add_geometry(geo, reset_bounding_box=False) 178 | 179 | def add_segment(self, p1, p2, radius=None, color=[1,1,1]): 180 | radius = self.segment_radius if radius is None else radius 181 | line = create_segment(p1, p2, radius=radius, color=color) 182 | if line: self.vis.add_geometry(line, reset_bounding_box=False) 183 | 184 | def clear(self): 185 | self.vis.clear_geometries() 186 | 187 | # Callback 188 | def incr_rot_speed(self, vis): 189 | if self.move == "rotate": 190 | if self.rot_speed * self.direction == -1: 191 | self.direction = 1 192 | else: 193 | self.rot_speed += self.direction 194 | else: 195 | self.rot_speed += 1 196 | # Callback 197 | def decr_rot_speed(self, vis): 198 | if self.move == "rotate": 199 | if self.rot_speed * self.direction == 1: 200 | self.direction = -1 201 | else: 202 | self.rot_speed -= self.direction 203 | else: 204 | self.rot_speed = max (1, self.rot_speed-1) 205 | # Callback 206 | def turn_view_right(self, vis): 207 | self.angle_view = (self.angle_view + 1) %8 208 | self.set_view() 209 | self.move = None 210 | # Callback 211 | def turn_view_left(self, vis): 212 | self.angle_view = (self.angle_view - 1) %8 213 | self.set_view() 214 | self.move = None 215 | # Callback 216 | def start_rotating(self, vis): 217 | self.move = "rotate" 218 | # Callback 219 | def start_oscillating(self, vis): 220 | self.move = "oscillate" 221 | self.angle = 0 222 | # Callback 223 | def stop_moving(self, vis): 224 | self.move = None 225 | 226 | def try_move(self): 227 | if self.move == "rotate": 228 | self.view_control.rotate(self.rot_speed * self.direction,0) 229 | elif self.move == "oscillate": 230 | self.view_control.rotate(self.rot_speed * self.direction,0) 231 | self.angle += self.rot_speed * self.direction 232 | if abs(self.angle) >= self.oscillate_angle: 233 | self.direction = - self.direction 234 | 235 | def render(self): 236 | self.vis.poll_events() 237 | self.vis.update_renderer() 238 | 239 | 240 | 241 | if __name__ == "__main__": 242 | 243 | line = create_segment([0, 0, 0], [1, 0, 0], color=(1,0,0)) 244 | line2 = create_segment([1, 0, 0], [1, 1, 0], color=(0,1,0)) 245 | line3 = create_segment([1, 1, 0], [0, 0, 0], radius=0.1) 246 | grid = create_grid([0,0,0],[0,0,1],[0,1,1],[0,1,0], 3, 2) 247 | frame =create_coord_frame() 248 | print(grid) 249 | # Draw everything 250 | o3d.visualization.draw_geometries([line, line2, line3, grid, frame]) -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | opencv-python >= 4.5.1.48 2 | open3d 3 | depthai >= 2.10 4 | -------------------------------------------------------------------------------- /template_manager_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is the template of the scripting node source code in edge mode 3 | Substitution is made in BlazeposeDepthaiEdge.py 4 | 5 | In the following: 6 | rrn_ : normalized [0:1] coordinates in rotated rectangle coordinate systems 7 | sqn_ : normalized [0:1] coordinates in squared input image 8 | """ 9 | 10 | ${_TRACE} ("Starting manager script node") 11 | 12 | import marshal 13 | from math import sin, cos, atan2, pi, hypot, degrees, floor 14 | 15 | # Indexes of some keypoints 16 | left_shoulder = 11 17 | right_shoulder = 12 18 | left_hip = 23 19 | right_hip = 24 20 | 21 | ${_IF_XYZ} 22 | # We use a filter for smoothing the reference point (mid hips or mid shoulders) 23 | # for which we fetch the (x,y,z). Without this filter, the reference point is very shaky 24 | class SmoothingFilter: 25 | def __init__(self, alpha): 26 | self.alpha = alpha 27 | self.initialized = False 28 | def apply(self, x): 29 | if self.initialized: 30 | result = self.alpha * x + (1 - self.alpha) * self.prev_x 31 | else: 32 | result = x 33 | self.initialized = True 34 | self.prev_x = result 35 | return int(result) 36 | def reset(self): 37 | self.initialized = False 38 | 39 | filter_x = SmoothingFilter(0.5) 40 | filter_y = SmoothingFilter(0.5) 41 | ${_IF_XYZ} 42 | 43 | # BufferMgr is used to statically allocate buffers once 44 | # (replace dynamic allocation). 45 | # These buffers are used for sending result to host 46 | class BufferMgr: 47 | def __init__(self): 48 | self._bufs = {} 49 | def __call__(self, size): 50 | try: 51 | buf = self._bufs[size] 52 | except KeyError: 53 | buf = self._bufs[size] = Buffer(size) 54 | ${_TRACE} (f"New buffer allocated: {size}") 55 | return buf 56 | 57 | buffer_mgr = BufferMgr() 58 | 59 | def send_result(type, lm_score=0, rect_center_x=0, rect_center_y=0, rect_size=0, rotation=0, lms=0, lms_world=0, xyz_ref=0, xyz=0, xyz_zone=0): 60 | # type : 0, 1 or 2 61 | # 0 : pose detection only (detection score < threshold) 62 | # 1 : pose detection + landmark regression 63 | # 2 : landmark regression only (ROI computed from previous landmarks) 64 | result = dict([("type", type), ("lm_score", lm_score), ("rotation", rotation), 65 | ("rect_center_x", rect_center_x), ("rect_center_y", rect_center_y), ("rect_size", rect_size), 66 | ("lms", lms), ('lms_world', lms_world), 67 | ("xyz_ref", xyz_ref), ("xyz", xyz), ("xyz_zone", xyz_zone)]) 68 | result_serial = marshal.dumps(result, 2) 69 | buffer = buffer_mgr(len(result_serial)) 70 | buffer.getData()[:] = result_serial 71 | ${_TRACE} ("len result:"+str(len(result_serial))) 72 | 73 | node.io['host'].send(buffer) 74 | ${_TRACE} ("Manager sent result to host") 75 | 76 | 77 | def rr2img(rrn_x, rrn_y): 78 | # Convert a point (rrn_x, rrn_y) expressed in normalized rotated rectangle (rrn) 79 | # into (X, Y) expressed in normalized image (sqn) 80 | X = sqn_rr_center_x + sqn_rr_size * ((rrn_x - 0.5) * cos_rot + (0.5 - rrn_y) * sin_rot) 81 | Y = sqn_rr_center_y + sqn_rr_size * ((rrn_y - 0.5) * cos_rot + (rrn_x - 0.5) * sin_rot) 82 | return X, Y 83 | 84 | norm_pad_size = ${_pad_h} / ${_frame_size} 85 | 86 | def is_visible(lm_id): 87 | # Is the landmark lm_id is visible ? 
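    # Reminder on the 'lms' layout: the landmark output packs 5 consecutive values per
    # keypoint (x and y in the 256x256 crop, z, then visibility and presence scores),
    # hence the lm_id*5+3 index used below to read the visibility score.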
88 | # Here visibility means inferred visibility from the landmark model 89 | return lms[lm_id*5+3] > ${_visibility_threshold} 90 | 91 | def is_in_image(sqn_x, sqn_y): 92 | # Is the point (sqn_x, sqn_y) is included in the useful part of the image (excluding the pads)? 93 | return norm_pad_size < sqn_y < 1 - norm_pad_size 94 | 95 | # send_new_frame_to_branch defines on which branch new incoming frames are sent 96 | # 1 = pose detection branch 97 | # 2 = landmark branch 98 | send_new_frame_to_branch = 1 99 | 100 | next_roi_lm_idx = 33*5 101 | 102 | cfg_pre_pd = ImageManipConfig() 103 | cfg_pre_pd.setResizeThumbnail(224, 224, 0, 0, 0) 104 | 105 | while True: 106 | if send_new_frame_to_branch == 1: # Routing frame to pd 107 | node.io['pre_pd_manip_cfg'].send(cfg_pre_pd) 108 | ${_TRACE} ("Manager sent thumbnail config to pre_pd manip") 109 | # Wait for pd post processing's result 110 | detection = node.io['from_post_pd_nn'].get().getLayerFp16("result") 111 | ${_TRACE} ("Manager received pd result: "+str(detection)) 112 | pd_score, sqn_rr_center_x, sqn_rr_center_y, sqn_scale_x, sqn_scale_y = detection 113 | if pd_score < ${_pd_score_thresh}: 114 | send_result(0) 115 | continue 116 | scale_center_x = sqn_scale_x - sqn_rr_center_x 117 | scale_center_y = sqn_scale_y - sqn_rr_center_y 118 | sqn_rr_size = 2 * ${_rect_transf_scale} * hypot(scale_center_x, scale_center_y) 119 | rotation = 0.5 * pi - atan2(-scale_center_y, scale_center_x) 120 | rotation = rotation - 2 * pi *floor((rotation + pi) / (2 * pi)) 121 | ${_IF_XYZ} 122 | filter_x.reset() 123 | filter_y.reset() 124 | ${_IF_XYZ} 125 | 126 | 127 | # Routing frame to lm 128 | sin_rot = sin(rotation) # We will often need these values later 129 | cos_rot = cos(rotation) 130 | # Tell pre_lm_manip how to crop body region 131 | rr = RotatedRect() 132 | rr.center.x = sqn_rr_center_x 133 | rr.center.y = (sqn_rr_center_y * ${_frame_size} - ${_pad_h}) / ${_img_h} 134 | rr.size.width = sqn_rr_size 135 | rr.size.height = sqn_rr_size * ${_frame_size} / ${_img_h} 136 | rr.angle = degrees(rotation) 137 | cfg = ImageManipConfig() 138 | cfg.setCropRotatedRect(rr, True) 139 | cfg.setResize(256, 256) 140 | node.io['pre_lm_manip_cfg'].send(cfg) 141 | ${_TRACE} ("Manager sent config to pre_lm manip") 142 | 143 | # Wait for lm's result 144 | lm_result = node.io['from_lm_nn'].get() 145 | ${_TRACE} ("Manager received result from lm nn") 146 | lm_score = lm_result.getLayerFp16("Identity_1")[0] 147 | if lm_score > ${_lm_score_thresh}: 148 | lms = lm_result.getLayerFp16("Identity") 149 | lms_world = lm_result.getLayerFp16("Identity_4")[:99] 150 | 151 | xyz = 0 152 | xyz_zone = 0 153 | xyz_ref = 0 154 | # Query xyz 155 | ${_IF_XYZ} 156 | # Choosing the reference point: mid hips if hips visible, or mid shoulders otherwise 157 | # xyz_ref codes the reference point, 1 if mid hips, 2 if mid shoulders, 0 if no reference point 158 | if is_visible(right_hip) and is_visible(left_hip): 159 | kp1 = right_hip 160 | kp2 = left_hip 161 | rrn_xyz_ref_x = (lms[5*kp1] + lms[5*kp2]) / 512 # 512 = 256*2 (256 for normalizing, 2 for the mean) 162 | rrn_xyz_ref_y = (lms[5*kp1+1] + lms[5*kp2+1]) / 512 163 | sqn_xyz_ref_x, sqn_xyz_ref_y = rr2img(rrn_xyz_ref_x, rrn_xyz_ref_y) 164 | if is_in_image(sqn_xyz_ref_x, sqn_xyz_ref_y): 165 | xyz_ref = 1 166 | if xyz_ref == 0 and is_visible(right_shoulder) and is_visible(left_shoulder): 167 | kp1 = right_shoulder 168 | kp2 = left_shoulder 169 | rrn_xyz_ref_x = (lms[5*kp1] + lms[5*kp2]) / 512 # 512 = 256*2 (256 for normalizing, 2 for the mean) 170 | 
rrn_xyz_ref_y = (lms[5*kp1+1] + lms[5*kp2+1]) / 512 171 | sqn_xyz_ref_x, sqn_xyz_ref_y = rr2img(rrn_xyz_ref_x, rrn_xyz_ref_y) 172 | if is_in_image(sqn_xyz_ref_x, sqn_xyz_ref_y): 173 | xyz_ref = 2 174 | if xyz_ref: 175 | cfg = SpatialLocationCalculatorConfig() 176 | conf_data = SpatialLocationCalculatorConfigData() 177 | conf_data.depthThresholds.lowerThreshold = 100 178 | conf_data.depthThresholds.upperThreshold = 10000 179 | half_zone_size = max(int(sqn_rr_size * ${_frame_size} / 90), 4) 180 | xc = filter_x.apply(sqn_xyz_ref_x * ${_frame_size} + ${_crop_w}) 181 | yc = filter_y.apply(sqn_xyz_ref_y * ${_frame_size} - ${_pad_h}) 182 | roi_left = max(0, xc - half_zone_size) 183 | roi_right = min(${_img_w}-1, xc + half_zone_size) 184 | roi_top = max(0, yc - half_zone_size) 185 | roi_bottom = min(${_img_h}-1, yc + half_zone_size) 186 | roi_topleft = Point2f(roi_left, roi_top) 187 | roi_bottomright = Point2f(roi_right, roi_bottom) 188 | conf_data.roi = Rect(roi_topleft, roi_bottomright) 189 | cfg = SpatialLocationCalculatorConfig() 190 | cfg.addROI(conf_data) 191 | node.io['spatial_location_config'].send(cfg) 192 | ${_TRACE} ("Manager sent ROI to spatial_location_config") 193 | 194 | # Wait xyz response 195 | xyz_data = node.io['spatial_data'].get().getSpatialLocations() 196 | ${_TRACE} ("Manager received spatial_location") 197 | coords = xyz_data[0].spatialCoordinates 198 | xyz = [float(coords.x), float(coords.y), float(coords.z)] 199 | roi = xyz_data[0].config.roi 200 | xyz_zone = [int(roi.topLeft().x - ${_crop_w}), int(roi.topLeft().y), int(roi.bottomRight().x - ${_crop_w}), int(roi.bottomRight().y)] 201 | else: 202 | xyz = [0.0] * 3 203 | xyz_zone = [0] * 4 204 | ${_IF_XYZ} 205 | 206 | # Send result to host 207 | send_result(send_new_frame_to_branch, lm_score, sqn_rr_center_x, sqn_rr_center_y, sqn_rr_size, rotation, lms, lms_world, xyz_ref, xyz, xyz_zone) 208 | 209 | if not ${_force_detection}: 210 | send_new_frame_to_branch = 2 211 | # Calculate the ROI for next frame 212 | rrn_rr_center_x = lms[next_roi_lm_idx] / 256 213 | rrn_rr_center_y = lms[next_roi_lm_idx+1] / 256 214 | rrn_scale_x = lms[next_roi_lm_idx+5] / 256 215 | rrn_scale_y = lms[next_roi_lm_idx+6] / 256 216 | sqn_scale_x, sqn_scale_y = rr2img(rrn_scale_x, rrn_scale_y) 217 | sqn_rr_center_x, sqn_rr_center_y = rr2img(rrn_rr_center_x, rrn_rr_center_y) 218 | scale_center_x = sqn_scale_x - sqn_rr_center_x 219 | scale_center_y = sqn_scale_y - sqn_rr_center_y 220 | sqn_rr_size = 2 * ${_rect_transf_scale} * hypot(scale_center_x, scale_center_y) 221 | rotation = 0.5 * pi - atan2(-scale_center_y, scale_center_x) 222 | rotation = rotation - 2 * pi *floor((rotation + pi) / (2 * pi)) 223 | 224 | else: 225 | send_result(send_new_frame_to_branch, lm_score) 226 | send_new_frame_to_branch = 1 227 | --------------------------------------------------------------------------------
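A minimal, illustrative host-side sketch (not one of the repository files above): it shows how the marshal-serialized dict produced by send_result() in template_manager_script.py could be decoded on the host. The stream name "manager_out" and the 'device' variable are hypothetical placeholders; the actual XLinkOut stream is created in BlazeposeDepthaiEdge.py.

import marshal
import numpy as np

def decode_manager_result(raw_bytes):
    # send_result() serializes a plain dict with marshal (protocol 2)
    res = marshal.loads(raw_bytes)
    lms = None
    if res["lms"]:
        # 5 values per landmark; keep the 33 body landmarks (the remaining entries
        # are the auxiliary keypoints used to compute the ROI of the next frame)
        lms = np.array(res["lms"]).reshape(-1, 5)[:33]
    return res, lms

# Typical use (assuming 'device' is a running dai.Device with the edge pipeline loaded):
# q = device.getOutputQueue(name="manager_out")      # hypothetical stream name
# res, lms = decode_manager_result(q.get().getData().tobytes())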