├── BlazeposeDepthai.py
├── BlazeposeDepthaiEdge.py
├── BlazeposeRenderer.py
├── FPS.py
├── LICENSE.txt
├── README.md
├── custom_models
│   ├── DetectionBestCandidate.py
│   ├── DetectionBestCandidate_sh1.blob
│   ├── DivideBy255.py
│   ├── DivideBy255_sh1.blob
│   ├── README.md
│   └── convert_model.sh
├── demo.py
├── docker_tflite2tensorflow.sh
├── examples
│   └── semaphore_alphabet
│       ├── README.md
│       ├── demo.py
│       └── medias
│           ├── semaphore.gif
│           ├── semaphore.mp4
│           └── video.mp4
├── img
│   ├── 3d_visualizations.gif
│   ├── 3d_world_visualization.gif
│   ├── full_body_landmarks.png
│   ├── pipeline_edge_mode.png
│   ├── pipeline_host_mode.png
│   └── taichi.gif
├── mediapipe_utils.py
├── models
│   ├── convert_models.sh
│   ├── gen_blob_shave.sh
│   ├── pose_detection_sh4.blob
│   ├── pose_landmark_full_sh4.blob
│   ├── pose_landmark_heavy_sh4.blob
│   └── pose_landmark_lite_sh4.blob
├── o3d_utils.py
├── requirements.txt
└── template_manager_script.py
/BlazeposeDepthai.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import mediapipe_utils as mpu
3 | import cv2
4 | from pathlib import Path
5 | from FPS import FPS, now
6 | from math import sin, cos
7 | import depthai as dai
8 | import time, sys
9 |
10 | SCRIPT_DIR = Path(__file__).resolve().parent
11 | POSE_DETECTION_MODEL = str(SCRIPT_DIR / "models/pose_detection_sh4.blob")
12 | LANDMARK_MODEL_FULL = str(SCRIPT_DIR / "models/pose_landmark_full_sh4.blob")
13 | LANDMARK_MODEL_HEAVY = str(SCRIPT_DIR / "models/pose_landmark_heavy_sh4.blob")
14 | LANDMARK_MODEL_LITE = str(SCRIPT_DIR / "models/pose_landmark_lite_sh4.blob")
15 |
16 |
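# Helper used below to feed frames to the neural networks: resize a BGR frame to the
# requested (width, height) and convert it from OpenCV's interleaved HWC layout to the
# flattened planar CHW layout expected by depthai ImgFrame / NNData inputs.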
17 | def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
18 | return cv2.resize(arr, shape).transpose(2,0,1).flatten()
19 |
20 | class BlazeposeDepthai:
21 | """
22 | Blazepose body pose detector
23 | Arguments:
24 | - input_src: frame source,
25 | - "rgb" or None: OAK* internal color camera,
26 | - "rgb_laconic": same as "rgb" but without sending the frames to the host,
27 | - a file path of an image or a video,
28 | - an integer (eg 0) for a webcam id,
29 | - pd_model: Blazepose detection model blob file (if None, takes the default value POSE_DETECTION_MODEL),
30 |     - pd_score_thresh: confidence score to determine whether a detection is reliable (a float between 0 and 1).
31 | - lm_model: Blazepose landmark model blob file
32 | - None or "full": the default blob file LANDMARK_MODEL_FULL,
33 | - "lite": the default blob file LANDMARK_MODEL_LITE,
34 | - "heavy": default blob file LANDMARK_MODEL_HEAVY,
35 | - a path of a blob file.
36 | - lm_score_thresh : confidence score to determine whether landmarks prediction is reliable (a float between 0 and 1).
37 | - xyz: boolean, when True get the (x, y, z) coords of the reference point (center of the hips) (if the device supports depth measures).
38 | - crop : boolean which indicates if square cropping is done or not
39 | - smoothing: boolean which indicates if smoothing filtering is applied
40 | - filter_window_size and filter_velocity_scale:
41 | The filter keeps track (on a window of specified size) of
42 | value changes over time, which as result gives velocity of how value
43 | changes over time. With higher velocity it weights new values higher.
44 | - higher filter_window_size adds to lag and to stability
45 | - lower filter_velocity_scale adds to lag and to stability
46 |
47 | - internal_fps : when using the internal color camera as input source, set its FPS to this value (calling setFps()).
48 | - resolution : sensor resolution "full" (1920x1080) or "ultra" (3840x2160),
49 | - internal_frame_height : when using the internal color camera, set the frame height (calling setIspScale()).
50 |                             The width is calculated according to the height and depends on the value of 'crop'
51 | - stats : boolean, when True, display some statistics when exiting.
52 | - trace: boolean, when True print some debug messages
53 | - force_detection: boolean, force person detection on every frame (never use landmarks from previous frame to determine ROI)
54 | """
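    # A minimal usage sketch, assuming the loop that demo.py typically builds around this
    # class; the argument values and the ESC key handling below are illustrative assumptions:
    #
    #   from BlazeposeDepthai import BlazeposeDepthai
    #   from BlazeposeRenderer import BlazeposeRenderer
    #
    #   tracker = BlazeposeDepthai(input_src="rgb", lm_model="lite", xyz=True)
    #   renderer = BlazeposeRenderer(tracker, show_3d="world")
    #   while True:
    #       frame, body = tracker.next_frame()   # body is None when no body is detected
    #       if frame is None:
    #           break
    #       frame = renderer.draw(frame, body)
    #       if renderer.waitKey(delay=1) == 27:  # assumed: quit on ESC
    #           break
    #   renderer.exit()
    #   tracker.exit()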
55 | def __init__(self, input_src="rgb",
56 | pd_model=None,
57 | pd_score_thresh=0.5,
58 | lm_model=None,
59 | lm_score_thresh=0.7,
60 | xyz=False,
61 | crop=False,
62 | smoothing= True,
63 | internal_fps=None,
64 | resolution="full",
65 | internal_frame_height=1080,
66 | stats=False,
67 | trace=False,
68 | force_detection=False
69 | ):
70 |
71 | self.pd_model = pd_model if pd_model else POSE_DETECTION_MODEL
72 | print(f"Pose detection blob file : {self.pd_model}")
73 | self.rect_transf_scale = 1.25
74 | if lm_model is None or lm_model == "full":
75 | self.lm_model = LANDMARK_MODEL_FULL
76 | elif lm_model == "lite":
77 | self.lm_model = LANDMARK_MODEL_LITE
78 | elif lm_model == "heavy":
79 | self.lm_model = LANDMARK_MODEL_HEAVY
80 | else:
81 | self.lm_model = lm_model
82 | print(f"Landmarks using blob file : {self.lm_model}")
83 |
84 | self.pd_score_thresh = pd_score_thresh
85 | self.lm_score_thresh = lm_score_thresh
86 | self.smoothing = smoothing
87 | self.crop = crop
88 | self.internal_fps = internal_fps
89 | self.stats = stats
90 | self.force_detection = force_detection
91 | self.presence_threshold = 0.5
92 | self.visibility_threshold = 0.5
93 |
94 | self.device = dai.Device()
95 | self.xyz = False
96 |
97 | if input_src == None or input_src == "rgb" or input_src == "rgb_laconic":
98 |             # Note that here (in Host mode), specifying "rgb_laconic" has no effect:
99 |             # the color camera frames are always transferred to the host
100 | self.input_type = "rgb" # OAK* internal color camera
101 | if internal_fps is None:
102 |                 if "heavy" in str(self.lm_model):
103 |                     self.internal_fps = 10
104 |                 elif "full" in str(self.lm_model):
105 |                     self.internal_fps = 8
106 |                 else: # Lite
107 |                     self.internal_fps = 13
108 | else:
109 | self.internal_fps = internal_fps
110 | print(f"Internal camera FPS set to: {self.internal_fps}")
111 | if resolution == "full":
112 | self.resolution = (1920, 1080)
113 | elif resolution == "ultra":
114 | self.resolution = (3840, 2160)
115 | else:
116 | print(f"Error: {resolution} is not a valid resolution !")
117 | sys.exit()
118 | print("Sensor resolution:", self.resolution)
119 |
120 | self.video_fps = self.internal_fps # Used when saving the output in a video file. Should be close to the real fps
121 |
122 | if xyz:
123 | # Check if the device supports stereo
124 | cameras = self.device.getConnectedCameras()
125 | if dai.CameraBoardSocket.LEFT in cameras and dai.CameraBoardSocket.RIGHT in cameras:
126 | self.xyz = True
127 | else:
128 | print("Warning: depth unavailable on this device, 'xyz' argument is ignored")
129 |
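            # mpu.find_isp_scale_params() (defined in mediapipe_utils.py) returns a
            # (numerator, denominator) pair for ColorCamera.setIspScale() such that the
            # scaled sensor output comes as close as possible to the requested frame
            # height (crop mode) or width (non-crop mode).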
130 | if self.crop:
131 | self.frame_size, self.scale_nd = mpu.find_isp_scale_params(internal_frame_height)
132 | self.img_h = self.img_w = self.frame_size
133 | self.pad_w = self.pad_h = 0
134 | self.crop_w = (int(round(self.resolution[0] * self.scale_nd[0] / self.scale_nd[1])) - self.img_w) // 2
135 |
136 | else:
137 | width, self.scale_nd = mpu.find_isp_scale_params(internal_frame_height * 1920 / 1080, is_height=False)
138 | self.img_h = int(round(self.resolution[1] * self.scale_nd[0] / self.scale_nd[1]))
139 | self.img_w = int(round(self.resolution[0] * self.scale_nd[0] / self.scale_nd[1]))
140 | self.pad_h = (self.img_w - self.img_h) // 2
141 | self.pad_w = 0
142 | self.frame_size = self.img_w
143 | self.crop_w = 0
144 |
145 | print(f"Internal camera image size: {self.img_w} x {self.img_h} - crop_w:{self.crop_w} pad_h: {self.pad_h}")
146 |
147 | elif input_src.endswith('.jpg') or input_src.endswith('.png') :
148 | self.input_type= "image"
149 | self.img = cv2.imread(input_src)
150 | self.video_fps = 25
151 | self.img_h, self.img_w = self.img.shape[:2]
152 | else:
153 | self.input_type = "video"
154 | if input_src.isdigit():
155 |                 self.input_type = "webcam"
156 | input_src = int(input_src)
157 | self.cap = cv2.VideoCapture(input_src)
158 | self.video_fps = int(self.cap.get(cv2.CAP_PROP_FPS))
159 | self.img_w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
160 | self.img_h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
161 | print("Video FPS:", self.video_fps)
162 |
163 | if self.input_type != "rgb":
164 | print(f"Original frame size: {self.img_w}x{self.img_h}")
165 | if self.crop:
166 | self.frame_size = min(self.img_w, self.img_h) # // 16 * 16
167 | else:
168 | self.frame_size = max(self.img_w, self.img_h) #// 16 * 16
169 | self.crop_w = max((self.img_w - self.frame_size) // 2, 0)
170 | if self.crop_w: print("Cropping on width :", self.crop_w)
171 | self.crop_h = max((self.img_h - self.frame_size) // 2, 0)
172 | if self.crop_h: print("Cropping on height :", self.crop_h)
173 |
174 | self.pad_w = max((self.frame_size - self.img_w) // 2, 0)
175 | if self.pad_w: print("Padding on width :", self.pad_w)
176 | self.pad_h = max((self.frame_size - self.img_h) // 2, 0)
177 | if self.pad_h: print("Padding on height :", self.pad_h)
178 |
179 |
180 | print(f"Frame working size: {self.img_w}x{self.img_h}")
181 |
182 | self.nb_kps = 33 # Number of "viewable" keypoints
183 |
184 | if self.smoothing:
185 |
186 | self.filter_landmarks = mpu.LandmarksSmoothingFilter(
187 | frequency=self.video_fps,
188 | min_cutoff=0.05,
189 | beta=80,
190 | derivate_cutoff=1
191 | )
192 | # landmarks_aux corresponds to the 2 landmarks used to compute the ROI in next frame
193 | self.filter_landmarks_aux = mpu.LandmarksSmoothingFilter(
194 | frequency=self.video_fps,
195 | min_cutoff=0.01,
196 | beta=10,
197 | derivate_cutoff=1
198 | )
199 | self.filter_landmarks_world = mpu.LandmarksSmoothingFilter(
200 | frequency=self.video_fps,
201 | min_cutoff=0.1,
202 | beta=40,
203 | derivate_cutoff=1,
204 | disable_value_scaling=True
205 | )
206 | if self.xyz:
207 | self.filter_xyz = mpu.LowPassFilter(alpha=0.25)
208 |
209 | # Create SSD anchors
210 | self.anchors = mpu.generate_blazepose_anchors()
211 | self.nb_anchors = self.anchors.shape[0]
212 | print(f"{self.nb_anchors} anchors have been created")
213 |
214 | # Define and start pipeline
215 | self.pd_input_length = 224
216 | self.lm_input_length = 256
217 | usb_speed = self.device.getUsbSpeed()
218 | self.device.startPipeline(self.create_pipeline())
219 | print(f"Pipeline started - USB speed: {str(usb_speed).split('.')[-1]}")
220 |
221 | # Define data queues
222 | if self.input_type == "rgb":
223 | self.q_video = self.device.getOutputQueue(name="cam_out", maxSize=1, blocking=False)
224 | self.q_pre_pd_manip_cfg = self.device.getInputQueue(name="pre_pd_manip_cfg")
225 | if self.xyz:
226 | self.q_spatial_data = self.device.getOutputQueue(name="spatial_data_out", maxSize=1, blocking=False)
227 | self.q_spatial_config = self.device.getInputQueue("spatial_calc_config_in")
228 |
229 | else:
230 | self.q_pd_in = self.device.getInputQueue(name="pd_in")
231 | self.q_pd_out = self.device.getOutputQueue(name="pd_out", maxSize=4, blocking=True)
232 | self.q_lm_in = self.device.getInputQueue(name="lm_in")
233 | self.q_lm_out = self.device.getOutputQueue(name="lm_out", maxSize=4, blocking=True)
234 |
235 |
236 | self.fps = FPS()
237 |
238 | self.nb_frames = 0
239 | self.nb_pd_inferences = 0
240 | self.nb_lm_inferences = 0
241 | self.nb_lm_inferences_after_landmarks_ROI = 0
242 | self.nb_frames_no_body = 0
243 |
244 | self.glob_pd_rtrip_time = 0
245 | self.glob_lm_rtrip_time = 0
246 |
247 | self.use_previous_landmarks = False
248 |
249 | self.cfg_pre_pd = dai.ImageManipConfig()
250 | self.cfg_pre_pd.setResizeThumbnail(self.pd_input_length, self.pd_input_length)
251 |
252 | def create_pipeline(self):
253 | print("Creating pipeline...")
254 | # Start defining a pipeline
255 | pipeline = dai.Pipeline()
256 | # pipeline.setOpenVINOVersion(version = dai.OpenVINO.Version.VERSION_2021_4)
257 |
258 |
259 | if self.input_type == "rgb":
260 | # ColorCamera
261 | print("Creating Color Camera...")
262 | cam = pipeline.createColorCamera()
263 | if self.resolution[0] == 1920:
264 | cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
265 | else:
266 | cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_4_K)
267 | cam.setInterleaved(False)
268 | cam.setIspScale(self.scale_nd[0], self.scale_nd[1])
269 | cam.setFps(self.internal_fps)
270 | cam.setBoardSocket(dai.CameraBoardSocket.RGB)
271 |
272 | if self.crop:
273 | cam.setVideoSize(self.frame_size, self.frame_size)
274 | cam.setPreviewSize(self.frame_size, self.frame_size)
275 | else:
276 | cam.setVideoSize(self.img_w, self.img_h)
277 | cam.setPreviewSize(self.img_w, self.img_h)
278 |
279 | cam_out = pipeline.createXLinkOut()
280 | cam_out.setStreamName("cam_out")
281 | cam_out.input.setQueueSize(1)
282 | cam_out.input.setBlocking(False)
283 | cam.video.link(cam_out.input)
284 |
285 | # Define pose detection pre processing (resize preview to (self.pd_input_length, self.pd_input_length))
286 | print("Creating Pose Detection pre processing image manip...")
287 | pre_pd_manip = pipeline.create(dai.node.ImageManip)
288 | pre_pd_manip.setMaxOutputFrameSize(self.pd_input_length*self.pd_input_length*3)
289 | pre_pd_manip.setWaitForConfigInput(True)
290 | pre_pd_manip.inputImage.setQueueSize(1)
291 | pre_pd_manip.inputImage.setBlocking(False)
292 | cam.preview.link(pre_pd_manip.inputImage)
293 |
294 | pre_pd_manip_cfg_in = pipeline.create(dai.node.XLinkIn)
295 | pre_pd_manip_cfg_in.setStreamName("pre_pd_manip_cfg")
296 | pre_pd_manip_cfg_in.out.link(pre_pd_manip.inputConfig)
297 |
298 | if self.xyz:
299 |
300 | # For now, RGB needs fixed focus to properly align with depth.
301 | # The value used during calibration should be used here
302 | calib_data = self.device.readCalibration()
303 | calib_lens_pos = calib_data.getLensPosition(dai.CameraBoardSocket.RGB)
304 | print(f"RGB calibration lens position: {calib_lens_pos}")
305 | cam.initialControl.setManualFocus(calib_lens_pos)
306 |
307 | mono_resolution = dai.MonoCameraProperties.SensorResolution.THE_400_P
308 | left = pipeline.createMonoCamera()
309 | left.setBoardSocket(dai.CameraBoardSocket.LEFT)
310 | left.setResolution(mono_resolution)
311 | left.setFps(self.internal_fps)
312 |
313 | right = pipeline.createMonoCamera()
314 | right.setBoardSocket(dai.CameraBoardSocket.RIGHT)
315 | right.setResolution(mono_resolution)
316 | right.setFps(self.internal_fps)
317 |
318 | stereo = pipeline.createStereoDepth()
319 | stereo.setConfidenceThreshold(230)
320 | # LR-check is required for depth alignment
321 | stereo.setLeftRightCheck(True)
322 | stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
323 | stereo.setSubpixel(False) # subpixel True -> latency
324 |
325 | spatial_location_calculator = pipeline.createSpatialLocationCalculator()
326 | spatial_location_calculator.setWaitForConfigInput(True)
327 | spatial_location_calculator.inputDepth.setBlocking(False)
328 | spatial_location_calculator.inputDepth.setQueueSize(1)
329 |
330 | spatial_data_out = pipeline.createXLinkOut()
331 | spatial_data_out.setStreamName("spatial_data_out")
332 | spatial_data_out.input.setQueueSize(1)
333 | spatial_data_out.input.setBlocking(False)
334 |
335 | spatial_calc_config_in = pipeline.createXLinkIn()
336 | spatial_calc_config_in.setStreamName("spatial_calc_config_in")
337 |
338 | left.out.link(stereo.left)
339 | right.out.link(stereo.right)
340 |
341 | stereo.depth.link(spatial_location_calculator.inputDepth)
342 |
343 | spatial_location_calculator.out.link(spatial_data_out.input)
344 | spatial_calc_config_in.out.link(spatial_location_calculator.inputConfig)
345 |
346 |
347 | # Define pose detection model
348 | print("Creating Pose Detection Neural Network...")
349 | pd_nn = pipeline.createNeuralNetwork()
350 | pd_nn.setBlobPath(str(Path(self.pd_model).resolve().absolute()))
351 | # Increase threads for detection
352 | # pd_nn.setNumInferenceThreads(2)
353 | # Specify that network takes latest arriving frame in non-blocking manner
354 | # Pose detection input
355 | if self.input_type == "rgb":
356 | pre_pd_manip.out.link(pd_nn.input)
357 | else:
358 | pd_in = pipeline.createXLinkIn()
359 | pd_in.setStreamName("pd_in")
360 | pd_in.out.link(pd_nn.input)
361 |
362 | # Pose detection output
363 | pd_out = pipeline.createXLinkOut()
364 | pd_out.setStreamName("pd_out")
365 | pd_nn.out.link(pd_out.input)
366 |
367 |
368 | # Define landmark model
369 | print("Creating Landmark Neural Network...")
370 | lm_nn = pipeline.createNeuralNetwork()
371 | lm_nn.setBlobPath(str(Path(self.lm_model).resolve().absolute()))
372 | lm_nn.setNumInferenceThreads(1)
373 | # Landmark input
374 | lm_in = pipeline.createXLinkIn()
375 | lm_in.setStreamName("lm_in")
376 | lm_in.out.link(lm_nn.input)
377 | # Landmark output
378 | lm_out = pipeline.createXLinkOut()
379 | lm_out.setStreamName("lm_out")
380 | lm_nn.out.link(lm_out.input)
381 |
382 | print("Pipeline created.")
383 | return pipeline
384 |
385 | def is_present(self, body, lm_id):
386 | return body.presence[lm_id] > self.presence_threshold
387 |
388 | def is_visible(self, body, lm_id):
389 | if body.visibility[lm_id] > self.visibility_threshold and \
390 | 0 <= body.landmarks[lm_id][0] < self.img_w and \
391 | 0 <= body.landmarks[lm_id][1] < self.img_h :
392 | return True
393 | else:
394 | return False
395 |
396 | def query_body_xyz(self, body):
397 | # We want the 3d position (x,y,z) in meters of the body reference keypoint
398 | # in the camera coord system.
399 | # The reference point is either :
400 |         # - the middle of the hips if both hips are present (presence of right and left hips > threshold),
401 |         # - the middle of the shoulders in case at least one hip is not present and
402 | # both shoulders are present,
403 | # - None otherwise
404 | if self.is_visible(body, mpu.KEYPOINT_DICT['right_hip']) and self.is_visible(body, mpu.KEYPOINT_DICT['left_hip']):
405 | body.xyz_ref = "mid_hips"
406 | body.xyz_ref_coords_pixel = np.mean([
407 | body.landmarks[mpu.KEYPOINT_DICT['right_hip']][:2],
408 | body.landmarks[mpu.KEYPOINT_DICT['left_hip']][:2]],
409 | axis=0)
410 | elif self.is_visible(body, mpu.KEYPOINT_DICT['right_shoulder']) and self.is_visible(body, mpu.KEYPOINT_DICT['left_shoulder']):
411 | body.xyz_ref = "mid_shoulders"
412 | body.xyz_ref_coords_pixel = np.mean([
413 | body.landmarks[mpu.KEYPOINT_DICT['right_shoulder']][:2],
414 | body.landmarks[mpu.KEYPOINT_DICT['left_shoulder']][:2]],
415 | axis=0)
416 | else:
417 | body.xyz_ref = None
418 | return
419 | # Prepare the request to SpatialLocationCalculator
420 | # ROI : small rectangular zone around the reference keypoint
421 | half_zone_size = int(max(body.rect_w_a / 90, 4))
422 | xc = int(body.xyz_ref_coords_pixel[0] + self.crop_w)
423 | yc = int(body.xyz_ref_coords_pixel[1])
424 | roi_left = max(0, xc - half_zone_size)
425 | roi_right = min(self.img_w-1, xc + half_zone_size)
426 | roi_top = max(0, yc - half_zone_size)
427 | roi_bottom = min(self.img_h-1, yc + half_zone_size)
428 | roi_topleft = dai.Point2f(roi_left, roi_top)
429 | roi_bottomright = dai.Point2f(roi_right, roi_bottom)
430 | # Config
431 | conf_data = dai.SpatialLocationCalculatorConfigData()
432 | conf_data.depthThresholds.lowerThreshold = 100
433 | conf_data.depthThresholds.upperThreshold = 10000
434 | # conf_data.roi = dai.Rect(roi_center, roi_size)
435 | conf_data.roi = dai.Rect(roi_topleft, roi_bottomright)
436 | cfg = dai.SpatialLocationCalculatorConfig()
437 | cfg.setROIs([conf_data])
438 | # spatial_rtrip_time = now()
439 | self.q_spatial_config.send(cfg)
440 |
441 | # Receives spatial locations
442 | spatial_data = self.q_spatial_data.get().getSpatialLocations()
443 | # self.glob_spatial_rtrip_time += now() - spatial_rtrip_time
444 | # self.nb_spatial_requests += 1
445 | sd = spatial_data[0]
446 | body.xyz_zone = [
447 | int(sd.config.roi.topLeft().x) - self.crop_w,
448 | int(sd.config.roi.topLeft().y),
449 | int(sd.config.roi.bottomRight().x) - self.crop_w,
450 | int(sd.config.roi.bottomRight().y)
451 | ]
452 | body.xyz = np.array([
453 | sd.spatialCoordinates.x,
454 | sd.spatialCoordinates.y,
455 | sd.spatialCoordinates.z
456 | ])
457 | if self.smoothing:
458 | body.xyz = self.filter_xyz.apply(body.xyz)
459 |
460 | def pd_postprocess(self, inference):
461 | scores = np.array(inference.getLayerFp16("Identity_1"), dtype=np.float16) # 2254
462 | bboxes = np.array(inference.getLayerFp16("Identity"), dtype=np.float16).reshape((self.nb_anchors,12)) # 2254x12
463 | # Decode bboxes
464 | bodies = mpu.decode_bboxes(self.pd_score_thresh, scores, bboxes, self.anchors, best_only=True)
465 | if bodies:
466 | body = bodies[0]
467 | else:
468 | return None
469 | mpu.detections_to_rect(body)
470 | mpu.rect_transformation(body, self.frame_size, self.frame_size, self.rect_transf_scale)
471 | return body
472 |
473 | def lm_postprocess(self, body, inference):
474 | # The output names of the landmarks model are :
475 | # Identity_1 (1x1) : score (previously output_poseflag)
476 | # Identity_2 (1x128x128x1) (previously output_segmentation)
477 | # Identity_3 (1x64x64x39) (previously output_heatmap)
478 | # Identity_4 (1x117) world 3D landmarks (previously world_3d)
479 | # Identity (1x195) image 3D landmarks (previously ld_3d)
480 | body.lm_score = inference.getLayerFp16("Identity_1")[0]
481 | if body.lm_score > self.lm_score_thresh:
482 |
483 | lm_raw = np.array(inference.getLayerFp16("Identity")).reshape(-1,5)
484 |             # Each keypoint has 5 values:
485 | # - X,Y coordinates are local to the body of
486 | # interest and range from [0.0, 255.0].
487 | # - Z coordinate is measured in "image pixels" like
488 | # the X and Y coordinates and represents the
489 | # distance relative to the plane of the subject's
490 | # hips, which is the origin of the Z axis. Negative
491 | # values are between the hips and the camera;
492 | # positive values are behind the hips. Z coordinate
493 |             #       scale is similar to the X and Y scales, but it is
494 |             #       obtained by fitting synthetic data (GHUM model)
495 |             #       to the 2D annotations rather than from human
496 |             #       annotation.
497 | # - Visibility, after user-applied sigmoid denotes the
498 | # probability that a keypoint is located within the
499 | # frame and not occluded by another bigger body
500 | # part or another object.
501 | # - Presence, after user-applied sigmoid denotes the
502 | # probability that a keypoint is located within the
503 | # frame.
504 |
505 | # Normalize x,y,z. Scaling in z = scaling in x = 1/self.lm_input_length
506 | lm_raw[:,:3] /= self.lm_input_length
507 | # Apply sigmoid on visibility and presence (if used later)
508 | body.visibility = 1 / (1 + np.exp(-lm_raw[:,3]))
509 | body.presence = 1 / (1 + np.exp(-lm_raw[:,4]))
510 |
511 | # body.norm_landmarks contains the normalized ([0:1]) 3D coordinates of landmarks in the square rotated body bounding box
512 | body.norm_landmarks = lm_raw[:,:3]
513 |             # Now calculate body.landmarks, the landmarks in the image coordinate system (in pixels)
514 | src = np.array([(0, 0), (1, 0), (1, 1)], dtype=np.float32)
515 | dst = np.array([ (x, y) for x,y in body.rect_points[1:]], dtype=np.float32) # body.rect_points[0] is left bottom point and points going clockwise!
516 | mat = cv2.getAffineTransform(src, dst)
517 | lm_xy = np.expand_dims(body.norm_landmarks[:self.nb_kps+2,:2], axis=0)
518 | lm_xy = np.squeeze(cv2.transform(lm_xy, mat))
519 | # A segment of length 1 in the coordinates system of body bounding box takes body.rect_w_a pixels in the
520 | # original image. Then we arbitrarily divide by 4 for a more realistic appearance.
521 | lm_z = body.norm_landmarks[:self.nb_kps+2,2:3] * body.rect_w_a / 4
522 | lm_xyz = np.hstack((lm_xy, lm_z))
523 |
524 | # World landmarks are predicted in meters rather than in pixels of the image
525 | # and have origin in the middle of the hips rather than in the corner of the
526 | # pose image (cropped with given rectangle). Thus only rotation (but not scale
527 | # and translation) is applied to the landmarks to transform them back to
528 | # original coordinates.
529 | body.landmarks_world = np.array(inference.getLayerFp16("Identity_4")).reshape(-1,3)[:self.nb_kps]
530 | sin_rot = sin(body.rotation)
531 | cos_rot = cos(body.rotation)
532 | rot_m = np.array([[cos_rot, sin_rot], [-sin_rot, cos_rot]])
533 | body.landmarks_world[:,:2] = np.dot(body.landmarks_world[:,:2], rot_m)
534 |
535 | if self.smoothing:
536 | timestamp = now()
537 | object_scale = body.rect_w_a
538 | lm_xyz[:self.nb_kps] = self.filter_landmarks.apply(lm_xyz[:self.nb_kps], timestamp, object_scale)
539 | lm_xyz[self.nb_kps:] = self.filter_landmarks_aux.apply(lm_xyz[self.nb_kps:], timestamp, object_scale)
540 | body.landmarks_world = self.filter_landmarks_world.apply(body.landmarks_world, timestamp)
541 |
542 | body.landmarks = lm_xyz.astype(np.int32)
543 |
544 | # body_from_landmarks will be used to initialize the bounding rotated rectangle in the next frame
545 | # The only information we need are the 2 landmarks 33 and 34
546 | self.body_from_landmarks = mpu.Body(pd_kps=body.landmarks[self.nb_kps:self.nb_kps+2,:2]/self.frame_size)
547 |
548 | # If we added padding to make the image square, we need to remove this padding from landmark coordinates and from rect_points
549 | if self.pad_h > 0:
550 | body.landmarks[:,1] -= self.pad_h
551 | for i in range(len(body.rect_points)):
552 | body.rect_points[i][1] -= self.pad_h
553 | if self.pad_w > 0:
554 | body.landmarks[:,0] -= self.pad_w
555 | for i in range(len(body.rect_points)):
556 | body.rect_points[i][0] -= self.pad_w
557 |
558 |
559 | def next_frame(self):
560 |
561 | self.fps.update()
562 |
563 | if self.input_type == "rgb":
564 | in_video = self.q_video.get()
565 | video_frame = in_video.getCvFrame()
566 | if self.pad_h:
567 | square_frame = cv2.copyMakeBorder(video_frame, self.pad_h, self.pad_h, self.pad_w, self.pad_w, cv2.BORDER_CONSTANT)
568 | else:
569 | square_frame = video_frame
570 | # For debugging
571 | # if not self.crop:
572 | # lb = self.q_lb_out.get()
573 | # if lb:
574 | # lb = lb.getCvFrame()
575 | # cv2.imshow("letterbox", lb)
576 | else:
577 | if self.input_type == "image":
578 | frame = self.img.copy()
579 | else:
580 | ok, frame = self.cap.read()
581 | if not ok:
582 | return None, None
583 | # Cropping and/or padding of the video frame
584 | video_frame = frame[self.crop_h:self.crop_h+self.frame_size, self.crop_w:self.crop_w+self.frame_size]
585 | if self.pad_h or self.pad_w:
586 | square_frame = cv2.copyMakeBorder(video_frame, self.pad_h, self.pad_h, self.pad_w, self.pad_w, cv2.BORDER_CONSTANT)
587 | else:
588 | square_frame = video_frame
589 |
590 | if self.force_detection or not self.use_previous_landmarks:
591 | if self.input_type == "rgb":
592 | self.q_pre_pd_manip_cfg.send(self.cfg_pre_pd)
593 | else:
594 | frame_nn = dai.ImgFrame()
595 | frame_nn.setTimestamp(time.monotonic())
596 | frame_nn.setWidth(self.pd_input_length)
597 | frame_nn.setHeight(self.pd_input_length)
598 | frame_nn.setData(to_planar(square_frame, (self.pd_input_length, self.pd_input_length)))
599 | pd_rtrip_time = now()
600 | self.q_pd_in.send(frame_nn)
601 |
602 | # Get pose detection
603 | inference = self.q_pd_out.get()
604 | if self.input_type != "rgb" and (self.force_detection or not self.use_previous_landmarks):
605 | pd_rtrip_time = now() - pd_rtrip_time
606 | self.glob_pd_rtrip_time += pd_rtrip_time
607 | body = self.pd_postprocess(inference)
608 | self.nb_pd_inferences += 1
609 | else:
610 | body = self.body_from_landmarks
611 | mpu.detections_to_rect(body) # self.regions.pd_kps are initialized from landmarks on previous frame
612 | mpu.rect_transformation(body, self.frame_size, self.frame_size, self.rect_transf_scale)
613 |
614 |
615 | # Landmarks
616 | if body:
617 | frame_nn = mpu.warp_rect_img(body.rect_points, square_frame, self.lm_input_length, self.lm_input_length)
618 | frame_nn = frame_nn / 255.
619 | nn_data = dai.NNData()
620 | nn_data.setLayer("input_1", to_planar(frame_nn, (self.lm_input_length, self.lm_input_length)))
621 | lm_rtrip_time = now()
622 | self.q_lm_in.send(nn_data)
623 |
624 | # Get landmarks
625 | inference = self.q_lm_out.get()
626 | lm_rtrip_time = now() - lm_rtrip_time
627 | self.glob_lm_rtrip_time += lm_rtrip_time
628 | self.nb_lm_inferences += 1
629 | self.lm_postprocess(body, inference)
630 | if body.lm_score < self.lm_score_thresh:
631 | body = None
632 | self.use_previous_landmarks = False
633 | if self.smoothing:
634 | self.filter_landmarks.reset()
635 | self.filter_landmarks_aux.reset()
636 | self.filter_landmarks_world.reset()
637 | else:
638 | self.use_previous_landmarks = True
639 | if self.xyz:
640 | self.query_body_xyz(body)
641 |
642 | else:
643 | self.use_previous_landmarks = False
644 | if self.smoothing:
645 | self.filter_landmarks.reset()
646 | self.filter_landmarks_aux.reset()
647 | self.filter_landmarks_world.reset()
648 | if self.xyz: self.filter_xyz.reset()
649 |
650 | return video_frame, body
651 |
652 |
653 | def exit(self):
654 | self.device.close()
655 | # Print some stats
656 | if self.stats:
657 | print(f"FPS : {self.fps.get_global():.1f} f/s (# frames = {self.fps.nbf})")
658 | print(f"# pose detection inferences : {self.nb_pd_inferences}")
659 | print(f"# landmark inferences : {self.nb_lm_inferences}")
660 | if self.input_type != "rgb" and self.nb_pd_inferences != 0: print(f"Pose detection round trip : {self.glob_pd_rtrip_time/self.nb_pd_inferences*1000:.1f} ms")
661 | if self.nb_lm_inferences != 0: print(f"Landmark round trip : {self.glob_lm_rtrip_time/self.nb_lm_inferences*1000:.1f} ms")
662 |
663 |
664 |
--------------------------------------------------------------------------------
/BlazeposeDepthaiEdge.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
4 | import mediapipe_utils as mpu
5 | from pathlib import Path
6 | from FPS import FPS, now
7 | import depthai as dai
8 | import marshal
9 | import sys
10 | from string import Template
11 | from math import sin, cos
12 |
13 | SCRIPT_DIR = Path(__file__).resolve().parent
14 | POSE_DETECTION_MODEL = str(SCRIPT_DIR / "models/pose_detection_sh4.blob")
15 | LANDMARK_MODEL_FULL = str(SCRIPT_DIR / "models/pose_landmark_full_sh4.blob")
16 | LANDMARK_MODEL_HEAVY = str(SCRIPT_DIR / "models/pose_landmark_heavy_sh4.blob")
17 | LANDMARK_MODEL_LITE = str(SCRIPT_DIR / "models/pose_landmark_lite_sh4.blob")
18 | DETECTION_POSTPROCESSING_MODEL = str(SCRIPT_DIR / "custom_models/DetectionBestCandidate_sh1.blob")
19 | DIVIDE_BY_255_MODEL = str(SCRIPT_DIR / "custom_models/DivideBy255_sh1.blob")
20 | TEMPLATE_MANAGER_SCRIPT = str(SCRIPT_DIR / "template_manager_script.py")
21 |
22 |
23 | def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
24 | return cv2.resize(arr, shape).transpose(2,0,1).flatten()
25 |
26 | class BlazeposeDepthai:
27 | """
28 | Blazepose body pose detector
29 | Arguments:
30 | - input_src: frame source,
31 | - "rgb" or None: OAK* internal color camera,
32 | - "rgb_laconic": same as "rgb" but without sending the frames to the host,
33 |                     Note that, as we are in Edge mode, input sources coming from the host, like an image or a video, are not supported
34 | - pd_model: Blazepose detection model blob file (if None, takes the default value POSE_DETECTION_MODEL),
35 |     - pd_score_thresh: confidence score to determine whether a detection is reliable (a float between 0 and 1).
36 |     - pp_model: detection postprocessing model blob file (if None, takes the default value DETECTION_POSTPROCESSING_MODEL),
37 | - lm_model: Blazepose landmark model blob file
38 | - None or "full": the default blob file LANDMARK_MODEL_FULL,
39 | - "lite": the default blob file LANDMARK_MODEL_LITE,
40 |         - "heavy": the default blob file LANDMARK_MODEL_HEAVY,
41 | - a path of a blob file.
42 | - lm_score_thresh : confidence score to determine whether landmarks prediction is reliable (a float between 0 and 1).
43 | - xyz: boolean, when True get the (x, y, z) coords of the reference point (center of the hips) (if the device supports depth measures).
44 | - crop : boolean which indicates if square cropping is done or not
45 | - smoothing: boolean which indicates if smoothing filtering is applied
46 | - filter_window_size and filter_velocity_scale:
47 | The filter keeps track (on a window of specified size) of
48 | value changes over time, which as result gives velocity of how value
49 | changes over time. With higher velocity it weights new values higher.
50 | - higher filter_window_size adds to lag and to stability
51 | - lower filter_velocity_scale adds to lag and to stability
52 |
53 | - internal_fps : when using the internal color camera as input source, set its FPS to this value (calling setFps()).
54 | - internal_frame_height : when using the internal color camera, set the frame height (calling setIspScale()).
55 |                             The width is calculated according to the height and depends on the value of 'crop'
56 | - stats : boolean, when True, display some statistics when exiting.
57 | - trace: boolean, when True print some debug messages
58 | - force_detection: boolean, force person detection on every frame (never use landmarks from previous frame to determine ROI)
59 | """
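    # This Edge-mode class exposes the same API as the Host-mode BlazeposeDepthai
    # (next_frame() returns (frame, body), exit() closes the device), so the usage
    # sketch given in BlazeposeDepthai.py applies unchanged. A hedged sketch of how
    # demo.py presumably selects between the two implementations (flag name assumed):
    #
    #   if args.edge:
    #       from BlazeposeDepthaiEdge import BlazeposeDepthai
    #   else:
    #       from BlazeposeDepthai import BlazeposeDepthai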
60 | def __init__(self, input_src="rgb",
61 | pd_model=None,
62 | pd_score_thresh=0.5,
63 | pp_model=None,
64 | lm_model=None,
65 | lm_score_thresh=0.7,
66 | xyz = False,
67 | crop=False,
68 | smoothing= True,
69 | filter_window_size=5,
70 | filter_velocity_scale=10,
71 | stats=False,
72 | internal_fps=None,
73 | internal_frame_height=1080,
74 | trace=False,
75 | force_detection=False):
76 |
77 | self.pd_model = pd_model if pd_model else POSE_DETECTION_MODEL
78 |         self.pp_model = pp_model if pp_model else DETECTION_POSTPROCESSING_MODEL
79 | self.divide_by_255_model = DIVIDE_BY_255_MODEL
80 | print(f"Pose detection blob file : {self.pd_model}")
81 | self.rect_transf_scale = 1.25
82 | if lm_model is None or lm_model == "full":
83 | self.lm_model = LANDMARK_MODEL_FULL
84 | elif lm_model == "lite":
85 | self.lm_model = LANDMARK_MODEL_LITE
86 | elif lm_model == "heavy":
87 | self.lm_model = LANDMARK_MODEL_HEAVY
88 | else:
89 | self.lm_model = lm_model
90 | print(f"Landmarks using blob file : {self.lm_model}")
91 |
92 | self.pd_score_thresh = pd_score_thresh
93 | self.lm_score_thresh = lm_score_thresh
94 | self.smoothing = smoothing
95 | self.crop = crop
96 | self.internal_fps = internal_fps
97 | self.stats = stats
98 | self.presence_threshold = 0.5
99 | self.visibility_threshold = 0.5
100 |
101 | self.trace = trace
102 | self.force_detection = force_detection
103 |
104 | self.device = dai.Device()
105 | self.xyz = False
106 |
107 | if input_src == None or input_src == "rgb" or input_src == "rgb_laconic":
108 | self.input_type = "rgb" # OAK* internal color camera
109 | self.laconic = input_src == "rgb_laconic" # Camera frames are not sent to the host
110 | if xyz:
111 | # Check if the device supports stereo
112 | cameras = self.device.getConnectedCameras()
113 | if dai.CameraBoardSocket.LEFT in cameras and dai.CameraBoardSocket.RIGHT in cameras:
114 | self.xyz = True
115 | else:
116 | print("Warning: depth unavailable on this device, 'xyz' argument is ignored")
117 |
118 | if internal_fps is None:
119 | if "full" in str(self.lm_model):
120 | self.internal_fps = 18 if self.xyz else 20
121 |                 elif "heavy" in str(self.lm_model):
122 | self.internal_fps = 7 if self.xyz else 8
123 | else: # "lite"
124 | self.internal_fps = 22 if self.xyz else 26
125 | else:
126 | self.internal_fps = internal_fps
127 | print(f"Internal camera FPS set to: {self.internal_fps}")
128 |
129 | self.video_fps = self.internal_fps # Used when saving the output in a video file. Should be close to the real fps
130 |
131 | if self.crop:
132 | self.frame_size, self.scale_nd = mpu.find_isp_scale_params(internal_frame_height)
133 | self.img_h = self.img_w = self.frame_size
134 | self.pad_w = self.pad_h = 0
135 | self.crop_w = (int(round(1920 * self.scale_nd[0] / self.scale_nd[1])) - self.img_w) // 2
136 |
137 | else:
138 | width, self.scale_nd = mpu.find_isp_scale_params(internal_frame_height * 1920 / 1080, is_height=False)
139 | self.img_h = int(round(1080 * self.scale_nd[0] / self.scale_nd[1]))
140 | self.img_w = int(round(1920 * self.scale_nd[0] / self.scale_nd[1]))
141 | self.pad_h = (self.img_w - self.img_h) // 2
142 | self.pad_w = 0
143 | self.frame_size = self.img_w
144 | self.crop_w = 0
145 |
146 | print(f"Internal camera image size: {self.img_w} x {self.img_h} - pad_h: {self.pad_h}")
147 |
148 | else:
149 | print("Invalid input source:", input_src)
150 | sys.exit()
151 |
152 | self.nb_kps = 33
153 |
154 | if self.smoothing:
155 | self.filter_landmarks = mpu.LandmarksSmoothingFilter(
156 | frequency=self.video_fps,
157 | min_cutoff=0.05,
158 | beta=80,
159 | derivate_cutoff=1
160 | )
161 | # landmarks_aux corresponds to the 2 landmarks used to compute the ROI in next frame
162 | self.filter_landmarks_aux = mpu.LandmarksSmoothingFilter(
163 | frequency=self.video_fps,
164 | min_cutoff=0.01,
165 | beta=10,
166 | derivate_cutoff=1
167 | )
168 | self.filter_landmarks_world = mpu.LandmarksSmoothingFilter(
169 | frequency=self.video_fps,
170 | min_cutoff=0.1,
171 | beta=40,
172 | derivate_cutoff=1,
173 | disable_value_scaling=True
174 | )
175 | if self.xyz:
176 | self.filter_xyz = mpu.LowPassFilter(alpha=0.25)
177 |
178 | # Define and start pipeline
179 | usb_speed = self.device.getUsbSpeed()
180 | self.device.startPipeline(self.create_pipeline())
181 | print(f"Pipeline started - USB speed: {str(usb_speed).split('.')[-1]}")
182 |
183 | # Define data queues
184 | if not self.laconic:
185 | self.q_video = self.device.getOutputQueue(name="cam_out", maxSize=1, blocking=False)
186 | self.q_manager_out = self.device.getOutputQueue(name="manager_out", maxSize=1, blocking=False)
187 | # For debugging
188 | # self.q_pre_pd_manip_out = self.device.getOutputQueue(name="pre_pd_manip_out", maxSize=1, blocking=False)
189 | # self.q_pre_lm_manip_out = self.device.getOutputQueue(name="pre_lm_manip_out", maxSize=1, blocking=False)
190 |
191 | self.fps = FPS()
192 |
193 | self.nb_pd_inferences = 0
194 | self.nb_lm_inferences = 0
195 | self.nb_lm_inferences_after_landmarks_ROI = 0
196 | self.nb_frames_no_body = 0
197 |
198 | def create_pipeline(self):
199 | print("Creating pipeline...")
200 | # Start defining a pipeline
201 | pipeline = dai.Pipeline()
202 | pipeline.setOpenVINOVersion(dai.OpenVINO.Version.VERSION_2021_4)
203 | self.pd_input_length = 224
204 | self.lm_input_length = 256
205 |
206 | # ColorCamera
207 | print("Creating Color Camera...")
208 | cam = pipeline.create(dai.node.ColorCamera)
209 | cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
210 | cam.setInterleaved(False)
211 | cam.setIspScale(self.scale_nd[0], self.scale_nd[1])
212 | cam.setFps(self.internal_fps)
213 | cam.setBoardSocket(dai.CameraBoardSocket.RGB)
214 |
215 | if self.crop:
216 | cam.setVideoSize(self.frame_size, self.frame_size)
217 | cam.setPreviewSize(self.frame_size, self.frame_size)
218 | else:
219 | cam.setVideoSize(self.img_w, self.img_h)
220 | cam.setPreviewSize(self.img_w, self.img_h)
221 |
222 | if not self.laconic:
223 | cam_out = pipeline.create(dai.node.XLinkOut)
224 | cam_out.setStreamName("cam_out")
225 | cam_out.input.setQueueSize(1)
226 | cam_out.input.setBlocking(False)
227 | cam.video.link(cam_out.input)
228 |
229 |
230 | # Define manager script node
231 | manager_script = pipeline.create(dai.node.Script)
232 | manager_script.setScript(self.build_manager_script())
233 |
234 | if self.xyz:
235 | print("Creating MonoCameras, Stereo and SpatialLocationCalculator nodes...")
236 | # For now, RGB needs fixed focus to properly align with depth.
237 | # The value used during calibration should be used here
238 | calib_data = self.device.readCalibration()
239 | calib_lens_pos = calib_data.getLensPosition(dai.CameraBoardSocket.RGB)
240 | print(f"RGB calibration lens position: {calib_lens_pos}")
241 | cam.initialControl.setManualFocus(calib_lens_pos)
242 |
243 | mono_resolution = dai.MonoCameraProperties.SensorResolution.THE_400_P
244 | left = pipeline.createMonoCamera()
245 | left.setBoardSocket(dai.CameraBoardSocket.LEFT)
246 | left.setResolution(mono_resolution)
247 | left.setFps(self.internal_fps)
248 |
249 | right = pipeline.createMonoCamera()
250 | right.setBoardSocket(dai.CameraBoardSocket.RIGHT)
251 | right.setResolution(mono_resolution)
252 | right.setFps(self.internal_fps)
253 |
254 | stereo = pipeline.createStereoDepth()
255 | stereo.setConfidenceThreshold(230)
256 | # LR-check is required for depth alignment
257 | stereo.setLeftRightCheck(True)
258 | stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
259 | stereo.setSubpixel(False) # subpixel True -> latency
260 | # MEDIAN_OFF necessary in depthai 2.7.2.
261 | # Otherwise : [critical] Fatal error. Please report to developers. Log: 'StereoSipp' '533'
262 | # stereo.setMedianFilter(dai.StereoDepthProperties.MedianFilter.MEDIAN_OFF)
263 |
264 | spatial_location_calculator = pipeline.createSpatialLocationCalculator()
265 | spatial_location_calculator.setWaitForConfigInput(True)
266 | spatial_location_calculator.inputDepth.setBlocking(False)
267 | spatial_location_calculator.inputDepth.setQueueSize(1)
268 |
269 | left.out.link(stereo.left)
270 | right.out.link(stereo.right)
271 |
272 | stereo.depth.link(spatial_location_calculator.inputDepth)
273 |
274 | manager_script.outputs['spatial_location_config'].link(spatial_location_calculator.inputConfig)
275 | spatial_location_calculator.out.link(manager_script.inputs['spatial_data'])
276 |
277 | # Define pose detection pre processing (resize preview to (self.pd_input_length, self.pd_input_length))
278 | print("Creating Pose Detection pre processing image manip...")
279 | pre_pd_manip = pipeline.create(dai.node.ImageManip)
280 | pre_pd_manip.setMaxOutputFrameSize(self.pd_input_length*self.pd_input_length*3)
281 | pre_pd_manip.setWaitForConfigInput(True)
282 | pre_pd_manip.inputImage.setQueueSize(1)
283 | pre_pd_manip.inputImage.setBlocking(False)
284 | cam.preview.link(pre_pd_manip.inputImage)
285 | manager_script.outputs['pre_pd_manip_cfg'].link(pre_pd_manip.inputConfig)
286 |
287 | # For debugging
288 | # pre_pd_manip_out = pipeline.createXLinkOut()
289 | # pre_pd_manip_out.setStreamName("pre_pd_manip_out")
290 | # pre_pd_manip.out.link(pre_pd_manip_out.input)
291 |
292 | # Define pose detection model
293 | print("Creating Pose Detection Neural Network...")
294 | pd_nn = pipeline.create(dai.node.NeuralNetwork)
295 | pd_nn.setBlobPath(self.pd_model)
296 | # Increase threads for detection
297 | # pd_nn.setNumInferenceThreads(2)
298 | pre_pd_manip.out.link(pd_nn.input)
299 |
300 | # Define pose detection post processing "model"
301 | print("Creating Pose Detection post processing Neural Network...")
302 | post_pd_nn = pipeline.create(dai.node.NeuralNetwork)
303 | post_pd_nn.setBlobPath(self.pp_model)
304 | pd_nn.out.link(post_pd_nn.input)
305 | post_pd_nn.out.link(manager_script.inputs['from_post_pd_nn'])
306 |
307 | # Define link to send result to host
308 | manager_out = pipeline.create(dai.node.XLinkOut)
309 | manager_out.setStreamName("manager_out")
310 | manager_script.outputs['host'].link(manager_out.input)
311 |
312 | # Define landmark pre processing image manip
313 | print("Creating Landmark pre processing image manip...")
314 | pre_lm_manip = pipeline.create(dai.node.ImageManip)
315 | pre_lm_manip.setMaxOutputFrameSize(self.lm_input_length*self.lm_input_length*3)
316 | pre_lm_manip.setWaitForConfigInput(True)
317 | pre_lm_manip.inputImage.setQueueSize(1)
318 | pre_lm_manip.inputImage.setBlocking(False)
319 | cam.preview.link(pre_lm_manip.inputImage)
320 |
321 | # For debugging
322 | # pre_lm_manip_out = pipeline.createXLinkOut()
323 | # pre_lm_manip_out.setStreamName("pre_lm_manip_out")
324 | # pre_lm_manip.out.link(pre_lm_manip_out.input)
325 |
326 | manager_script.outputs['pre_lm_manip_cfg'].link(pre_lm_manip.inputConfig)
327 |
328 | # Define normalization model between ImageManip and landmark model
329 |         # This is a temporary step; it could be removed once setFrameType(RGBF16F16F16p) is supported by the ImageManip node
330 |         print("Creating DivideBy255 Neural Network...")
331 | divide_nn = pipeline.create(dai.node.NeuralNetwork)
332 | divide_nn.setBlobPath(self.divide_by_255_model)
333 | pre_lm_manip.out.link(divide_nn.input)
334 |
335 | # Define landmark model
336 | print("Creating Landmark Neural Network...")
337 | lm_nn = pipeline.create(dai.node.NeuralNetwork)
338 | lm_nn.setBlobPath(self.lm_model)
339 | # lm_nn.setNumInferenceThreads(1)
340 |
341 | divide_nn.out.link(lm_nn.input)
342 | lm_nn.out.link(manager_script.inputs['from_lm_nn'])
343 |
344 | print("Pipeline created.")
345 |
346 | return pipeline
347 |
348 | def build_manager_script(self):
349 | '''
350 | The code of the scripting node 'manager_script' depends on :
351 |         - the NN model (full, lite, heavy),
352 | - the score threshold,
353 | - the video frame shape
354 | So we build this code from the content of the file template_manager_script.py which is a python template
355 | '''
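        # Illustration with a hypothetical template line: a statement written as
        #   ${_TRACE}("pose detection result received")
        # is substituted into  node.warn("pose detection result received")  when trace
        # is enabled, and into a "#..." comment otherwise, which the cleanup regexes
        # below then strip out together with the ${_IF_XYZ} triple-quoted blocks.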
356 | # Read the template
357 | with open(TEMPLATE_MANAGER_SCRIPT, 'r') as file:
358 | template = Template(file.read())
359 |
360 | # Perform the substitution
361 | code = template.substitute(
362 | _TRACE = "node.warn" if self.trace else "#",
363 | _pd_score_thresh = self.pd_score_thresh,
364 | _lm_score_thresh = self.lm_score_thresh,
365 | _force_detection = self.force_detection,
366 | _pad_h = self.pad_h,
367 | _img_h = self.img_h,
368 | _img_w = self.img_w,
369 | _frame_size = self.frame_size,
370 | _crop_w = self.crop_w,
371 | _rect_transf_scale = self.rect_transf_scale,
372 | _IF_XYZ = "" if self.xyz else '"""',
373 | _buffer_size = 2910 if self.xyz else 2863,
374 | _visibility_threshold = self.visibility_threshold
375 | )
376 | # Remove comments and empty lines
377 | import re
378 | code = re.sub(r'"{3}.*?"{3}', '', code, flags=re.DOTALL)
379 | code = re.sub(r'#.*', '', code)
380 |         code = re.sub(r'\n\s*\n', '\n', code)
381 | # For debugging
382 | if self.trace:
383 | with open("tmp_code.py", "w") as file:
384 | file.write(code)
385 |
386 | return code
387 |
388 | def is_present(self, body, lm_id):
389 | return body.presence[lm_id] > self.presence_threshold
390 |
391 | def lm_postprocess(self, body, lms, lms_world):
392 |         # lms : landmarks sent by the Manager script node to the host (list of 39*5 elements)
393 | lm_raw = np.array(lms).reshape(-1,5)
394 |         # Each keypoint has 5 values:
395 | # - X,Y coordinates are local to the body of
396 | # interest and range from [0.0, 255.0].
397 | # - Z coordinate is measured in "image pixels" like
398 | # the X and Y coordinates and represents the
399 | # distance relative to the plane of the subject's
400 | # hips, which is the origin of the Z axis. Negative
401 | # values are between the hips and the camera;
402 | # positive values are behind the hips. Z coordinate
403 |         #       scale is similar to the X and Y scales, but it is
404 |         #       obtained by fitting synthetic data (GHUM model)
405 |         #       to the 2D annotations rather than from human
406 |         #       annotation.
407 | # - Visibility, after user-applied sigmoid denotes the
408 | # probability that a keypoint is located within the
409 | # frame and not occluded by another bigger body
410 | # part or another object.
411 | # - Presence, after user-applied sigmoid denotes the
412 | # probability that a keypoint is located within the
413 | # frame.
414 |
415 | # Normalize x,y,z. Scaling in z = scaling in x = 1/self.lm_input_length
416 | lm_raw[:,:3] /= self.lm_input_length
417 | # Apply sigmoid on visibility and presence (if used later)
418 | body.visibility = 1 / (1 + np.exp(-lm_raw[:,3]))
419 | body.presence = 1 / (1 + np.exp(-lm_raw[:,4]))
420 |
421 | # body.norm_landmarks contains the normalized ([0:1]) 3D coordinates of landmarks in the square rotated body bounding box
422 | body.norm_landmarks = lm_raw[:,:3]
423 |         # Now calculate body.landmarks, the landmarks in the image coordinate system (in pixels)
424 | src = np.array([(0, 0), (1, 0), (1, 1)], dtype=np.float32)
425 | dst = np.array([ (x, y) for x,y in body.rect_points[1:]], dtype=np.float32) # body.rect_points[0] is left bottom point and points going clockwise!
426 | mat = cv2.getAffineTransform(src, dst)
427 | lm_xy = np.expand_dims(body.norm_landmarks[:self.nb_kps,:2], axis=0)
428 | lm_xy = np.squeeze(cv2.transform(lm_xy, mat))
429 |
430 | # A segment of length 1 in the coordinates system of body bounding box takes body.rect_w_a pixels in the
431 | # original image. Then we arbitrarily divide by 4 for a more realistic appearance.
432 | lm_z = body.norm_landmarks[:self.nb_kps,2:3] * body.rect_w_a / 4
433 | lm_xyz = np.hstack((lm_xy, lm_z))
434 |
435 | # World landmarks are predicted in meters rather than in pixels of the image
436 | # and have origin in the middle of the hips rather than in the corner of the
437 | # pose image (cropped with given rectangle). Thus only rotation (but not scale
438 | # and translation) is applied to the landmarks to transform them back to
439 | # original coordinates.
440 | body.landmarks_world = np.array(lms_world).reshape(-1,3)
441 | sin_rot = sin(body.rotation)
442 | cos_rot = cos(body.rotation)
443 | rot_m = np.array([[cos_rot, sin_rot], [-sin_rot, cos_rot]])
444 | body.landmarks_world[:,:2] = np.dot(body.landmarks_world[:,:2], rot_m)
445 |
446 | if self.smoothing:
447 | timestamp = now()
448 | object_scale = body.rect_w_a
449 | lm_xyz[:self.nb_kps] = self.filter_landmarks.apply(lm_xyz[:self.nb_kps], timestamp, object_scale)
450 | lm_xyz[self.nb_kps:] = self.filter_landmarks_aux.apply(lm_xyz[self.nb_kps:], timestamp, object_scale)
451 | body.landmarks_world = self.filter_landmarks_world.apply(body.landmarks_world, timestamp)
452 |
453 | body.landmarks = lm_xyz.astype(np.int32)
454 | # If we added padding to make the image square, we need to remove this padding from landmark coordinates and from rect_points
455 | if self.pad_h > 0:
456 | body.landmarks[:,1] -= self.pad_h
457 | for i in range(len(body.rect_points)):
458 | body.rect_points[i][1] -= self.pad_h
459 | # if self.pad_w > 0:
460 | # body.landmarks[:,0] -= self.pad_w
461 | # for i in range(len(body.rect_points)):
462 | # body.rect_points[i][0] -= self.pad_w
463 |
464 |
465 | def next_frame(self):
466 |
467 | self.fps.update()
468 |
469 | if self.laconic:
470 | video_frame = np.zeros((self.frame_size, self.frame_size, 3), dtype=np.uint8)
471 | else:
472 | in_video = self.q_video.get()
473 | video_frame = in_video.getCvFrame()
474 |
475 | # For debugging
476 | # pre_pd_manip = self.q_pre_pd_manip_out.tryGet()
477 | # if pre_pd_manip:
478 | # pre_pd_manip = pre_pd_manip.getCvFrame()
479 | # cv2.imshow("pre_pd_manip", pre_pd_manip)
480 | # pre_lm_manip = self.q_pre_lm_manip_out.tryGet()
481 | # if pre_lm_manip:
482 | # pre_lm_manip = pre_lm_manip.getCvFrame()
483 | # cv2.imshow("pre_lm_manip", pre_lm_manip)
484 |
485 | # Get result from device
486 | res = marshal.loads(self.q_manager_out.get().getData())
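        # 'res' is the dict marshalled by the manager Script node (template_manager_script.py).
        # As consumed below, it carries: "type" (0: no body found by the pose detector,
        # 1: landmarks inferred after a pose detection, 2: landmarks inferred from the ROI
        # derived from the previous frame's landmarks), "lm_score", the rotated ROI
        # ("rect_center_x", "rect_center_y", "rect_size", "rotation"), the landmark lists
        # "lms" and "lms_world", and, when xyz is enabled, "xyz_ref", "xyz" and "xyz_zone".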
487 | if res["type"] != 0 and res["lm_score"] > self.lm_score_thresh:
488 | body = mpu.Body()
489 | body.rect_x_center_a = res["rect_center_x"] * self.frame_size
490 | body.rect_y_center_a = res["rect_center_y"] * self.frame_size
491 | body.rect_w_a = body.rect_h_a = res["rect_size"] * self.frame_size
492 | body.rotation = res["rotation"]
493 | body.rect_points = mpu.rotated_rect_to_points(body.rect_x_center_a, body.rect_y_center_a, body.rect_w_a, body.rect_h_a, body.rotation)
494 | body.lm_score = res["lm_score"]
495 | self.lm_postprocess(body, res['lms'], res['lms_world'])
496 | if self.xyz:
497 | if res['xyz_ref'] == 0:
498 | body.xyz_ref = None
499 | else:
500 | if res['xyz_ref'] == 1:
501 | body.xyz_ref = "mid_hips"
502 | else: # res['xyz_ref'] == 2:
503 | body.xyz_ref = "mid_shoulders"
504 | body.xyz = np.array(res["xyz"])
505 | if self.smoothing:
506 | body.xyz = self.filter_xyz.apply(body.xyz)
507 | body.xyz_zone = np.array(res["xyz_zone"])
508 | body.xyz_ref_coords_pixel = np.mean(body.xyz_zone.reshape((2,2)), axis=0)
509 |
510 |
511 | else:
512 | body = None
513 | if self.smoothing:
514 | self.filter_landmarks.reset()
515 | self.filter_landmarks_aux.reset()
516 | self.filter_landmarks_world.reset()
517 | if self.xyz: self.filter_xyz.reset()
518 |
519 | # Statistics
520 | if self.stats:
521 | if res["type"] == 0:
522 | self.nb_pd_inferences += 1
523 | self.nb_frames_no_body += 1
524 | else:
525 | self.nb_lm_inferences += 1
526 | if res["type"] == 1:
527 | self.nb_pd_inferences += 1
528 | else: # res["type"] == 2
529 | self.nb_lm_inferences_after_landmarks_ROI += 1
530 | if res["lm_score"] < self.lm_score_thresh: self.nb_frames_no_body += 1
531 |
532 | return video_frame, body
533 |
534 |
535 | def exit(self):
536 | self.device.close()
537 | # Print some stats
538 | if self.stats:
539 | print(f"FPS : {self.fps.get_global():.1f} f/s (# frames = {self.fps.nbf})")
540 | print(f"# frames without body : {self.nb_frames_no_body}")
541 | print(f"# pose detection inferences : {self.nb_pd_inferences}")
542 | print(f"# landmark inferences : {self.nb_lm_inferences} - # after pose detection: {self.nb_lm_inferences - self.nb_lm_inferences_after_landmarks_ROI} - # after landmarks ROI prediction: {self.nb_lm_inferences_after_landmarks_ROI}")
543 |
544 |
545 |
546 |
547 |
--------------------------------------------------------------------------------
/BlazeposeRenderer.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | from o3d_utils import Visu3D
4 | import mediapipe_utils as mpu
5 |
6 |
7 |
8 | # LINES_BODY and COLORS_BODY are used when drawing the skeleton in 3D (LINES_BODY is also used for the 2D rendering).
9 | rgb = {"right":(0,1,0), "left":(1,0,0), "middle":(1,1,0)}
10 | LINES_BODY = [[9,10],[4,6],[1,3],
11 | [12,14],[14,16],[16,20],[20,18],[18,16],
12 | [12,11],[11,23],[23,24],[24,12],
13 | [11,13],[13,15],[15,19],[19,17],[17,15],
14 | [24,26],[26,28],[32,30],
15 | [23,25],[25,27],[29,31]]
16 |
17 | COLORS_BODY = ["middle","right","left",
18 | "right","right","right","right","right",
19 | "middle","middle","middle","middle",
20 | "left","left","left","left","left",
21 | "right","right","right","left","left","left"]
22 | COLORS_BODY = [rgb[x] for x in COLORS_BODY]
23 |
24 |
25 |
26 |
27 | class BlazeposeRenderer:
28 | def __init__(self,
29 | tracker,
30 | show_3d=None,
31 | output=None):
32 | self.tracker = tracker
33 | self.show_3d = show_3d
34 |         self.frame = None
35 | self.pause = False
36 |
37 | # Rendering flags
38 | self.show_rot_rect = False
39 | self.show_landmarks = True
40 | self.show_score = False
41 | self.show_fps = True
42 |
43 | self.show_xyz_zone = self.show_xyz = self.tracker.xyz
44 |
45 | if self.show_3d == "mixed" and not self.tracker.xyz:
46 | print("'mixed' 3d visualization needs the tracker to be in 'xyz' mode !")
47 | print("3d visualization falling back to 'world' mode.")
48 | self.show_3d = 'world'
49 | if self.show_3d == "image":
50 | self.vis3d = Visu3D(zoom=0.7, segment_radius=3)
51 | z = min(tracker.img_h, tracker.img_w)/3
52 | self.vis3d.create_grid([0,tracker.img_h,-z],[tracker.img_w,tracker.img_h,-z],[tracker.img_w,tracker.img_h,z],[0,tracker.img_h,z],5,2) # Floor
53 | self.vis3d.create_grid([0,0,z],[tracker.img_w,0,z],[tracker.img_w,tracker.img_h,z],[0,tracker.img_h,z],5,2) # Wall
54 | self.vis3d.init_view()
55 | elif self.show_3d == "world":
56 | self.vis3d = Visu3D(bg_color=(0.2, 0.2, 0.2), zoom=1.1, segment_radius=0.01)
57 | self.vis3d.create_grid([-1,1,-1],[1,1,-1],[1,1,1],[-1,1,1],2,2) # Floor
58 | self.vis3d.create_grid([-1,1,1],[1,1,1],[1,-1,1],[-1,-1,1],2,2) # Wall
59 | self.vis3d.init_view()
60 | elif self.show_3d == "mixed":
61 | self.vis3d = Visu3D(bg_color=(0.4, 0.4, 0.4), zoom=0.7, segment_radius=0.01)
62 | half_length = 3
63 | grid_depth = 5
64 | self.vis3d.create_grid([-half_length,1,0],[half_length,1,0],[half_length,1,grid_depth],[-half_length,1,grid_depth],2*half_length,grid_depth) # Floor
65 | self.vis3d.create_grid([-half_length,1,grid_depth],[half_length,1,grid_depth],[half_length,-1,grid_depth],[-half_length,-1,grid_depth],2*half_length,2) # Wall
66 | self.vis3d.create_camera()
67 | self.vis3d.init_view()
68 |
69 | if output is None:
70 | self.output = None
71 | else:
72 | fourcc = cv2.VideoWriter_fourcc(*"MJPG")
73 | self.output = cv2.VideoWriter(output,fourcc,tracker.video_fps,(tracker.img_w, tracker.img_h))
74 |
75 | def is_present(self, body, lm_id):
76 | return body.presence[lm_id] > self.tracker.presence_threshold
77 |
78 | def draw_landmarks(self, body):
79 | if self.show_rot_rect:
80 | cv2.polylines(self.frame, [np.array(body.rect_points)], True, (0,255,255), 2, cv2.LINE_AA)
81 | if self.show_landmarks:
82 | list_connections = LINES_BODY
83 | lines = [np.array([body.landmarks[point,:2] for point in line]) for line in list_connections if self.is_present(body, line[0]) and self.is_present(body, line[1])]
84 | cv2.polylines(self.frame, lines, False, (255, 180, 90), 2, cv2.LINE_AA)
85 |
86 | # for i,x_y in enumerate(body.landmarks_padded[:,:2]):
87 | for i,x_y in enumerate(body.landmarks[:self.tracker.nb_kps,:2]):
88 | if self.is_present(body, i):
89 | if i > 10:
90 | color = (0,255,0) if i%2==0 else (0,0,255)
91 | elif i == 0:
92 | color = (0,255,255)
93 | elif i in [4,5,6,8,10]:
94 | color = (0,255,0)
95 | else:
96 | color = (0,0,255)
97 | cv2.circle(self.frame, (x_y[0], x_y[1]), 4, color, -11)
98 | if self.show_score:
99 | h, w = self.frame.shape[:2]
100 | cv2.putText(self.frame, f"Landmark score: {body.lm_score:.2f}",
101 | (20, h-60),
102 | cv2.FONT_HERSHEY_PLAIN, 2, (255,255,0), 2)
103 |
104 | if self.show_xyz and body.xyz_ref:
105 | x0, y0 = body.xyz_ref_coords_pixel.astype(np.int32)
106 | x0 -= 50
107 | y0 += 40
108 | cv2.rectangle(self.frame, (x0,y0), (x0+100, y0+85), (220,220,240), -1)
109 | cv2.putText(self.frame, f"X:{body.xyz[0]/10:3.0f} cm", (x0+10, y0+20), cv2.FONT_HERSHEY_PLAIN, 1, (20,180,0), 2)
110 | cv2.putText(self.frame, f"Y:{body.xyz[1]/10:3.0f} cm", (x0+10, y0+45), cv2.FONT_HERSHEY_PLAIN, 1, (255,0,0), 2)
111 | cv2.putText(self.frame, f"Z:{body.xyz[2]/10:3.0f} cm", (x0+10, y0+70), cv2.FONT_HERSHEY_PLAIN, 1, (0,0,255), 2)
112 | if self.show_xyz_zone and body.xyz_ref:
113 | # Show zone on which the spatial data were calculated
114 | cv2.rectangle(self.frame, tuple(body.xyz_zone[0:2]), tuple(body.xyz_zone[2:4]), (180,0,180), 2)
115 |
116 | def draw_3d(self, body):
117 | self.vis3d.clear()
118 | self.vis3d.try_move()
119 | self.vis3d.add_geometries()
120 | if body is not None:
121 | points = body.landmarks if self.show_3d == "image" else body.landmarks_world
122 | draw_skeleton = True
123 | if self.show_3d == "mixed":
124 | if body.xyz_ref:
125 | """
126 | Beware, the y value of landmarks_world coordinates is negative for landmarks
127 |                     above the mid hips (like shoulders) and positive for landmarks below (like feet).
128 | The y value of (x,y,z) coordinates given by depth sensor is negative in the lower part
129 | of the image and positive in the upper part.
130 | """
131 | translation = body.xyz / 1000
132 | translation[1] = -translation[1]
133 | if body.xyz_ref == "mid_hips":
134 | points = points + translation
135 | elif body.xyz_ref == "mid_shoulders":
136 | mid_hips_to_mid_shoulders = np.mean([
137 | points[mpu.KEYPOINT_DICT['right_shoulder']],
138 | points[mpu.KEYPOINT_DICT['left_shoulder']]],
139 | axis=0)
140 | points = points + translation - mid_hips_to_mid_shoulders
141 | else:
142 | draw_skeleton = False
143 | if draw_skeleton:
144 | lines = LINES_BODY
145 | colors = COLORS_BODY
146 | for i,a_b in enumerate(lines):
147 | a, b = a_b
148 | if self.is_present(body, a) and self.is_present(body, b):
149 | self.vis3d.add_segment(points[a], points[b], color=colors[i])
150 | self.vis3d.render()
151 |
152 |
153 | def draw(self, frame, body):
154 | if not self.pause:
155 | self.frame = frame
156 | if body:
157 | self.draw_landmarks(body)
158 | self.body = body
159 | elif self.frame is None:
160 | self.frame = frame
161 | self.body = None
162 | # else: self.frame points to previous frame
163 | if self.show_3d:
164 | self.draw_3d(self.body)
165 | return self.frame
166 |
167 | def exit(self):
168 | if self.output:
169 | self.output.release()
170 |
171 | def waitKey(self, delay=1):
172 | if self.show_fps:
173 | self.tracker.fps.draw(self.frame, orig=(50,50), size=1, color=(240,180,100))
174 | cv2.imshow("Blazepose", self.frame)
175 | if self.output:
176 | self.output.write(self.frame)
177 | key = cv2.waitKey(delay)
178 | if key == 32:
179 | # Pause on space bar
180 | self.pause = not self.pause
181 | elif key == ord('r'):
182 | self.show_rot_rect = not self.show_rot_rect
183 | elif key == ord('l'):
184 | self.show_landmarks = not self.show_landmarks
185 | elif key == ord('s'):
186 | self.show_score = not self.show_score
187 | elif key == ord('f'):
188 | self.show_fps = not self.show_fps
189 | elif key == ord('x'):
190 | if self.tracker.xyz:
191 | self.show_xyz = not self.show_xyz
192 | elif key == ord('z'):
193 | if self.tracker.xyz:
194 | self.show_xyz_zone = not self.show_xyz_zone
195 | return key
196 |
197 |
--------------------------------------------------------------------------------
/FPS.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Sep 22 15:29:32 2017
4 |
5 | @author: geaxx
6 | """
7 | import time
8 | import cv2
9 |
10 | def now():
11 | return time.perf_counter()
12 |
13 | class FPS: # To measure the number of frames per second
14 | def __init__(self, mean_nb_frames=10):
15 | self.nbf = -1
16 | self.fps = 0
17 | self.start = 0
18 | self.stop = 0
19 | self.local_start = 0
20 | self.mean_nb_frames = mean_nb_frames
21 |
22 | def update(self):
23 | if self.nbf%self.mean_nb_frames == 0:
24 | if self.start != 0:
25 | self.stop = now()
26 | self.fps = self.mean_nb_frames/(self.stop-self.local_start)
27 | self.local_start = self.stop
28 | else :
29 | self.start = self.local_start = now()
30 | self.nbf+=1
31 |
32 | def get(self):
33 | return self.fps
34 |
35 | def get_global(self):
36 | if self.stop == 0: self.stop = now()
37 | return self.nbf/(self.stop-self.start)
38 |
39 | def draw(self, win, orig=(10,30), font=cv2.FONT_HERSHEY_SIMPLEX, size=2, color=(0,255,0), thickness=2):
40 | cv2.putText(win,f"FPS={self.get():.2f}",orig,font,size,color,thickness)
41 |
42 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) [2021] [geaxgx]
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Blazepose tracking with DepthAI
2 |
3 | Running Google Mediapipe single body pose tracking models on [DepthAI](https://docs.luxonis.com/en/gen2/) hardware (OAK-1, OAK-D, ...).
4 |
5 | The Blazepose landmark models available in this repository are the "full", "lite" and "heavy" versions of mediapipe 0.8.6 (2021/07).
6 |
7 | The pose detection model comes from mediapipe 0.8.4 and is compatible with the 3 landmark models (the 0.8.6 version currently cannot be converted into a Myriad blob).
8 |
9 | 
10 |
11 | For the challenger Movenet on DepthAI, please visit : [depthai_movenet](https://github.com/geaxgx/depthai_movenet)
12 |
13 | For an OpenVINO version of Blazepose, please visit : [openvino_blazepose](https://github.com/geaxgx/openvino_blazepose)
14 |
15 |
16 | - [Blazepose tracking with DepthAI](#blazepose-tracking-with-depthai)
17 | - [Architecture: Host mode vs Edge mode](#architecture-host-mode-vs-edge-mode)
18 | - [Inferred 3D vs Measured 3D](#inferred-3d-vs-measured-3d)
19 | - [Install](#install)
20 | - [Run](#run)
21 | - [Mediapipe models](#mediapipe-models)
22 | - [Custom models](#custom-models)
23 | - [Landmarks](#landmarks)
24 | - [Code](#code)
25 | - [Examples](#examples)
26 | - [Credits](#credits)
27 |
28 | ## Architecture: Host mode vs Edge mode
29 | Two modes are available:
30 | - **Host mode :** aside from the neural networks that run on the device, almost all the processing is run on the host (the only processing done on the device is the letterboxing operation before the pose detection network when using the device camera as video source). **Use this mode when you want to infer on an external input source (videos, images).**
31 | - **Edge mode :** most of the processing (neural networks, post-processing, image manipulations) is run on the device thanks to the depthai scripting node feature. It works only with the device camera but is **definitely the best option when working with the internal camera** (much faster than in Host mode). The data exchanged between the host and the device is minimal: the landmarks of the detected body (~3kB/frame). In Edge mode, you can choose not to send the camera video frames to the host (by specifying 'rgb_laconic' as input).
32 |
33 | |Landmark model (Edge mode)| FPS (FPS with 'xyz' option)|
34 | |-|-|
35 | |Full|20 (18)|
36 | |Lite|26 (22)|
37 | |Heavy|8 (7)|
38 |
39 |
40 | 
41 | 
42 |
43 |
44 | For depth-capable devices, when measuring the 3D location of a reference point, additional nodes are used that are not represented here (2 mono cameras, stereo node, spatial location calculator).
45 |
46 | *Note : the Edge mode schema is missing a custom NeuralNetwork node between the ImageManip node on the right and the landmark NeuralNetwork. This custom NeuralNetwork runs a very simple model that normalizes (divides by 255) the output image of the ImageManip node. This is a temporary fix that should be removed when the depthai ImageManip node supports setFrameType(RGBF16F16F16p).*
47 |
48 | ## Inferred 3D vs Measured 3D
49 |
50 | * **Inferred 3D** : the Landmark model is able to infer 3D (x,y,z) landmarks. Actually, the 0.8.6 model yields 2 outputs with 3D landmarks : Identity (accessed via *body.landmarks* with the API) and Identity_4 (accessed via *body.world_landmarks*). It may sound like redundant information, but there is a difference: *world landmarks* are real-world 3D coordinates in meters with the origin at the center between the hips. *world landmarks* share the same landmark topology as *landmarks*. However, *landmarks* provide coordinates (in pixels) of a 3D object projected onto the 2D image surface, while *world landmarks* provide coordinates (in meters) of the 3D object itself.
51 | * **Measured 3D** : for devices able to measure depth (like OAK-D), we can determine the real 3D location of a point of the image in the camera coordinate system. So one idea would be to measure the 3D location of each inferred 2D body landmark. It turns out this is not a good idea in practice, for at least 2 reasons. First, an inferred keypoint may fall "outside" of its counterpart in the image and therefore in the aligned depth frame. This probably happens more frequently with extremities, and can be explained by the inaccuracy of the model, which is never 100% perfect. Secondly, we can't get depth for hidden keypoints. An alternative solution, implemented here, is to combine the inferred 3D *world landmarks* with the measured 3D location of one reference point, the center between the hips (see the sketch below). Compared to extremities, this reference point can be measured more robustly.
52 |
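To make the combination concrete, here is a minimal sketch of the idea (the actual implementation is in `BlazeposeRenderer.draw_3d`; the helper name `mixed_3d_points` is only illustrative):

```
import numpy as np

def mixed_3d_points(world_landmarks, xyz_mm):
    # world_landmarks: array from body.landmarks_world (one row per keypoint), in meters, origin at mid hips
    # xyz_mm: measured 3D location of the reference point (body.xyz), in millimeters
    translation = np.asarray(xyz_mm, dtype=float) / 1000  # mm -> m
    translation[1] = -translation[1]  # the depth sensor y-axis is inverted w.r.t. landmarks_world
    return world_landmarks + translation  # world landmarks expressed in the camera coordinate system
```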
53 | The image below demonstrates the 3 modes of 3D visualization:
54 | 1) **Image mode** (top-right), based on *body.landmarks*. Note that the size of the drawn skeleton depends on the camera-body distance, and that the mid hips reference point is restricted: it can only move inside a plane parallel to the wall grid;
55 | 2) **World mode** (bottom-left), based on *body.world_landmarks*. Note the mid hips reference point is fixed and the size of the skeleton does not change;
56 | 3) **Mixed mode** (bottom-right), mixing *body.world_landmarks* with the measured 3D location of the reference point. Like in World mode, the size of the skeleton does not change, but the mid hips reference point is no longer restricted.
57 |
58 | 
59 |
60 |
61 | ## Install
62 |
63 |
64 | Install the python packages (depthai, opencv, open3d) with the following command:
65 |
66 | ```
67 | python3 -m pip install -r requirements.txt
68 | ```
69 |
70 | ## Run
71 |
72 | **Usage:**
73 |
74 | ```
75 | -> python3 demo.py -h
76 | usage: demo.py [-h] [-e] [-i INPUT] [--pd_m PD_M] [--lm_m LM_M] [-xyz] [-c]
77 | [--no_smoothing] [-f INTERNAL_FPS]
78 | [--internal_frame_height INTERNAL_FRAME_HEIGHT] [-s] [-t]
79 | [--force_detection] [-3 {None,image,mixed,world}]
80 | [-o OUTPUT]
81 |
82 | optional arguments:
83 | -h, --help show this help message and exit
84 | -e, --edge Use Edge mode (postprocessing runs on the device)
85 |
86 | Tracker arguments:
87 | -i INPUT, --input INPUT
88 | 'rgb' or 'rgb_laconic' or path to video/image file to
89 | use as input (default=rgb)
90 | --pd_m PD_M Path to an .blob file for pose detection model
91 | --lm_m LM_M Landmark model ('full' or 'lite' or 'heavy') or path
92 | to an .blob file
93 | -xyz, --xyz Get (x,y,z) coords of reference body keypoint in
94 | camera coord system (only for compatible devices)
95 | -c, --crop Center crop frames to a square shape before feeding
96 | pose detection model
97 | --no_smoothing Disable smoothing filter
98 | -f INTERNAL_FPS, --internal_fps INTERNAL_FPS
99 | Fps of internal color camera. Too high value lower NN
100 | fps (default= depends on the model)
101 | --internal_frame_height INTERNAL_FRAME_HEIGHT
102 | Internal color camera frame height in pixels
103 | (default=640)
104 | -s, --stats Print some statistics at exit
105 | -t, --trace Print some debug messages
106 | --force_detection Force person detection on every frame (never use
107 | landmarks from previous frame to determine ROI)
108 |
109 | Renderer arguments:
110 | -3 {None,image,mixed,world}, --show_3d {None,image,mixed,world}
111 | Display skeleton in 3d in a separate window. See
112 | README for description.
113 | -o OUTPUT, --output OUTPUT
114 | Path to output video file
115 | ```
116 | **Examples :**
117 |
118 | - To use default internal color camera as input with the model "full" in Host mode:
119 |
120 | ```python3 demo.py```
121 |
122 | - To use default internal color camera as input with the model "full" in Edge mode [**preferred**]:
123 |
124 | ```python3 demo.py -e```
125 |
126 | - To use a file (video or image) as input :
127 |
128 | ```python3 demo.py -i filename```
129 |
130 | - To use the model "lite" :
131 |
132 | ```python3 demo.py --lm_m lite```
133 |
134 | - To measure body spatial location in camera coordinate system (only for depth-capable device like OAK-D):
135 | ```python3 demo.py -e -xyz```
136 |
137 | The measure is made only on one reference point:
138 | - the middle of the hips if both hips are visible;
139 | - the middle of the shoulders if hips are not visible and both shoulders are visible.
140 |
141 | - To show the skeleton in 3D 'world' mode (-xyz flag needed):
142 |
143 | ```python3 demo.py -e -xyz -3 world```
144 |
145 | 
146 |
147 | Note that the floor and wall grids do not correspond to a real floor and wall. Each grid square is 1m x 1m.
148 |
149 | - When using the internal camera, to change its FPS to 15 :
150 |
151 | ```python3 demo.py --internal_fps 15```
152 |
153 | Note: the default internal camera FPS depends on the model, the mode (Edge vs Host) and the use of depth ("-xyz"). These default values are based on my own observations. **Please, don't hesitate to play with this parameter to find the optimal value.** If you observe that your FPS is well below the default value, you should lower the FPS with this option until the set FPS is just above the observed FPS.
154 |
155 | - When using the internal camera, you probably don't need to work with the full resolution. You can set a lower resolution (and gain a bit of FPS) by using this option:
156 |
157 | ```python3 demo.py --internal_frame_height 450```
158 |
159 | Note: currently, depthai supports only a limited set of values for this argument. The value you specify will be replaced by the closest supported value (here 432 instead of 450); see the short illustration after this list.
160 |
161 | - By default, temporal filters smooth the landmark positions. Use *--no_smoothing* to disable the filter.
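Regarding the `--internal_frame_height` note above, you can check beforehand which size will actually be used: according to its docstring, `find_isp_scale_params` in `mediapipe_utils.py` returns the closest valid size and the corresponding `setIspScale()` parameters. A small illustration, assumed to be run from the repository root:

```
import mediapipe_utils as mpu

# Requesting a frame height of 450 pixels: the closest size supported by depthai
# is returned (432 here), together with the (numerator, denominator) for setIspScale().
size, scale_params = mpu.find_isp_scale_params(450, is_height=True)
print(size, scale_params)
```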
162 |
163 | |Keypress in OpenCV window|Function|
164 | |-|-|
165 | |*Esc*|Exit|
166 | |*space*|Pause|
167 | |r|Show/hide the bounding rotated rectangle around the body|
168 | |l|Show/hide landmarks|
169 | |s|Show/hide landmark score|
170 | |f|Show/hide FPS|
171 | |x|Show/hide (x,y,z) coordinates (only on depth-capable devices and if using "-xyz" flag)|
172 | |z|Show/hide the square zone used to measure depth (only on depth-capable devices and if using "-xyz" flag)|
173 |
174 | If using a 3D visualization mode ("-3" or "--show_3d"):
175 | |Keypress in Open3d window|Function|
176 | |-|-|
177 | |o|Oscillating (rotating back and forth) of the view|
178 | |r|Continuous rotating of the view|
179 | |s|Stop oscillating or rotating|
180 | |*Up*|Increasing rotating or oscillating speed|
181 | |*Down*|Decreasing rotating or oscillating speed|
182 | |*Right* or *Left*|Change the point of view to a predefined position|
183 | |*Mouse*|Freely change the point of view|
184 |
185 | ## Mediapipe models
186 | You can directly find the model files (.blob) under the 'models' directory. Below I describe how to get the files in case you need to regenerate the models.
187 |
188 | 1) Clone this github repository in a local directory (DEST_DIR)
189 | 2) In DEST_DIR/models directory, download the tflite models from [this archive](https://drive.google.com/file/d/1bEL4zmh2PEFsRfmFOofP0rbGNo-O1p5_/view?usp=sharing). The archive contains:
190 | * Pose detection model from Mediapipe 0.8.4,
191 | * Full, Lite and Heavy pose landmark models from Mediapipe 0.8.6.
192 |
193 | *Note: the Pose detection model from Mediapipe 0.8.6 can't currently be converted (more info [here](https://github.com/PINTO0309/tflite2tensorflow/issues/11)).*
194 |
195 | 3) Install the amazing [PINTO's tflite2tensorflow tool](https://github.com/PINTO0309/tflite2tensorflow). Use the docker installation, which includes many packages including a recent version of Openvino.
196 | 4) From DEST_DIR, run the tflite2tensorflow container: ```./docker_tflite2tensorflow.sh```
197 | 5) From the running container:
198 | ```
199 | cd workdir/models
200 | ./convert_models.sh
201 | ```
202 | The *convert_models.sh* script converts the tflite models to TensorFlow (.pb), then converts the pb files into Openvino IR format (.xml and .bin), and finally compiles the IR files into MyriadX format (.blob).
203 |
204 | 6) By default, the number of SHAVES associated with the blob files is 4. In case you want to generate new blobs with a different number of shaves, you can use the script *gen_blob_shave.sh*:
205 | ```
206 | # Example: to generate blobs for 6 shaves
207 | ./gen_blob_shave.sh -m pd -n 6 # will generate pose_detection_sh6.blob
208 | ./gen_blob_shave.sh -m full -n 6 # will generate pose_landmark_full_sh6.blob
209 | ```
210 |
211 | **Explanation about the Model Optimizer params :**
212 | - The preview of the OAK-* color camera outputs BGR [0, 255] frames. The original tflite pose detection model expects RGB [-1, 1] frames. ```--reverse_input_channels``` converts BGR to RGB. ```--mean_values [127.5,127.5,127.5] --scale_values [127.5,127.5,127.5]``` normalizes the frames to [-1, 1].
213 | - The original landmark model expects RGB [0, 1] frames. Therefore ```--reverse_input_channels``` is used here too but, unlike for the detection model, we choose to do the normalization in the python code and not in the model (via ```--scale_values```). Indeed, we have observed a better accuracy with FP16 models when the normalization of the inputs is done outside of the models ([a possible explanation](https://github.com/PINTO0309/tflite2tensorflow/issues/9#issuecomment-842460014)); see the sketch below.
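For illustration, a minimal sketch (not the repository's exact code; the helper name `prepare_landmark_input` is only illustrative) of what this host-side scaling can look like before feeding the landmark network in Host mode:

```
import cv2
import numpy as np

def prepare_landmark_input(frame_bgr, size=256):
    # The converted blob already handles the BGR -> RGB reversal (--reverse_input_channels),
    # so on the host we only resize, lay the image out as planar (CHW) and scale to [0, 1].
    img = cv2.resize(frame_bgr, (size, size)).transpose(2, 0, 1)
    return img.astype(np.float32) / 255.0
```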
214 |
215 | ## Custom models
216 |
217 | The `custom_models` directory contains the code to build the following custom models:
218 | - DetectionBestCandidate: this model processes the outputs of the pose detection network (a 1x2254x1 tensor for the scores and a 1x2254x12 for the regressors) and yields the regressor with the highest score.
219 | - DivideBy255: this model transforms a 256x256 RGB888p ([0, 255]) image into a 256x256 RGBF16F16F16p image ([0., 1.]).
220 |
221 | The method used to build these models is well explained on [rahulrav's blog](https://rahulrav.com/blog/depthai_camera.html).
222 |
223 |
224 | ## Landmarks
225 |
226 | 
227 |
228 |
229 |
230 | Source: Google Mediapipe
231 |
232 |
233 |
234 | ## Code
235 |
236 | To facilitate reusability, the code is split into 2 classes:
237 | - **BlazeposeDepthai**, which is responsible for computing the body landmarks. The importation of this class depends on the mode:
238 | ```
239 | # For Host mode:
240 | from BlazeposeDepthai import BlazeposeDepthai
241 | ```
242 | ```
243 | # For Edge mode:
244 | from BlazeposeDepthaiEdge import BlazeposeDepthai
245 | ```
246 | - **BlazeposeRenderer**, which is responsible for rendering the landmarks and the skeleton on the video frame.
247 |
248 | This way, you can replace the renderer from this repository and write your own personalized renderer (for some projects, you may not even need a renderer).
249 |
250 | The file ```demo.py``` is a representative example of how to use these classes:
251 | ```
252 | from BlazeposeDepthaiEdge import BlazeposeDepthai
253 | from BlazeposeRenderer import BlazeposeRenderer
254 |
255 | # The argparse stuff has been removed to keep only the important code
256 |
257 | tracker = BlazeposeDepthai(input_src=args.input,
258 | pd_model=args.pd_m,
259 | lm_model=args.lm_m,
260 | smoothing=not args.no_smoothing,
261 | xyz=args.xyz,
262 | crop=args.crop,
263 | internal_fps=args.internal_fps,
264 | internal_frame_height=args.internal_frame_height,
265 | force_detection=args.force_detection,
266 | stats=args.stats,
267 | trace=args.trace)
268 |
269 | renderer = BlazeposeRenderer(
270 |                 tracker,
271 | show_3d=args.show_3d,
272 | output=args.output)
273 |
274 | while True:
275 | # Run blazepose on next frame
276 | frame, body = tracker.next_frame()
277 | if frame is None: break
278 | # Draw 2d skeleton
279 | frame = renderer.draw(frame, body)
280 | key = renderer.waitKey(delay=1)
281 | if key == 27 or key == ord('q'):
282 | break
283 | renderer.exit()
284 | tracker.exit()
285 | ```
286 |
287 | For more information on:
288 | - the arguments of the tracker, please refer to the docstring of class `BlazeposeDepthai` or `BlazeposeDepthaiEdge` in `BlazeposeDepthai.py` or `BlazeposeDepthaiEdge.py`;
289 | - the attributes of the 'body' element you can exploit in your program, please refer to the docstring of class `Body` in `mediapipe_utils.py` (see also the short example below).
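For instance, a short hypothetical snippet (inside the main loop, after `tracker.next_frame()`) accessing a few of these attributes:

```
from mediapipe_utils import KEYPOINT_DICT

if body:
    # 2D pixel coordinates of the nose in the original image
    nose_x, nose_y = body.landmarks[KEYPOINT_DICT["nose"], :2]
    print(f"Landmark score: {body.lm_score:.2f} - nose at ({nose_x}, {nose_y})")
```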
290 |
291 | ## Examples
292 |
293 | |||
294 | |-|-|
295 | |[Semaphore alphabet](examples/semaphore_alphabet) | ![Semaphore alphabet](examples/semaphore_alphabet/medias/semaphore.gif) |
296 |
297 | ## Credits
298 | * [Google Mediapipe](https://github.com/google/mediapipe)
299 | * Katsuya Hyodo a.k.a [Pinto](https://github.com/PINTO0309), the Wizard of Model Conversion !
300 | * [Tai Chi Step by Step For Beginners Training Session 4](https://www.youtube.com/watch?v=oawZ_7wNWrU&ab_channel=MasterSongKungFu)
301 | * [Semaphore with The RCR Museum](https://www.youtube.com/watch?v=DezaTjQYPh0&ab_channel=TheRoyalCanadianRegimentMuseum)
302 |
--------------------------------------------------------------------------------
/custom_models/DetectionBestCandidate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import sys, os
4 | sys.path.insert(1, os.path.realpath(os.path.pardir))
5 | from mediapipe_utils import generate_blazepose_anchors
6 | import numpy as np
7 | from math import pi
8 |
9 | anchors = generate_blazepose_anchors()
10 | print(f"Nb anchors: {len(anchors)}")
11 |
12 | detection_input_length = 224
13 |
14 | class DetectionBestCandidate(nn.Module):
15 | def __init__(self):
16 | super(DetectionBestCandidate, self).__init__()
17 | # anchors shape is nb_anchorsx4 [x_center, y_center, width, height]
18 | # Here: width and height is always 1, so we keep just [x_center, y_center]
19 | self.anchors = torch.from_numpy(anchors[:,:2])
20 |         self.plus_anchor_center = np.array([[1,0,0,0,1,0,1,0,1,0,1,0], [0,1,0,0,0,1,0,1,0,1,0,1]], dtype=np.float64)  # np.float64: the np.float alias is removed in recent numpy versions
21 | self.plus_anchor_center = torch.from_numpy(self.plus_anchor_center)
22 |
23 | def forward(self, x, y):
24 | # x.shape: 1xnb_anchorsx1
25 | # y.shape: 1xnb_anchorsx12
26 |
27 | # decode_bboxes
28 | best_id = torch.argmax(x)
29 | score = x[0][best_id]
30 | score = torch.sigmoid(score)
31 | bbox = y[0,best_id]
32 | bbox = bbox/detection_input_length + torch.mm(self.anchors[best_id].unsqueeze(0), self.plus_anchor_center)[0]
33 |
34 | # bbox : the 4 first elements describe the square bounding box around the head (not used for determining rotated rectangle around the body)
35 | # The 8 following corresponds to 4 (x,y) normalized keypoints coordinates (useful for determining rotated rectangle)
36 | # For full body, keypoints 1 and 2 are used.
37 | # For upper body, keypoints 3 and 4 are used.
38 | # Here we are only interested in full body
39 |
40 | # detections_to_rect (1st part)
41 | sqn_rr_center_xy = bbox[4:6]
42 | sqn_scale_xy = bbox[6:8]
43 |
44 | return torch.cat((score, sqn_rr_center_xy, sqn_scale_xy))
45 |
46 | def test():
47 |
48 | model = DetectionBestCandidate()
49 | X = torch.randn(1, len(anchors), 1, dtype=torch.float)
50 | Y = torch.randn(1, len(anchors), 12, dtype=torch.float)
51 | result = model(X, Y)
52 | print(result)
53 |
54 | def export_onnx():
55 | """
56 | Exports the model to an ONNX file.
57 | """
58 | model = DetectionBestCandidate()
59 | X = torch.randn(1, len(anchors), 1, dtype=torch.float)
60 | Y = torch.randn(1, len(anchors), 12, dtype=torch.float)
61 | onnx_name = "DetectionBestCandidate.onnx"
62 |
63 | print(f"Generating {onnx_name}")
64 | torch.onnx.export(
65 | model,
66 | (X, Y),
67 | onnx_name,
68 | opset_version=10,
69 | do_constant_folding=True,
70 | # verbose=True,
71 | input_names=['Identity_1', 'Identity'],
72 | output_names=['result']
73 | )
74 |
75 | if __name__ == "__main__":
76 |
77 | test()
78 | export_onnx()
--------------------------------------------------------------------------------
/custom_models/DetectionBestCandidate_sh1.blob:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/custom_models/DetectionBestCandidate_sh1.blob
--------------------------------------------------------------------------------
/custom_models/DivideBy255.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import sys, os
4 | sys.path.insert(1, os.path.realpath(os.path.pardir))
5 | import numpy as np
6 | from math import pi
7 |
8 |
9 |
10 | detection_input_length = 224
11 |
12 | class DivideBy255(nn.Module):
13 | def __init__(self):
14 | super(DivideBy255, self).__init__()
15 |
16 | def forward(self, x):
17 | return x / 255.0
18 |
19 | def test():
20 |
21 | model = DivideBy255()
22 |
23 | X = torch.ones(1, 3, 256, 256, dtype=torch.float)
24 | result = model(X)
25 | print(result)
26 |
27 | def export_onnx():
28 | """
29 | Exports the model to an ONNX file.
30 | """
31 | model = DivideBy255()
32 | X = torch.randn(1, 3, 256, 256, dtype=torch.float)
33 | onnx_name = "DivideBy255.onnx"
34 |
35 | print(f"Generating {onnx_name}")
36 | torch.onnx.export(
37 | model,
38 | (X),
39 | onnx_name,
40 | opset_version=10,
41 | do_constant_folding=True,
42 | # verbose=True,
43 | # input_names=['Identity_1', 'Identity'],
44 | output_names=['input_1']
45 | )
46 |
47 | if __name__ == "__main__":
48 |
49 | test()
50 | export_onnx()
--------------------------------------------------------------------------------
/custom_models/DivideBy255_sh1.blob:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/custom_models/DivideBy255_sh1.blob
--------------------------------------------------------------------------------
/custom_models/README.md:
--------------------------------------------------------------------------------
1 | ## Generation of DetectionBestCandidate.blob
2 | 1) pytorch is needed
3 | 2) Generate the ONNX model *DetectionBestCandidate.onnx* :
4 | ```
5 | python DetectionBestCandidate.py
6 | ```
7 |
8 | 3) Start the tflite2tensorflow docker container:
9 | ```
10 | ../docker_tflite2tensorflow.sh
11 | ```
12 | 4) From the container shell:
13 | ```
14 | cd workdir
15 | ./convert_model.sh -m DetectionBestCandidate # Use `-n #` if you want to change the number of shaves
16 | ```
17 |
18 | ## Generation of DivideBy255.blob
19 | Same as above with 'DivideBy255' instead of 'DetectionBestCandidate'.
20 |
21 |
--------------------------------------------------------------------------------
/custom_models/convert_model.sh:
--------------------------------------------------------------------------------
1 | # This script has to be run from the docker container started by ./docker_tflite2tensorflow.sh
2 |
3 | usage ()
4 | {
5 | echo "Generate a blob from an ONNX model with a specified number of shaves and cmx (nb cmx = nb shaves)"
6 | echo
7 | echo "Usage: ${0} [-m model] [-n nb_shaves]"
8 | echo
9 | echo "model: DetectionBestCandidate or DivideBy255 or other"
10 | echo "nb_shaves must be between 1 and 13 (default=1)"
11 | }
12 |
13 | while getopts ":hm:n:" opt; do
14 | case ${opt} in
15 | h )
16 | usage
17 | exit 0
18 | ;;
19 | m )
20 | model=$OPTARG
21 | ;;
22 | n )
23 | nb_shaves=$OPTARG
24 | ;;
25 | : )
26 | echo "Error: -$OPTARG requires an argument."
27 | usage
28 | exit 1
29 | ;;
30 | \? )
31 | echo "Invalid option: -$OPTARG"
32 | usage
33 | exit 1
34 | ;;
35 | esac
36 | done
37 |
38 | if [ -z "$model" ]
39 | then
40 | usage
41 | exit 1
42 | fi
43 | if [ ! -f "${model}.onnx" ]   # note: the model_onnx variable is only assigned later in this script
44 | then
45 |     echo "The model ${model}.onnx does not exist"
46 |     echo "Have you run 'python ${model}.py' ?"
47 | exit 1
48 | fi
49 |
50 | if [ -z "$nb_shaves" ]
51 | then
52 | nb_shaves=1
53 | fi
54 | if [ $nb_shaves -lt 1 -o $nb_shaves -gt 13 ]
55 | then
56 | echo "Invalid number of shaves !"
57 | usage
58 | exit 1
59 | fi
60 |
61 | if [ $model = DivideBy255 ]
62 | then
63 | precision=U8
64 | else
65 | precision=FP16
66 | fi
67 |
68 |
69 | model_onnx="${model}.onnx"
70 | model_xml="${model}.xml"
71 | model_blob="${model}_sh${nb_shaves}.blob"
72 |
73 | echo Model: $model_xml $model_blob
74 | echo Shaves $nb_shaves
75 |
76 | ARG=$1
77 | source /opt/intel/openvino_2021/bin/setupvars.sh
78 |
79 | # Use FP16 and make the batch_size explicit.
80 | python /opt/intel/openvino_2021/deployment_tools/model_optimizer/mo_onnx.py \
81 | --input_model $model_onnx --data_type half
82 | /opt/intel/openvino_2021/deployment_tools/tools/compile_tool/compile_tool -d MYRIAD \
83 | -m $model_xml \
84 | -ip $precision \
85 | -VPU_NUMBER_OF_SHAVES $nb_shaves \
86 | -VPU_NUMBER_OF_CMX_SLICES $nb_shaves \
87 | -o $model_blob
88 |
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from BlazeposeRenderer import BlazeposeRenderer
4 | import argparse
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('-e', '--edge', action="store_true",
8 | help="Use Edge mode (postprocessing runs on the device)")
9 | parser_tracker = parser.add_argument_group("Tracker arguments")
10 | parser_tracker.add_argument('-i', '--input', type=str, default="rgb",
11 | help="'rgb' or 'rgb_laconic' or path to video/image file to use as input (default=%(default)s)")
12 | parser_tracker.add_argument("--pd_m", type=str,
13 | help="Path to an .blob file for pose detection model")
14 | parser_tracker.add_argument("--lm_m", type=str,
15 | help="Landmark model ('full' or 'lite' or 'heavy') or path to an .blob file")
16 | parser_tracker.add_argument('-xyz', '--xyz', action="store_true",
17 | help="Get (x,y,z) coords of reference body keypoint in camera coord system (only for compatible devices)")
18 | parser_tracker.add_argument('-c', '--crop', action="store_true",
19 | help="Center crop frames to a square shape before feeding pose detection model")
20 | parser_tracker.add_argument('--no_smoothing', action="store_true",
21 | help="Disable smoothing filter")
22 | parser_tracker.add_argument('-f', '--internal_fps', type=int,
23 | help="Fps of internal color camera. Too high value lower NN fps (default= depends on the model)")
24 | parser_tracker.add_argument('--internal_frame_height', type=int, default=640,
25 | help="Internal color camera frame height in pixels (default=%(default)i)")
26 | parser_tracker.add_argument('-s', '--stats', action="store_true",
27 | help="Print some statistics at exit")
28 | parser_tracker.add_argument('-t', '--trace', action="store_true",
29 | help="Print some debug messages")
30 | parser_tracker.add_argument('--force_detection', action="store_true",
31 | help="Force person detection on every frame (never use landmarks from previous frame to determine ROI)")
32 |
33 | parser_renderer = parser.add_argument_group("Renderer arguments")
34 | parser_renderer.add_argument('-3', '--show_3d', choices=[None, "image", "world", "mixed"], default=None,
35 | help="Display skeleton in 3d in a separate window. See README for description.")
36 | parser_renderer.add_argument("-o","--output",
37 | help="Path to output video file")
38 |
39 |
40 | args = parser.parse_args()
41 |
42 | if args.edge:
43 | from BlazeposeDepthaiEdge import BlazeposeDepthai
44 | else:
45 | from BlazeposeDepthai import BlazeposeDepthai
46 | tracker = BlazeposeDepthai(input_src=args.input,
47 | pd_model=args.pd_m,
48 | lm_model=args.lm_m,
49 | smoothing=not args.no_smoothing,
50 | xyz=args.xyz,
51 | crop=args.crop,
52 | internal_fps=args.internal_fps,
53 | internal_frame_height=args.internal_frame_height,
54 | force_detection=args.force_detection,
55 |                 stats=args.stats,
56 | trace=args.trace)
57 |
58 | renderer = BlazeposeRenderer(
59 | tracker,
60 | show_3d=args.show_3d,
61 | output=args.output)
62 |
63 | while True:
64 | # Run blazepose on next frame
65 | frame, body = tracker.next_frame()
66 | if frame is None: break
67 | # Draw 2d skeleton
68 | frame = renderer.draw(frame, body)
69 | key = renderer.waitKey(delay=1)
70 | if key == 27 or key == ord('q'):
71 | break
72 | renderer.exit()
73 | tracker.exit()
--------------------------------------------------------------------------------
/docker_tflite2tensorflow.sh:
--------------------------------------------------------------------------------
1 | # Used to convert original Mediapipe tflite models to Openvino IR and MyriadX blob
2 | # https://github.com/PINTO0309/tflite2tensorflow
3 | #
4 | # First run: docker pull pinto0309/tflite2tensorflow
5 | docker run -it --rm \
6 | -v `pwd`:/home/user/workdir \
7 | -v /tmp/.X11-unix/:/tmp/.X11-unix:rw \
8 | --device /dev/video0:/dev/video0:mwr \
9 | --net=host \
10 | -e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \
11 | -e DISPLAY=$DISPLAY \
12 | --privileged \
13 | pinto0309/tflite2tensorflow:latest bash
14 |
15 |
--------------------------------------------------------------------------------
/examples/semaphore_alphabet/README.md:
--------------------------------------------------------------------------------
1 | # Semaphore alphabet with Blazepose on DepthAI
2 |
3 | This demo demonstrates the recognition of the flag semaphore alphabet.
4 |
5 | For each arm (the shoulder-elbow segment), its angle with the vertical axis is calculated. The pair of left and right arm angles gives the letter (see the sketch below).
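As a minimal sketch of the idea (the real code is in `demo.py`; the helper name `angle_to_sector` is only illustrative), each arm angle is quantized into one of 8 sectors of 45°, and the pair of sectors indexes the letter table:

```
def angle_to_sector(angle_deg):
    # angle_deg: angle of the shoulder-elbow vector with the vertical axis, in degrees
    # The +202.5 offset centers each 45-degree sector before the integer division.
    return int((angle_deg + 202.5) / 45) % 8

# letter = semaphore_flag.get((angle_to_sector(right_arm_angle), angle_to_sector(left_arm_angle)), None)
```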
6 |
7 |
8 | 
9 |
10 | ## Usage
11 |
12 | ```
13 | -> python3 demo.py -h
14 | usage: demo.py [-h] [-m {full,lite,831}] [-i INPUT] [-o OUTPUT]
15 |
16 | optional arguments:
17 | -h, --help show this help message and exit
18 | -m {full,lite,831}, --model {full,lite,831}
19 |                         Landmark model to use (default=full)
20 | -i INPUT, --input INPUT
21 | 'rgb' or 'rgb_laconic' or path to video/image file to
22 | use as input (default: rgb)
23 | -o OUTPUT, --output OUTPUT
24 | Path to output video file
25 | ```
26 |
27 | ## Credits
28 |
29 | * [Semaphore with The RCR Museum](https://www.youtube.com/watch?v=DezaTjQYPh0&ab_channel=TheRoyalCanadianRegimentMuseum)
--------------------------------------------------------------------------------
/examples/semaphore_alphabet/demo.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | from math import atan2, degrees
3 | import sys
4 | sys.path.append("../..")
5 | from BlazeposeDepthai import BlazeposeDepthai
6 | from BlazeposeRenderer import BlazeposeRenderer
7 | from mediapipe_utils import KEYPOINT_DICT
8 | import argparse
9 |
10 |
11 | # For gesture demo
12 | semaphore_flag = {
13 | (3,4):'A', (2,4):'B', (1,4):'C', (0,4):'D',
14 | (4,7):'E', (4,6):'F', (4,5):'G', (2,3):'H',
15 | (0,3):'I', (0,6):'J', (3,0):'K', (3,7):'L',
16 | (3,6):'M', (3,5):'N', (2,1):'O', (2,0):'P',
17 | (2,7):'Q', (2,6):'R', (2,5):'S', (1,0):'T',
18 | (1,7):'U', (0,5):'V', (7,6):'W', (7,5):'X',
19 | (1,6):'Y', (5,6):'Z',
20 | }
21 |
22 | def recognize_gesture(b):
23 | # b: body
24 |
25 | def angle_with_y(v):
26 | # v: 2d vector (x,y)
27 |         # Returns angle in degrees of v with the y-axis of the image plane
28 | if v[1] == 0:
29 | return 90
30 | angle = atan2(v[0], v[1])
31 | return degrees(angle)
32 |
33 | # For the demo, we want to recognize the flag semaphore alphabet
34 | # For this task, we just need to measure the angles of both arms with vertical
35 | right_arm_angle = angle_with_y(b.landmarks[KEYPOINT_DICT['right_elbow'],:2] - b.landmarks[KEYPOINT_DICT['right_shoulder'],:2])
36 | left_arm_angle = angle_with_y(b.landmarks[KEYPOINT_DICT['left_elbow'],:2] - b.landmarks[KEYPOINT_DICT['left_shoulder'],:2])
37 | right_pose = int((right_arm_angle +202.5) / 45) % 8
38 | left_pose = int((left_arm_angle +202.5) / 45) % 8
39 | letter = semaphore_flag.get((right_pose, left_pose), None)
40 | return letter
41 |
42 | parser = argparse.ArgumentParser()
43 | parser.add_argument("-m", "--model", type=str, choices=['full', 'lite', '831'], default='full',
44 |                     help="Landmark model to use (default=%(default)s)")
45 | parser.add_argument('-i', '--input', type=str, default='rgb',
46 | help="'rgb' or 'rgb_laconic' or path to video/image file to use as input (default: %(default)s)")
47 | parser.add_argument("-o","--output",
48 | help="Path to output video file")
49 | args = parser.parse_args()
50 |
51 | pose = BlazeposeDepthai(input_src=args.input, lm_model=args.model)
52 | renderer = BlazeposeRenderer(pose, output=args.output)
53 |
54 | while True:
55 | # Run blazepose on next frame
56 | frame, body = pose.next_frame()
57 | if frame is None: break
58 | # Draw 2d skeleton
59 | frame = renderer.draw(frame, body)
60 | # Gesture recognition
61 | if body:
62 | letter = recognize_gesture(body)
63 | if letter:
64 | cv2.putText(frame, letter, (frame.shape[1] // 2, 100), cv2.FONT_HERSHEY_PLAIN, 5, (0,190,255), 3)
65 | key = renderer.waitKey(delay=1)
66 | if key == 27 or key == ord('q'):
67 | break
68 | renderer.exit()
69 | pose.exit()
70 |
71 |
--------------------------------------------------------------------------------
/examples/semaphore_alphabet/medias/semaphore.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/examples/semaphore_alphabet/medias/semaphore.gif
--------------------------------------------------------------------------------
/examples/semaphore_alphabet/medias/semaphore.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/examples/semaphore_alphabet/medias/semaphore.mp4
--------------------------------------------------------------------------------
/examples/semaphore_alphabet/medias/video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/examples/semaphore_alphabet/medias/video.mp4
--------------------------------------------------------------------------------
/img/3d_visualizations.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/3d_visualizations.gif
--------------------------------------------------------------------------------
/img/3d_world_visualization.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/3d_world_visualization.gif
--------------------------------------------------------------------------------
/img/full_body_landmarks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/full_body_landmarks.png
--------------------------------------------------------------------------------
/img/pipeline_edge_mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/pipeline_edge_mode.png
--------------------------------------------------------------------------------
/img/pipeline_host_mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/pipeline_host_mode.png
--------------------------------------------------------------------------------
/img/taichi.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/img/taichi.gif
--------------------------------------------------------------------------------
/mediapipe_utils.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | from collections import namedtuple
4 | from math import ceil, sqrt, pi, floor, sin, cos, atan2, gcd
5 | from collections import namedtuple
6 |
7 | # To not display: RuntimeWarning: overflow encountered in exp
8 | # in line: scores = 1 / (1 + np.exp(-scores))
9 | np.seterr(over='ignore')
10 |
11 | # Dictionary that maps from joint names to keypoint indices.
12 | KEYPOINT_DICT = {
13 | "nose": 0,
14 | "left_eye_inner": 1,
15 | "left_eye": 2,
16 | "left_eye_outer": 3,
17 | "right_eye_inner": 4,
18 | "right_eye": 5,
19 | "right_eye_outer": 6,
20 | "left_ear": 7,
21 | "right_ear": 8,
22 | "mouth_left": 9,
23 | "mouth_right": 10,
24 | "left_shoulder": 11,
25 | "right_shoulder": 12,
26 | "left_elbow": 13,
27 | "right_elbow": 14,
28 | "left_wrist": 15,
29 | "right_wrist": 16,
30 | "left_pinky": 17,
31 | "right_pinky": 18,
32 | "left_index": 19,
33 | "right_index": 20,
34 | "left_thumb": 21,
35 | "right_thumb": 22,
36 | "left_hip": 23,
37 | "right_hip": 24,
38 | "left_knee": 25,
39 | "right_knee": 26,
40 | "left_ankle": 27,
41 | "right_ankle": 28,
42 | "left_heel": 29,
43 | "right_heel": 30,
44 | "left_foot_index": 31,
45 | "right_foot_index": 32
46 | }
47 |
48 | class Body:
49 | def __init__(self, pd_score=None, pd_box=None, pd_kps=None):
50 | """
51 | Attributes:
52 | pd_score : detection score
53 | pd_box : detection box [x, y, w, h], normalized [0,1] in the squared image
54 | pd_kps : detection keypoints coordinates [x, y], normalized [0,1] in the squared image
55 | rect_x_center, rect_y_center : center coordinates of the rotated bounding rectangle, normalized [0,1] in the squared image
56 | rect_w, rect_h : width and height of the rotated bounding rectangle, normalized in the squared image (may be > 1)
57 | rotation : rotation angle of rotated bounding rectangle with y-axis in radian
58 | rect_x_center_a, rect_y_center_a : center coordinates of the rotated bounding rectangle, in pixels in the squared image
59 |             rect_w_a, rect_h_a : width and height of the rotated bounding rectangle, in pixels in the squared image
60 | rect_points : list of the 4 points coordinates of the rotated bounding rectangle, in pixels
61 | expressed in the squared image during processing,
62 | expressed in the original rectangular image when returned to the user
63 | lm_score: global landmark score
64 | norm_landmarks : 3D landmarks coordinates in the rotated bounding rectangle, normalized [0,1]
65 | landmarks : 3D landmarks coordinates in the rotated bounding rectangle, in pixel in the original rectangular image
66 | world_landmarks : 3D landmarks coordinates in meter with mid hips point being the origin.
67 | The y value of landmarks_world coordinates is negative for landmarks
68 |                     above the mid hips (like shoulders) and positive for landmarks below (like feet)
69 | xyz: (optionally) 3D location in camera coordinate system of reference point (mid hips or mid shoulders)
70 | xyz_ref: (optionally) name of the reference point ("mid_hips" or "mid_shoulders"),
71 | xyz_zone: (optionally) 4 int array of zone (in the source image) on which is measured depth.
72 | xyz_zone[0:2] is top-left zone corner in pixels, xyz_zone[2:4] is bottom-right zone corner
73 | """
74 | self.pd_score = pd_score
75 | self.pd_box = pd_box
76 | self.pd_kps = pd_kps
77 | def print(self):
78 | attrs = vars(self)
79 | print('\n'.join("%s: %s" % item for item in attrs.items()))
80 |
81 |
82 | SSDAnchorOptions = namedtuple('SSDAnchorOptions',[
83 | 'num_layers',
84 | 'min_scale',
85 | 'max_scale',
86 | 'input_size_height',
87 | 'input_size_width',
88 | 'anchor_offset_x',
89 | 'anchor_offset_y',
90 | 'strides',
91 | 'aspect_ratios',
92 | 'reduce_boxes_in_lowest_layer',
93 | 'interpolated_scale_aspect_ratio',
94 | 'fixed_anchor_size'])
95 |
96 | def calculate_scale(min_scale, max_scale, stride_index, num_strides):
97 | if num_strides == 1:
98 | return (min_scale + max_scale) / 2
99 | else:
100 | return min_scale + (max_scale - min_scale) * stride_index / (num_strides - 1)
101 |
102 | def generate_anchors(options):
103 | """
104 | option : SSDAnchorOptions
105 | # https://github.com/google/mediapipe/blob/master/mediapipe/calculators/tflite/ssd_anchors_calculator.cc
106 | """
107 | anchors = []
108 | layer_id = 0
109 | n_strides = len(options.strides)
110 | while layer_id < n_strides:
111 | anchor_height = []
112 | anchor_width = []
113 | aspect_ratios = []
114 | scales = []
115 | # For same strides, we merge the anchors in the same order.
116 | last_same_stride_layer = layer_id
117 | while last_same_stride_layer < n_strides and \
118 | options.strides[last_same_stride_layer] == options.strides[layer_id]:
119 | scale = calculate_scale(options.min_scale, options.max_scale, last_same_stride_layer, n_strides)
120 | if last_same_stride_layer == 0 and options.reduce_boxes_in_lowest_layer:
121 | # For first layer, it can be specified to use predefined anchors.
122 | aspect_ratios += [1.0, 2.0, 0.5]
123 | scales += [0.1, scale, scale]
124 | else:
125 | aspect_ratios += options.aspect_ratios
126 | scales += [scale] * len(options.aspect_ratios)
127 | if options.interpolated_scale_aspect_ratio > 0:
128 | if last_same_stride_layer == n_strides -1:
129 | scale_next = 1.0
130 | else:
131 | scale_next = calculate_scale(options.min_scale, options.max_scale, last_same_stride_layer+1, n_strides)
132 | scales.append(sqrt(scale * scale_next))
133 | aspect_ratios.append(options.interpolated_scale_aspect_ratio)
134 | last_same_stride_layer += 1
135 |
136 | for i,r in enumerate(aspect_ratios):
137 | ratio_sqrts = sqrt(r)
138 | anchor_height.append(scales[i] / ratio_sqrts)
139 | anchor_width.append(scales[i] * ratio_sqrts)
140 |
141 | stride = options.strides[layer_id]
142 | feature_map_height = ceil(options.input_size_height / stride)
143 | feature_map_width = ceil(options.input_size_width / stride)
144 |
145 | for y in range(feature_map_height):
146 | for x in range(feature_map_width):
147 | for anchor_id in range(len(anchor_height)):
148 | x_center = (x + options.anchor_offset_x) / feature_map_width
149 | y_center = (y + options.anchor_offset_y) / feature_map_height
150 | # new_anchor = Anchor(x_center=x_center, y_center=y_center)
151 | if options.fixed_anchor_size:
152 | new_anchor = [x_center, y_center, 1.0, 1.0]
153 | # new_anchor.w = 1.0
154 | # new_anchor.h = 1.0
155 | else:
156 | new_anchor = [x_center, y_center, anchor_width[anchor_id], anchor_height[anchor_id]]
157 | # new_anchor.w = anchor_width[anchor_id]
158 | # new_anchor.h = anchor_height[anchor_id]
159 | anchors.append(new_anchor)
160 |
161 | layer_id = last_same_stride_layer
162 | return np.array(anchors)
163 |
164 | def generate_blazepose_anchors():
165 | # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt
166 | anchor_options = SSDAnchorOptions(
167 | num_layers=5,
168 | min_scale=0.1484375,
169 | max_scale=0.75,
170 | input_size_height=224,
171 | input_size_width=224,
172 | anchor_offset_x=0.5,
173 | anchor_offset_y=0.5,
174 | strides=[8, 16, 32, 32, 32],
175 | aspect_ratios= [1.0],
176 | reduce_boxes_in_lowest_layer=False,
177 | interpolated_scale_aspect_ratio=1.0,
178 | fixed_anchor_size=True)
179 | return generate_anchors(anchor_options)
180 |
181 | def decode_bboxes(score_thresh, scores, bboxes, anchors, best_only=False):
182 | """
183 | wi, hi : NN input shape
184 | https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt
185 | # Decodes the detection tensors generated by the TensorFlow Lite model, based on
186 | # the SSD anchors and the specification in the options, into a vector of
187 | # detections. Each detection describes a detected object.
188 | Version 0.8.3.1:
189 | node {
190 | calculator: "TensorsToDetectionsCalculator"
191 | input_stream: "TENSORS:detection_tensors"
192 | input_side_packet: "ANCHORS:anchors"
193 | output_stream: "DETECTIONS:unfiltered_detections"
194 | options: {
195 | [mediapipe.TensorsToDetectionsCalculatorOptions.ext] {
196 | num_classes: 1
197 | num_boxes: 896
198 | num_coords: 12
199 | box_coord_offset: 0
200 | keypoint_coord_offset: 4
201 | num_keypoints: 4
202 | num_values_per_keypoint: 2
203 | sigmoid_score: true
204 | score_clipping_thresh: 100.0
205 | reverse_output_order: true
206 | x_scale: 128.0
207 | y_scale: 128.0
208 | h_scale: 128.0
209 | w_scale: 128.0
210 | min_score_thresh: 0.5
211 | }
212 | }
213 |
214 | Version 0.8.4:
215 | [mediapipe.TensorsToDetectionsCalculatorOptions.ext] {
216 | num_classes: 1
217 | num_boxes: 2254
218 | num_coords: 12
219 | box_coord_offset: 0
220 | keypoint_coord_offset: 4
221 | num_keypoints: 4
222 | num_values_per_keypoint: 2
223 | sigmoid_score: true
224 | score_clipping_thresh: 100.0
225 | reverse_output_order: true
226 | x_scale: 224.0
227 | y_scale: 224.0
228 | h_scale: 224.0
229 | w_scale: 224.0
230 | min_score_thresh: 0.5
231 | }
232 |
233 | # Bounding box in each pose detection is currently set to the bounding box of
234 | # the detected face. However, 4 additional key points are available in each
235 | # detection, which are used to further calculate a (rotated) bounding box that
236 | # encloses the body region of interest. Among the 4 key points, the first two
237 | # are for identifying the full-body region, and the second two for upper body
238 | # only:
239 | #
240 | # Key point 0 - mid hip center
241 | # Key point 1 - point that encodes size & rotation (for full body)
242 | # Key point 2 - mid shoulder center
243 | # Key point 3 - point that encodes size & rotation (for upper body)
244 | #
245 |
246 |     scores: shape = [number of anchors] (896 for version 0.8.3.1, 2254 for version 0.8.4)
247 |     bboxes: shape = [number of anchors x 12], 12 = 4 (bounding box: (cx,cy,w,h)) + 8 (4 body keypoints: (x,y) each)
248 | """
249 | bodies = []
250 | scores = 1 / (1 + np.exp(-scores))
251 | if best_only:
252 | best_id = np.argmax(scores)
253 | if scores[best_id] < score_thresh: return bodies
254 | det_scores = scores[best_id:best_id+1]
255 | det_bboxes = bboxes[best_id:best_id+1]
256 | det_anchors = anchors[best_id:best_id+1]
257 | else:
258 | detection_mask = scores > score_thresh
259 | det_scores = scores[detection_mask]
260 | if det_scores.size == 0: return bodies
261 | det_bboxes = bboxes[detection_mask]
262 | det_anchors = anchors[detection_mask]
263 |
264 | scale = 224 # x_scale, y_scale, w_scale, h_scale
265 |
266 | # cx, cy, w, h = bboxes[i,:4]
267 | # cx = cx * anchor.w / wi + anchor.x_center
268 | # cy = cy * anchor.h / hi + anchor.y_center
269 | # lx = lx * anchor.w / wi + anchor.x_center
270 | # ly = ly * anchor.h / hi + anchor.y_center
271 | det_bboxes = det_bboxes* np.tile(det_anchors[:,2:4], 6) / scale + np.tile(det_anchors[:,0:2],6)
272 |     # w = w * anchor.w / wi (in the previous line, we added anchor.x_center and anchor.y_center to w and h, so we need to subtract them now)
273 | # h = h * anchor.h / hi
274 | det_bboxes[:,2:4] = det_bboxes[:,2:4] - det_anchors[:,0:2]
275 | # box = [cx - w*0.5, cy - h*0.5, w, h]
276 | det_bboxes[:,0:2] = det_bboxes[:,0:2] - det_bboxes[:,3:4] * 0.5
277 |
278 | for i in range(det_bboxes.shape[0]):
279 | score = det_scores[i]
280 | box = det_bboxes[i,0:4]
281 | kps = []
282 | for kp in range(4):
283 | kps.append(det_bboxes[i,4+kp*2:6+kp*2])
284 | bodies.append(Body(float(score), box, kps))
285 | return bodies
286 |
287 |
288 | def non_max_suppression(bodies, nms_thresh):
289 |
290 | # cv2.dnn.NMSBoxes(boxes, scores, 0, nms_thresh) needs:
291 | # boxes = [ [x, y, w, h], ...] with x, y, w, h of type int
292 | # Currently, x, y, w, h are float between 0 and 1, so we arbitrarily multiply by 1000 and cast to int
293 | # boxes = [r.box for r in bodies]
294 | boxes = [ [int(x*1000) for x in r.pd_box] for r in bodies]
295 | scores = [r.pd_score for r in bodies]
296 | indices = cv2.dnn.NMSBoxes(boxes, scores, 0, nms_thresh)
297 | return [bodies[i[0]] for i in indices]
298 |
299 | def normalize_radians(angle):
300 | return angle - 2 * pi * floor((angle + pi) / (2 * pi))
301 |
302 | def rot_vec(vec, rotation):
303 | vx, vy = vec
304 | return [vx * cos(rotation) - vy * sin(rotation), vx * sin(rotation) + vy * cos(rotation)]
305 |
306 | def detections_to_rect(body, kp_pair=[0,1]):
307 | # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_landmark/pose_detection_to_roi.pbtxt
308 | # # Converts pose detection into a rectangle based on center and scale alignment
309 | # # points. Pose detection contains four key points: first two for full-body pose
310 | # # and two more for upper-body pose.
311 | # node {
312 | # calculator: "SwitchContainer"
313 | # input_side_packet: "ENABLE:upper_body_only"
314 | # input_stream: "DETECTION:detection"
315 | # input_stream: "IMAGE_SIZE:image_size"
316 | # output_stream: "NORM_RECT:raw_roi"
317 | # options {
318 | # [mediapipe.SwitchContainerOptions.ext] {
319 | # contained_node: {
320 | # calculator: "AlignmentPointsRectsCalculator"
321 | # options: {
322 | # [mediapipe.DetectionsToRectsCalculatorOptions.ext] {
323 | # rotation_vector_start_keypoint_index: 0
324 | # rotation_vector_end_keypoint_index: 1
325 | # rotation_vector_target_angle_degrees: 90
326 | # }
327 | # }
328 | # }
329 | # contained_node: {
330 | # calculator: "AlignmentPointsRectsCalculator"
331 | # options: {
332 | # [mediapipe.DetectionsToRectsCalculatorOptions.ext] {
333 | # rotation_vector_start_keypoint_index: 2
334 | # rotation_vector_end_keypoint_index: 3
335 | # rotation_vector_target_angle_degrees: 90
336 | # }
337 | # }
338 | # }
339 | # }
340 | # }
341 | # }
342 |
343 | target_angle = pi * 0.5 # 90 = pi/2
344 |
345 | # AlignmentPointsRectsCalculator : https://github.com/google/mediapipe/blob/master/mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
346 | x_center, y_center = body.pd_kps[kp_pair[0]]
347 | x_scale, y_scale = body.pd_kps[kp_pair[1]]
348 | # Bounding box size as double distance from center to scale point.
349 | box_size = sqrt((x_scale-x_center)**2 + (y_scale-y_center)**2) * 2
350 | body.rect_w = box_size
351 | body.rect_h = box_size
352 | body.rect_x_center = x_center
353 | body.rect_y_center = y_center
354 |
355 | rotation = target_angle - atan2(-(y_scale - y_center), x_scale - x_center)
356 | body.rotation = normalize_radians(rotation)
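# Worked example (illustrative, not in the original source): with the center keypoint
# at (0.5, 0.5) and the scale keypoint at (0.5, 0.3), box_size = 2 * 0.2 = 0.4 and
# rotation = pi/2 - atan2(0.2, 0) = 0, i.e. a body standing upright in the image
# yields a zero rotation.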
357 |
358 | def rotated_rect_to_points(cx, cy, w, h, rotation):
359 | b = cos(rotation) * 0.5
360 | a = sin(rotation) * 0.5
361 | points = []
362 | p0x = cx - a*h - b*w
363 | p0y = cy + b*h - a*w
364 | p1x = cx + a*h - b*w
365 | p1y = cy - b*h - a*w
366 | p2x = int(2*cx - p0x)
367 | p2y = int(2*cy - p0y)
368 | p3x = int(2*cx - p1x)
369 | p3y = int(2*cy - p1y)
370 | p0x, p0y, p1x, p1y = int(p0x), int(p0y), int(p1x), int(p1y)
371 | return [[p0x,p0y], [p1x,p1y], [p2x,p2y], [p3x,p3y]]
372 |
373 | def rect_transformation(body, w, h, scale = 1.25):
374 | """
375 | w, h : image input shape
376 | """
377 | # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_landmark/pose_detection_to_roi.pbtxt
378 | # # Expands pose rect with margin used during training.
379 | # node {
380 | # calculator: "RectTransformationCalculator"
381 | # input_stream: "NORM_RECT:raw_roi"
382 | # input_stream: "IMAGE_SIZE:image_size"
383 | # output_stream: "roi"
384 | # options: {
385 | # [mediapipe.RectTransformationCalculatorOptions.ext] {
386 | # Version 0831:
387 | # scale_x: 1.5
388 | # scale_y: 1.5
389 | # Version 084:
390 | # scale_x: 1.25
391 | # scale_y: 1.25
392 | # square_long: true
393 | # }
394 | # }
395 | # }
396 | scale_x = scale
397 | scale_y = scale
398 | shift_x = 0
399 | shift_y = 0
400 |
401 | width = body.rect_w
402 | height = body.rect_h
403 | rotation = body.rotation
404 | if rotation == 0:
405 | body.rect_x_center_a = (body.rect_x_center + width * shift_x) * w
406 | body.rect_y_center_a = (body.rect_y_center + height * shift_y) * h
407 | else:
408 | x_shift = (w * width * shift_x * cos(rotation) - h * height * shift_y * sin(rotation))
409 | y_shift = (w * width * shift_x * sin(rotation) + h * height * shift_y * cos(rotation))
410 | body.rect_x_center_a = body.rect_x_center*w + x_shift
411 | body.rect_y_center_a = body.rect_y_center*h + y_shift
412 |
413 | # square_long: true
414 | long_side = max(width * w, height * h)
415 | body.rect_w_a = long_side * scale_x
416 | body.rect_h_a = long_side * scale_y
417 | body.rect_points = rotated_rect_to_points(body.rect_x_center_a, body.rect_y_center_a, body.rect_w_a, body.rect_h_a, body.rotation)
418 |
419 | def warp_rect_img(rect_points, img, w, h):
420 | src = np.array(rect_points[1:], dtype=np.float32) # rect_points[0] is the bottom-left point!
421 | dst = np.array([(0, 0), (w, 0), (w, h)], dtype=np.float32)
422 | mat = cv2.getAffineTransform(src, dst)
423 | return cv2.warpAffine(img, mat, (w, h))
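# Typical call chain (sketch of how these helpers fit together; 'frame' and its
# dimensions are illustrative names):
#   detections_to_rect(body)                                  # center, size, rotation from the alignment keypoints
#   rect_transformation(body, frame_width, frame_height)      # expand to a square, rotated ROI (scale 1.25)
#   crop = warp_rect_img(body.rect_points, frame, 256, 256)   # warp the ROI to the landmark model input size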
424 |
425 | def distance(a, b):
426 | """
427 | a, b: 2 points in 3D (x,y,z)
428 | """
429 | return np.linalg.norm(a-b)
430 |
431 | def angle(a, b, c):
432 | # https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates
433 | # a, b and c : points as np.array([x, y, z])
434 | ba = a - b
435 | bc = c - b
436 | cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
437 | angle = np.arccos(cosine_angle)
438 |
439 | return np.degrees(angle)
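# Quick check (illustrative): angle(np.array([1,0,0]), np.array([0,0,0]), np.array([0,1,0]))
# returns 90.0, the angle at vertex b between the two unit vectors.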
440 |
441 | #
442 | def find_isp_scale_params(size, is_height=True):
443 | """
444 | Find the closest valid size to 'size' and the corresponding parameters to pass to setIspScale()
445 | This function is useful to work around a bug in depthai where ImageManip is scrambling images that have an invalid size
446 | is_height : boolean that indicates if the value is the height or the width of the image
447 | Returns: valid size, (numerator, denominator)
448 | """
449 | # We want size >= 288
450 | if size < 288:
451 | size = 288
452 |
453 | # We are looking for the list of sizes (multiples of 16) that
454 | # can be written as reference * n / d with n <= 16 and d <= 63
455 | if is_height:
456 | reference = 1080
457 | other = 1920
458 | else:
459 | reference = 1920
460 | other = 1080
461 | size_candidates = {}
462 | for s in range(288,reference,16):
463 | f = gcd(reference, s)
464 | n = s//f
465 | d = reference//f
466 | if n <= 16 and d <= 63 and round(other * n / d) % 2 == 0:
467 | size_candidates[s] = (n, d)
468 |
469 | # Which candidate size is closest to 'size'?
470 | min_dist = -1
471 | for s in size_candidates:
472 | dist = abs(size - s)
473 | if min_dist == -1:
474 | min_dist = dist
475 | candidate = s
476 | else:
477 | if dist > min_dist: break
478 | candidate = s
479 | min_dist = dist
480 | return candidate, size_candidates[candidate]
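# Worked example (computed by hand, not in the original source):
#   find_isp_scale_params(720) returns (720, (2, 3)) because 720 is a multiple of 16,
#   1080 * 2/3 = 720 and 1920 * 2/3 = 1280 is even.
# The caller would then typically call cam.setIspScale(2, 3) on the ColorCamera node.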
481 |
482 | #
483 | # Filtering
484 | #
485 |
486 | class LandmarksSmoothingFilter:
487 | '''
488 | Adapted from: https://github.com/google/mediapipe/blob/master/mediapipe/calculators/util/landmarks_smoothing_calculator.cc
489 |
490 | frequency, min_cutoff, beta, derivate_cutoff:
491 | See class OneEuroFilter description.
492 | min_allowed_object_scale:
493 | If calculated object scale is less than given value smoothing will be
494 | disabled and landmarks will be returned as is. Default=1e-6
495 | disable_value_scaling:
496 | Disable value scaling based on object size and use `1.0` instead.
497 | If not disabled, value scale is calculated as inverse value of object
498 | size. Object size is calculated as maximum side of rectangular bounding
499 | box of the object in XY plane. Default=False
500 | '''
501 | def __init__(self,
502 | frequency=30,
503 | min_cutoff=1,
504 | beta=0,
505 | derivate_cutoff=1,
506 | min_allowed_object_scale=1e-6,
507 | disable_value_scaling=False
508 | ):
509 | self.frequency = frequency
510 | self.min_cutoff = min_cutoff
511 | self.beta = beta
512 | self.derivate_cutoff = derivate_cutoff
513 | self.min_allowed_object_scale = min_allowed_object_scale
514 | self.disable_value_scaling = disable_value_scaling
515 | self.init = True
516 |
517 | @staticmethod
518 | def get_object_scale(landmarks):
519 | # Estimate object scale to use its inverse value as velocity scale for
520 | # RelativeVelocityFilter. If the value is too small (less than
521 | # `options_.min_allowed_object_scale`), smoothing will be disabled and
522 | # landmarks will be returned as is.
523 | # Object scale is calculated as average between bounding box width and height
524 | # with sides parallel to axis.
525 | min_xy = np.min(landmarks[:,:2], axis=0)
526 | max_xy = np.max(landmarks[:,:2], axis=0)
527 | return np.mean(max_xy - min_xy)
528 |
529 | def apply(self, landmarks, timestamp, object_scale=0):
530 | # object_scale: in practice, we use the size of the rotated rectangle region.rect_w_a=region.rect_h_a
531 |
532 | # Initialize filters
533 | if self.init:
534 | self.filters = OneEuroFilter(self.frequency, self.min_cutoff, self.beta, self.derivate_cutoff)
535 | self.init = False
536 |
537 | # Get value scale as inverse value of the object scale.
538 | # If value is too small smoothing will be disabled and landmarks will be
539 | # returned as is.
540 | if self.disable_value_scaling:
541 | value_scale = 1
542 | else:
543 | object_scale = object_scale if object_scale else self.get_object_scale(landmarks)
544 | if object_scale < self.min_allowed_object_scale:
545 | return landmarks
546 | value_scale = 1 / object_scale
547 |
548 | return self.filters.apply(landmarks, value_scale, timestamp)
549 |
550 | def get_alpha(self, cutoff):
551 | '''
552 | te = 1.0 / self.frequency
553 | tau = 1.0 / (2 * Math.PI * cutoff)
554 | result = 1 / (1.0 + (tau / te))
555 | '''
556 | return 1.0 / (1.0 + (self.frequency / (2 * pi * cutoff)))
557 |
558 | def reset(self):
559 | self.init = True
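# Minimal usage sketch (illustrative names and values, not taken from this repo's callers):
#   smoother = LandmarksSmoothingFilter(min_cutoff=0.05, beta=80, derivate_cutoff=1)
#   smoothed = smoother.apply(landmarks, timestamp, object_scale=body.rect_w_a)  # timestamp in seconds
#   smoother.reset()   # call when tracking is lost so the filter restarts cleanly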
560 |
561 | class OneEuroFilter:
562 | '''
563 | Adapted from: https://github.com/google/mediapipe/blob/master/mediapipe/util/filtering/one_euro_filter.cc
564 | Paper: https://cristal.univ-lille.fr/~casiez/1euro/
565 |
566 | frequency:
567 | Frequency of incoming frames, in frames per second. Used
568 | only if it can't be calculated from provided events (e.g.
569 | on the very first frame). Default=30
570 | min_cutoff:
571 | Minimum cutoff frequency. Start by tuning this parameter while
572 | keeping `beta=0` to reduce jittering to the desired level. 1Hz
573 | (the default value) is a good starting point.
574 | beta:
575 | Cutoff slope. After `min_cutoff` is configured, start
576 | increasing `beta` value to reduce the lag introduced by the
577 | `min_cutoff`. Find the desired balance between jittering and lag. Default=0
578 | derivate_cutoff:
579 | Cutoff frequency for derivate. It is set to 1Hz in the
580 | original algorithm, but can be tuned to further smooth the
581 | speed (i.e. derivate) of the object. Default=1
582 | '''
583 | def __init__(self,
584 | frequency=30,
585 | min_cutoff=1,
586 | beta=0,
587 | derivate_cutoff=1,
588 | ):
589 | self.frequency = frequency
590 | self.min_cutoff = min_cutoff
591 | self.beta = beta
592 | self.derivate_cutoff = derivate_cutoff
593 | self.x = LowPassFilter(self.get_alpha(min_cutoff))
594 | self.dx = LowPassFilter(self.get_alpha(derivate_cutoff))
595 | self.last_timestamp = 0
596 |
597 | def get_alpha(self, cutoff):
598 | '''
599 | te = 1.0 / self.frequency
600 | tau = 1.0 / (2 * Math.PI * cutoff)
601 | result = 1 / (1.0 + (tau / te))
602 | '''
603 | return 1.0 / (1.0 + (self.frequency / (2 * pi * cutoff)))
604 |
605 | def apply(self, value, value_scale, timestamp):
606 | '''
607 | Applies filter to the value.
608 | timestamp in s associated with the value (for instance,
609 | timestamp of the frame where you got value from).
610 | '''
611 | if self.last_timestamp >= timestamp:
612 | # Results are unpredictable in this case, so there is nothing to do but return the same value.
613 | return value
614 |
615 | # Update the sampling frequency based on timestamps.
616 | if self.last_timestamp != 0 and timestamp != 0:
617 | self.frequency = 1 / (timestamp - self.last_timestamp)
618 | self.last_timestamp = timestamp
619 |
620 | # Estimate the current variation per second.
621 | if self.x.has_last_raw_value():
622 | dvalue = (value - self.x.last_raw_value()) * value_scale * self.frequency
623 | else:
624 | dvalue = 0
625 | edvalue = self.dx.apply_with_alpha(dvalue, self.get_alpha(self.derivate_cutoff))
626 |
627 | # Use it to update the cutoff frequency
628 | cutoff = self.min_cutoff + self.beta * np.abs(edvalue)
629 |
630 | # filter the given value.
631 | return self.x.apply_with_alpha(value, self.get_alpha(cutoff))
632 |
633 | class LowPassFilter:
634 | '''
635 | Adapted from: https://github.com/google/mediapipe/blob/master/mediapipe/util/filtering/low_pass_filter.cc
636 | Note that 'value' can be a numpy array
637 | '''
638 | def __init__(self, alpha=0.9):
639 | self.alpha = alpha
640 | self.initialized = False
641 |
642 | def apply(self, value):
643 | if self.initialized:
644 | # Regular lowpass filter.
645 | # result = alpha * value + (1 - alpha) * stored_value;
646 | result = self.alpha * value + (1 - self.alpha) * self.stored_value
647 | else:
648 | result = value
649 | self.initialized = True
650 | self.raw_value = value
651 | self.stored_value = result
652 | return result
653 |
654 | def apply_with_alpha(self, value, alpha):
655 | self.alpha = alpha
656 | return self.apply(value)
657 |
658 | def has_last_raw_value(self):
659 | return self.initialized
660 |
661 | def last_raw_value(self):
662 | return self.raw_value
663 |
664 | def last_value(self):
665 | return self.stored_value
666 |
667 | def reset(self):
668 | self.initialized = False
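# Numeric example (illustrative): with alpha=0.5 and successive inputs 0, 10, 10,
# apply() returns 0 (the first value passes through), then 0.5*10 + 0.5*0 = 5,
# then 0.5*10 + 0.5*5 = 7.5.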
669 |
670 |
671 |
--------------------------------------------------------------------------------
/models/convert_models.sh:
--------------------------------------------------------------------------------
1 | # This script has to be run from the docker container started by ./docker_tflite2tensorflow.sh
2 |
3 | source /opt/intel/openvino_2021/bin/setupvars.sh
4 |
5 | replace () { # Replace in file $1 occurrences of the string $2 by $3
6 | sed "s/${2}/${3}/" $1 > tmpf
7 | mv tmpf $1
8 | }
9 |
10 | convert_model () {
11 | model_name=$1
12 | if [ -z "$2" ]
13 | then
14 | arg_mean_values=""
15 | else
16 | arg_mean_values="--mean_values ${2}"
17 | fi
18 | if [ -z "$3" ]
19 | then
20 | arg_scale_values=""
21 | else
22 | arg_scale_values="--scale_values ${3}"
23 | fi
24 | mean_values=$2
25 | scale_values=$3
26 | input_precision=$4
27 | tflite2tensorflow \
28 | --model_path ${model_name}.tflite \
29 | --model_output_path ${model_name} \
30 | --flatc_path ~/flatc \
31 | --schema_path ~/schema.fbs \
32 | --output_pb \
33 | --optimizing_for_openvino_and_myriad
34 | # For generating Openvino "non normalized input" models (the normalization would need to be done explicitly in the code):
35 | #tflite2tensorflow \
36 | # --model_path ${model_name}.tflite \
37 | # --model_output_path ${model_name} \
38 | # --flatc_path ~/flatc \
39 | # --schema_path ~/schema.fbs \
40 | # --output_openvino_and_myriad
41 | # Generate Openvino "normalized input" models
42 | /opt/intel/openvino_2021/deployment_tools/model_optimizer/mo_tf.py \
43 | --saved_model_dir ${model_name} \
44 | --model_name ${model_name} \
45 | --data_type FP16 \
46 | ${arg_mean_values} \
47 | ${arg_scale_values} \
48 | --reverse_input_channels
49 | # For Interpolate layers, replace in coordinate_transformation_mode, "half_pixel" by "align_corners" (bug optimizer)
50 | if [ "$model_name" != "pose_detection" ]
51 | then
52 | replace ${model_name}.xml half_pixel align_corners
53 | fi
54 |
55 | /opt/intel/openvino_2021/deployment_tools/inference_engine/lib/intel64/myriad_compile \
56 | -m ${model_name}.xml \
57 | -ip $input_precision \
58 | -VPU_NUMBER_OF_SHAVES 4 \
59 | -VPU_NUMBER_OF_CMX_SLICES 4 \
60 | -o ${model_name}_sh4.blob
61 | }
62 |
63 | convert_model pose_detection "[127.5,127.5,127.5]" "[127.5,127.5,127.5]" "u8"
64 | convert_model pose_landmark_full "" "" "fp16"
65 | convert_model pose_landmark_lite "" "" "fp16"
66 | convert_model pose_landmark_heavy "" "" "fp16"
67 |
68 |
69 |
--------------------------------------------------------------------------------
/models/gen_blob_shave.sh:
--------------------------------------------------------------------------------
1 | # This script has to be run from the docker container started by ./docker_tflite2tensorflow.sh
2 |
3 | usage ()
4 | {
5 | echo "Generate a new blob with a specified number of shaves and cmx (nb cmx = nb shaves)"
6 | echo "Usage: ${0} -m model -n nb_shaves"
7 | echo "model = 'pd' for pose detection, 'full' or 'lite' or 'heavy' for landmarks"
8 | echo "nb_shaves must be between 1 and 13"
9 | }
10 |
11 | while getopts ":hm:n:" opt; do
12 | case ${opt} in
13 | h )
14 | usage
15 | exit 0
16 | ;;
17 | m )
18 | model=$OPTARG
19 | ;;
20 | n )
21 | nb_shaves=$OPTARG
22 | ;;
23 | : )
24 | echo "Error: -$OPTARG requires an argument."
25 | usage
26 | exit 1
27 | ;;
28 | \? )
29 | echo "Invalid option: -$OPTARG"
30 | usage
31 | exit 1
32 | ;;
33 | esac
34 | done
35 | if [ -z "$model" ]
36 | then
37 | echo "Model not specified !"
38 | usage
39 | exit 1
40 | fi
41 | if [ $model == "pd" ]
42 | then
43 | model="pose_detection"
44 | input_precision="u8"
45 | elif [ $model == "full" ]
46 | then
47 | model="pose_landmark_full"
48 | input_precision="fp16"
49 | elif [ $model == "lite" ]
50 | then
51 | model="pose_landmark_lite"
52 | input_precision="fp16"
53 | elif [ $model == "heavy" ]
54 | then
55 | model="pose_landmark_heavy"
56 | input_precision="fp16"
57 | else
58 | echo "Invalid model !"
59 | usage
60 | exit 1
61 | fi
62 | model_xml="${model}.xml"
63 | if [ ! -f $model_xml ]
64 | then
65 | echo "The model ${model_xml} does not exist"
66 | echo "Have you run convert_models.sh ?"
67 | exit 1
68 | fi
69 | if [ -z "$nb_shaves" ]
70 | then
71 | echo "The number of shaves must be specified with -n"
72 | usage
73 | exit 1
74 | fi
75 | if [ $nb_shaves -lt 1 -o $nb_shaves -gt 13 ]
76 | then
77 | echo "Invalid number of shaves !"
78 | usage
79 | exit 1
80 | fi
81 |
82 | model_blob="${model}_sh${nb_shaves}.blob"
83 |
84 | echo Model: $model_xml $model_blob
85 | echo Shaves $nb_shaves
86 |
87 | source /opt/intel/openvino_2021/bin/setupvars.sh
88 |
89 |
90 | /opt/intel/openvino_2021/deployment_tools/inference_engine/lib/intel64/myriad_compile -m $model_xml -ip $input_precision -VPU_NUMBER_OF_SHAVES $nb_shaves -VPU_NUMBER_OF_CMX_SLICES $nb_shaves -o $model_blob
91 |
92 |
--------------------------------------------------------------------------------
/models/pose_detection_sh4.blob:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/models/pose_detection_sh4.blob
--------------------------------------------------------------------------------
/models/pose_landmark_full_sh4.blob:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/models/pose_landmark_full_sh4.blob
--------------------------------------------------------------------------------
/models/pose_landmark_heavy_sh4.blob:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/models/pose_landmark_heavy_sh4.blob
--------------------------------------------------------------------------------
/models/pose_landmark_lite_sh4.blob:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geaxgx/depthai_blazepose/d453e85231e36fc6625a1fc654cc4b5c24344a85/models/pose_landmark_lite_sh4.blob
--------------------------------------------------------------------------------
/o3d_utils.py:
--------------------------------------------------------------------------------
1 | import open3d as o3d
2 | import numpy as np
3 |
4 |
5 | # From : https://stackoverflow.com/a/59026582/8574085
6 |
7 | def calculate_zy_rotation_for_arrow(v):
8 | """
9 | Calculates the rotations required to go from the vector v to the
10 | z axis vector. The first rotation that is
11 | calculated is over the z axis. This will leave the vector v on the
12 | XZ plane. Then, the rotation over the y axis.
13 |
14 | Returns the rotation matrix (Rz @ Ry) used to bring a mesh built
15 | along the z axis into the same orientation as the vector v
16 |
17 | Args:
18 | - v (np.ndarray): 3D vector
19 | """
20 | # Rotation over z axis
21 | gamma = np.arctan(v[1]/v[0])
22 | Rz = np.array([[np.cos(gamma),-np.sin(gamma),0],
23 | [np.sin(gamma),np.cos(gamma),0],
24 | [0,0,1]])
25 | # Rotate v to calculate next rotation
26 | v = Rz.T@v.reshape(-1,1)
27 | v = v.reshape(-1)
28 | # Rotation over y axis
29 | beta = np.arctan(v[0]/v[2])
30 | Ry = np.array([[np.cos(beta),0,np.sin(beta)],
31 | [0,1,0],
32 | [-np.sin(beta),0,np.cos(beta)]])
33 | return Rz @ Ry
34 |
35 | def create_cylinder(height=1, radius=None, resolution=20):
36 | """
37 | Create a cylinder in Open3D
38 | """
39 | radius = height/20 if radius is None else radius
40 | mesh_frame = o3d.geometry.TriangleMesh.create_cylinder(
41 | radius=radius,
42 | height=height,
43 | resolution=resolution)
44 | return(mesh_frame)
45 |
46 | def create_segment(a, b, radius=0.05, color=(1,1,0), resolution=20):
47 | """
48 | Creates a line (a thin cylinder) from point a to point b.
49 | Args:
50 | - a, b: end points [x,y,z]
51 | - radius: radius of the cylinder
52 | - color: RGB color of the cylinder
53 | """
54 | a = np.array(a)
55 | b = np.array(b)
56 | T = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
57 | T[:3, -1] = a
58 | v = b-a
59 |
60 | height = np.linalg.norm(v)
61 | if height == 0: return None
62 | R = calculate_zy_rotation_for_arrow(v)
63 | mesh = create_cylinder(height, radius)
64 | mesh.rotate(R, center=np.array([0, 0, 0]))
65 | mesh.translate((a+b)/2)
66 | mesh.paint_uniform_color(color)
67 | mesh.compute_vertex_normals()
68 | return mesh
69 |
70 | def create_tetra(p1, p2, p3, p4, color=(1,1,0)):
71 | vertices = o3d.utility.Vector3dVector([p1, p2, p3, p4])
72 | tetras = o3d.utility.Vector4iVector([[0, 1, 2, 3]])
73 | mesh = o3d.geometry.TetraMesh(vertices, tetras)
74 | mesh.paint_uniform_color(color)
75 | return mesh
76 |
77 | def create_grid(p0, p1, p2, p3, ni1, ni2, color=(0,0,0)):
78 | '''
79 | p0, p1, p2, p3 : points defining a quadrilateral
80 | ni1: nb of equidistant intervals on segments p0p1 and p3p2
81 | ni2: nb of equidistant intervals on segments p1p2 and p0p3
82 | '''
83 | p0 = np.array(p0)
84 | p1 = np.array(p1)
85 | p2 = np.array(p2)
86 | p3 = np.array(p3)
87 |
88 | vertices = [p0, p1, p2, p3]
89 | lines = [[0,1],[0,3],[1,2],[2,3]]
90 | for i in range(1,ni1):
91 | l = len(vertices)
92 | vertices.append((p0*(ni1-i)+p1*i)/ni1)
93 | vertices.append((p3*(ni1-i)+p2*i)/ni1)
94 | lines.append([l,l+1])
95 | for i in range(1,ni2):
96 | l = len(vertices)
97 | vertices.append((p1*(ni2-i)+p2*i)/ni2)
98 | vertices.append((p0*(ni2-i)+p3*i)/ni2)
99 | lines.append([l,l+1])
100 | vertices = o3d.utility.Vector3dVector(vertices)
101 | lines = o3d.utility.Vector2iVector(lines)
102 | mesh = o3d.geometry.LineSet(vertices, lines)
103 | mesh.paint_uniform_color(color)
104 | return mesh
105 |
106 |
107 | def create_coord_frame(origin=[0, 0, 0],size=1):
108 | mesh = o3d.geometry.TriangleMesh.create_coordinate_frame(size=size)
109 | mesh.translate(origin)
110 | return mesh
111 |
112 |
113 | # Visu3d : custom class used to visualize 3d skeleton
114 | class Visu3D:
115 | def __init__(self, bg_color=[0,0,0], zoom=1, segment_radius=1):
116 | self.vis = o3d.visualization.VisualizerWithKeyCallback()
117 | self.vis.create_window()
118 | opt = self.vis.get_render_option()
119 | opt.background_color = np.asarray(bg_color)
120 | # Defining callbacks - Key codes: https://www.glfw.org/docs/latest/group__keys.html
121 | self.vis.register_key_callback(ord("R"), self.start_rotating)
122 | self.vis.register_key_callback(ord("O"), self.start_oscillating)
123 | self.vis.register_key_callback(ord("S"), self.stop_moving)
124 | self.vis.register_key_callback(262, self.turn_view_right) # Right arrow
125 | self.vis.register_key_callback(263, self.turn_view_left) # Left arrow
126 | self.vis.register_key_callback(265, self.incr_rot_speed) # Up arrow
127 | self.vis.register_key_callback(264, self.decr_rot_speed) # Down arrow
128 | self.view_control = self.vis.get_view_control()
129 | self.zoom = zoom
130 | self.segment_radius = segment_radius
131 | self.move = "oscillate"
132 | self.angle = 0
133 | self.direction = 1
134 | self.oscillate_angle = 200
135 | self.geometries = []
136 |
137 | def set_view(self):
138 | if self.angle_view % 4 == 0:
139 | ax = 0
140 | elif self.angle_view <= 3:
141 | ax = 1
142 | else:
143 | ax = -1
144 | if self.angle_view == 2 or self.angle_view == 6:
145 | az = 0
146 | elif 3 <= self.angle_view <= 5:
147 | az = 1
148 | else:
149 | az = -1
150 | self.view_control.set_front(np.array([ax,0,az]))
151 | self.view_control.set_up(np.array([0,-1,0]))
152 |
153 | def init_view(self):
154 | self.angle_view = 0
155 | self.rot_speed = 2
156 | self.set_view()
157 | self.view_control.set_zoom(self.zoom)
158 |
159 | def create_grid(self, p0, p1, p2, p3, ni1, ni2, color=(1,1,1)):
160 | '''
161 | p0, p1, p2, p3 : points defining a quadrilateral
162 | ni1: nb of equidistant intervals on segments p0p1 and p3p2
163 | ni2: nb of equidistant intervals on segments p1p2 and p0p3
164 | '''
165 | grid = create_grid(p0, p1, p2, p3, ni1, ni2, color)
166 | self.vis.add_geometry(grid)
167 | self.geometries.append(grid)
168 |
169 | def create_camera(self):
170 | cam = o3d.geometry.TriangleMesh.create_arrow(cylinder_radius=0.02, cone_radius=0.03, cylinder_height=0.1, cone_height=0.08)
171 | cam.paint_uniform_color([0.2,0.7,1])
172 | cam.compute_vertex_normals()
173 | self.geometries.append(cam)
174 |
175 | def add_geometries(self):
176 | for geo in self.geometries:
177 | self.vis.add_geometry(geo, reset_bounding_box=False)
178 |
179 | def add_segment(self, p1, p2, radius=None, color=[1,1,1]):
180 | radius = self.segment_radius if radius is None else radius
181 | line = create_segment(p1, p2, radius=radius, color=color)
182 | if line: self.vis.add_geometry(line, reset_bounding_box=False)
183 |
184 | def clear(self):
185 | self.vis.clear_geometries()
186 |
187 | # Callback
188 | def incr_rot_speed(self, vis):
189 | if self.move == "rotate":
190 | if self.rot_speed * self.direction == -1:
191 | self.direction = 1
192 | else:
193 | self.rot_speed += self.direction
194 | else:
195 | self.rot_speed += 1
196 | # Callback
197 | def decr_rot_speed(self, vis):
198 | if self.move == "rotate":
199 | if self.rot_speed * self.direction == 1:
200 | self.direction = -1
201 | else:
202 | self.rot_speed -= self.direction
203 | else:
204 | self.rot_speed = max (1, self.rot_speed-1)
205 | # Callback
206 | def turn_view_right(self, vis):
207 | self.angle_view = (self.angle_view + 1) %8
208 | self.set_view()
209 | self.move = None
210 | # Callback
211 | def turn_view_left(self, vis):
212 | self.angle_view = (self.angle_view - 1) %8
213 | self.set_view()
214 | self.move = None
215 | # Callback
216 | def start_rotating(self, vis):
217 | self.move = "rotate"
218 | # Callback
219 | def start_oscillating(self, vis):
220 | self.move = "oscillate"
221 | self.angle = 0
222 | # Callback
223 | def stop_moving(self, vis):
224 | self.move = None
225 |
226 | def try_move(self):
227 | if self.move == "rotate":
228 | self.view_control.rotate(self.rot_speed * self.direction,0)
229 | elif self.move == "oscillate":
230 | self.view_control.rotate(self.rot_speed * self.direction,0)
231 | self.angle += self.rot_speed * self.direction
232 | if abs(self.angle) >= self.oscillate_angle:
233 | self.direction = - self.direction
234 |
235 | def render(self):
236 | self.vis.poll_events()
237 | self.vis.update_renderer()
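# Typical render loop (sketch of how a renderer would drive Visu3D; every method
# used below is defined above, the points passed to create_grid/add_segment are
# illustrative):
#   visu = Visu3D(zoom=0.7, segment_radius=0.01)
#   visu.create_grid([-1,1,-1], [1,1,-1], [1,1,1], [-1,1,1], 2, 2)   # floor
#   visu.create_camera()
#   visu.init_view()
#   while True:
#       visu.clear()
#       visu.add_geometries()                       # re-add the static geometries
#       visu.add_segment(p1, p2, color=[1,0,0])     # one call per skeleton bone
#       visu.try_move()
#       visu.render()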
238 |
239 |
240 |
241 | if __name__ == "__main__":
242 |
243 | line = create_segment([0, 0, 0], [1, 0, 0], color=(1,0,0))
244 | line2 = create_segment([1, 0, 0], [1, 1, 0], color=(0,1,0))
245 | line3 = create_segment([1, 1, 0], [0, 0, 0], radius=0.1)
246 | grid = create_grid([0,0,0],[0,0,1],[0,1,1],[0,1,0], 3, 2)
247 | frame = create_coord_frame()
248 | print(grid)
249 | # Draw everything
250 | o3d.visualization.draw_geometries([line, line2, line3, grid, frame])
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | opencv-python >= 4.5.1.48
2 | open3d
3 | depthai >= 2.10
4 |
--------------------------------------------------------------------------------
/template_manager_script.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is the template of the scripting node source code in edge mode
3 | Substitution is made in BlazeposeDepthaiEdge.py
4 |
5 | In the following:
6 | rrn_ : normalized [0:1] coordinates in rotated rectangle coordinate systems
7 | sqn_ : normalized [0:1] coordinates in squared input image
8 | """
9 |
10 | ${_TRACE} ("Starting manager script node")
11 |
12 | import marshal
13 | from math import sin, cos, atan2, pi, hypot, degrees, floor
14 |
15 | # Indexes of some keypoints
16 | left_shoulder = 11
17 | right_shoulder = 12
18 | left_hip = 23
19 | right_hip = 24
20 |
21 | ${_IF_XYZ}
22 | # We use a filter for smoothing the reference point (mid hips or mid shoulders)
23 | # for which we fetch the (x,y,z). Without this filter, the reference point is very shaky
24 | class SmoothingFilter:
25 | def __init__(self, alpha):
26 | self.alpha = alpha
27 | self.initialized = False
28 | def apply(self, x):
29 | if self.initialized:
30 | result = self.alpha * x + (1 - self.alpha) * self.prev_x
31 | else:
32 | result = x
33 | self.initialized = True
34 | self.prev_x = result
35 | return int(result)
36 | def reset(self):
37 | self.initialized = False
38 |
39 | filter_x = SmoothingFilter(0.5)
40 | filter_y = SmoothingFilter(0.5)
41 | ${_IF_XYZ}
42 |
43 | # BufferMgr is used to statically allocate buffers once
44 | # (instead of reallocating them dynamically on every send).
45 | # These buffers are used for sending results to the host
46 | class BufferMgr:
47 | def __init__(self):
48 | self._bufs = {}
49 | def __call__(self, size):
50 | try:
51 | buf = self._bufs[size]
52 | except KeyError:
53 | buf = self._bufs[size] = Buffer(size)
54 | ${_TRACE} (f"New buffer allocated: {size}")
55 | return buf
56 |
57 | buffer_mgr = BufferMgr()
58 |
59 | def send_result(type, lm_score=0, rect_center_x=0, rect_center_y=0, rect_size=0, rotation=0, lms=0, lms_world=0, xyz_ref=0, xyz=0, xyz_zone=0):
60 | # type : 0, 1 or 2
61 | # 0 : pose detection only (detection score < threshold)
62 | # 1 : pose detection + landmark regression
63 | # 2 : landmark regression only (ROI computed from previous landmarks)
64 | result = dict([("type", type), ("lm_score", lm_score), ("rotation", rotation),
65 | ("rect_center_x", rect_center_x), ("rect_center_y", rect_center_y), ("rect_size", rect_size),
66 | ("lms", lms), ('lms_world', lms_world),
67 | ("xyz_ref", xyz_ref), ("xyz", xyz), ("xyz_zone", xyz_zone)])
68 | result_serial = marshal.dumps(result, 2)
69 | buffer = buffer_mgr(len(result_serial))
70 | buffer.getData()[:] = result_serial
71 | ${_TRACE} ("len result:"+str(len(result_serial)))
72 |
73 | node.io['host'].send(buffer)
74 | ${_TRACE} ("Manager sent result to host")
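# On the host side, the buffer is expected to be decoded with the matching call, e.g.
# (sketch): res = marshal.loads(q_manager_out.get().getData()), which gives back the
# dict built above (see BlazeposeDepthaiEdge.py).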
75 |
76 |
77 | def rr2img(rrn_x, rrn_y):
78 | # Convert a point (rrn_x, rrn_y) expressed in normalized rotated rectangle (rrn)
79 | # into (X, Y) expressed in normalized image (sqn)
80 | X = sqn_rr_center_x + sqn_rr_size * ((rrn_x - 0.5) * cos_rot + (0.5 - rrn_y) * sin_rot)
81 | Y = sqn_rr_center_y + sqn_rr_size * ((rrn_y - 0.5) * cos_rot + (rrn_x - 0.5) * sin_rot)
82 | return X, Y
83 |
84 | norm_pad_size = ${_pad_h} / ${_frame_size}
85 |
86 | def is_visible(lm_id):
87 | # Is the landmark lm_id visible?
88 | # Here visibility means inferred visibility from the landmark model
89 | return lms[lm_id*5+3] > ${_visibility_threshold}
90 |
91 | def is_in_image(sqn_x, sqn_y):
92 | # Is the point (sqn_x, sqn_y) included in the useful part of the image (i.e. excluding the padding)?
93 | return norm_pad_size < sqn_y < 1 - norm_pad_size
94 |
95 | # send_new_frame_to_branch defines on which branch new incoming frames are sent
96 | # 1 = pose detection branch
97 | # 2 = landmark branch
98 | send_new_frame_to_branch = 1
99 |
100 | next_roi_lm_idx = 33*5
101 |
102 | cfg_pre_pd = ImageManipConfig()
103 | cfg_pre_pd.setResizeThumbnail(224, 224, 0, 0, 0)
104 |
105 | while True:
106 | if send_new_frame_to_branch == 1: # Routing frame to pd
107 | node.io['pre_pd_manip_cfg'].send(cfg_pre_pd)
108 | ${_TRACE} ("Manager sent thumbnail config to pre_pd manip")
109 | # Wait for pd post processing's result
110 | detection = node.io['from_post_pd_nn'].get().getLayerFp16("result")
111 | ${_TRACE} ("Manager received pd result: "+str(detection))
112 | pd_score, sqn_rr_center_x, sqn_rr_center_y, sqn_scale_x, sqn_scale_y = detection
113 | if pd_score < ${_pd_score_thresh}:
114 | send_result(0)
115 | continue
116 | scale_center_x = sqn_scale_x - sqn_rr_center_x
117 | scale_center_y = sqn_scale_y - sqn_rr_center_y
118 | sqn_rr_size = 2 * ${_rect_transf_scale} * hypot(scale_center_x, scale_center_y)
119 | rotation = 0.5 * pi - atan2(-scale_center_y, scale_center_x)
120 | rotation = rotation - 2 * pi *floor((rotation + pi) / (2 * pi))
121 | ${_IF_XYZ}
122 | filter_x.reset()
123 | filter_y.reset()
124 | ${_IF_XYZ}
125 |
126 |
127 | # Routing frame to lm
128 | sin_rot = sin(rotation) # We will often need these values later
129 | cos_rot = cos(rotation)
130 | # Tell pre_lm_manip how to crop body region
131 | rr = RotatedRect()
132 | rr.center.x = sqn_rr_center_x
133 | rr.center.y = (sqn_rr_center_y * ${_frame_size} - ${_pad_h}) / ${_img_h}
134 | rr.size.width = sqn_rr_size
135 | rr.size.height = sqn_rr_size * ${_frame_size} / ${_img_h}
136 | rr.angle = degrees(rotation)
137 | cfg = ImageManipConfig()
138 | cfg.setCropRotatedRect(rr, True)
139 | cfg.setResize(256, 256)
140 | node.io['pre_lm_manip_cfg'].send(cfg)
141 | ${_TRACE} ("Manager sent config to pre_lm manip")
142 |
143 | # Wait for lm's result
144 | lm_result = node.io['from_lm_nn'].get()
145 | ${_TRACE} ("Manager received result from lm nn")
146 | lm_score = lm_result.getLayerFp16("Identity_1")[0]
147 | if lm_score > ${_lm_score_thresh}:
148 | lms = lm_result.getLayerFp16("Identity")
149 | lms_world = lm_result.getLayerFp16("Identity_4")[:99]
150 |
151 | xyz = 0
152 | xyz_zone = 0
153 | xyz_ref = 0
154 | # Query xyz
155 | ${_IF_XYZ}
156 | # Choosing the reference point: mid hips if hips visible, or mid shoulders otherwise
157 | # xyz_ref codes the reference point, 1 if mid hips, 2 if mid shoulders, 0 if no reference point
158 | if is_visible(right_hip) and is_visible(left_hip):
159 | kp1 = right_hip
160 | kp2 = left_hip
161 | rrn_xyz_ref_x = (lms[5*kp1] + lms[5*kp2]) / 512 # 512 = 256*2 (256 for normalizing, 2 for the mean)
162 | rrn_xyz_ref_y = (lms[5*kp1+1] + lms[5*kp2+1]) / 512
163 | sqn_xyz_ref_x, sqn_xyz_ref_y = rr2img(rrn_xyz_ref_x, rrn_xyz_ref_y)
164 | if is_in_image(sqn_xyz_ref_x, sqn_xyz_ref_y):
165 | xyz_ref = 1
166 | if xyz_ref == 0 and is_visible(right_shoulder) and is_visible(left_shoulder):
167 | kp1 = right_shoulder
168 | kp2 = left_shoulder
169 | rrn_xyz_ref_x = (lms[5*kp1] + lms[5*kp2]) / 512 # 512 = 256*2 (256 for normalizing, 2 for the mean)
170 | rrn_xyz_ref_y = (lms[5*kp1+1] + lms[5*kp2+1]) / 512
171 | sqn_xyz_ref_x, sqn_xyz_ref_y = rr2img(rrn_xyz_ref_x, rrn_xyz_ref_y)
172 | if is_in_image(sqn_xyz_ref_x, sqn_xyz_ref_y):
173 | xyz_ref = 2
174 | if xyz_ref:
175 | cfg = SpatialLocationCalculatorConfig()
176 | conf_data = SpatialLocationCalculatorConfigData()
177 | conf_data.depthThresholds.lowerThreshold = 100
178 | conf_data.depthThresholds.upperThreshold = 10000
179 | half_zone_size = max(int(sqn_rr_size * ${_frame_size} / 90), 4)
180 | xc = filter_x.apply(sqn_xyz_ref_x * ${_frame_size} + ${_crop_w})
181 | yc = filter_y.apply(sqn_xyz_ref_y * ${_frame_size} - ${_pad_h})
182 | roi_left = max(0, xc - half_zone_size)
183 | roi_right = min(${_img_w}-1, xc + half_zone_size)
184 | roi_top = max(0, yc - half_zone_size)
185 | roi_bottom = min(${_img_h}-1, yc + half_zone_size)
186 | roi_topleft = Point2f(roi_left, roi_top)
187 | roi_bottomright = Point2f(roi_right, roi_bottom)
188 | conf_data.roi = Rect(roi_topleft, roi_bottomright)
189 | cfg = SpatialLocationCalculatorConfig()
190 | cfg.addROI(conf_data)
191 | node.io['spatial_location_config'].send(cfg)
192 | ${_TRACE} ("Manager sent ROI to spatial_location_config")
193 |
194 | # Wait xyz response
195 | xyz_data = node.io['spatial_data'].get().getSpatialLocations()
196 | ${_TRACE} ("Manager received spatial_location")
197 | coords = xyz_data[0].spatialCoordinates
198 | xyz = [float(coords.x), float(coords.y), float(coords.z)]
199 | roi = xyz_data[0].config.roi
200 | xyz_zone = [int(roi.topLeft().x - ${_crop_w}), int(roi.topLeft().y), int(roi.bottomRight().x - ${_crop_w}), int(roi.bottomRight().y)]
201 | else:
202 | xyz = [0.0] * 3
203 | xyz_zone = [0] * 4
204 | ${_IF_XYZ}
205 |
206 | # Send result to host
207 | send_result(send_new_frame_to_branch, lm_score, sqn_rr_center_x, sqn_rr_center_y, sqn_rr_size, rotation, lms, lms_world, xyz_ref, xyz, xyz_zone)
208 |
209 | if not ${_force_detection}:
210 | send_new_frame_to_branch = 2
211 | # Calculate the ROI for next frame
212 | rrn_rr_center_x = lms[next_roi_lm_idx] / 256
213 | rrn_rr_center_y = lms[next_roi_lm_idx+1] / 256
214 | rrn_scale_x = lms[next_roi_lm_idx+5] / 256
215 | rrn_scale_y = lms[next_roi_lm_idx+6] / 256
216 | sqn_scale_x, sqn_scale_y = rr2img(rrn_scale_x, rrn_scale_y)
217 | sqn_rr_center_x, sqn_rr_center_y = rr2img(rrn_rr_center_x, rrn_rr_center_y)
218 | scale_center_x = sqn_scale_x - sqn_rr_center_x
219 | scale_center_y = sqn_scale_y - sqn_rr_center_y
220 | sqn_rr_size = 2 * ${_rect_transf_scale} * hypot(scale_center_x, scale_center_y)
221 | rotation = 0.5 * pi - atan2(-scale_center_y, scale_center_x)
222 | rotation = rotation - 2 * pi *floor((rotation + pi) / (2 * pi))
223 |
224 | else:
225 | send_result(send_new_frame_to_branch, lm_score)
226 | send_new_frame_to_branch = 1
227 |
--------------------------------------------------------------------------------