├── CMakeLists.txt
├── LICENSE
├── README.md
├── blazepose.h
├── main.cpp
└── resources
    ├── output-full.gif
    ├── output.gif
    └── result.png

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# CMakeLists.txt - blazepose

cmake_minimum_required (VERSION 2.8.1)

set(TARGET_NAME blazepose)        # name of executable file
set(CMAKE_BUILD_TYPE "Release")

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIE")

find_package(InferenceEngine 1.1 REQUIRED)
find_package(OpenCV REQUIRED)
add_definitions(-DUSE_OPENCV)

include_directories( ${InferenceEngine_INCLUDE_DIRS} ${OpenCV_INCLUDE_DIRS} )
link_directories( )

add_executable( ${TARGET_NAME} main.cpp )   # list of source file(s)
set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS}")
target_link_libraries(${TARGET_NAME} ${InferenceEngine_LIBRARIES} ${OpenCV_LIBS} )

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Yasunori Shimura

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# blazepose_openvino
BlazePose model test program for the Intel Distribution of OpenVINO toolkit

**WIP**

The BlazePose OpenVINO IR models can be obtained from the [PINTO model zoo](https://github.com/PINTO0309/PINTO_model_zoo).

### Example (Upper body model)
![Example](./resources/output.gif)

### Example (Full body model)
![Example-full](./resources/output-full.gif)

## Configuration / Options
The current program doesn't support command-line options. To change the input media file or the device used for inference, modify the source code; these configuration items are at the top of `main.cpp`:

```c++
#define FULL_POSE
#include "blazepose.h"

#define IMAGE_INPUT (1)
#define VIDEO_INPUT (2)
#define CAM_INPUT   (3)

#ifndef FULL_POSE
const std::string MODEL_POSE_DET = "../pose_detection/128x128/FP32/pose_detection";
const std::string MODEL_LM_DET   = "../pose_landmark_upper_body/256x256/FP32/pose_landmark_upper_body";
#else
const std::string MODEL_POSE_DET = "../pose_detection_full/128x128/FP16/pose_detection_full";
const std::string MODEL_LM_DET   = "../pose_landmark_full_body/256x256/FP16/pose_landmark_full_body";
#endif

// INPUT_TYPE = { IMAGE_INPUT | VIDEO_INPUT | CAM_INPUT }
#define INPUT_TYPE VIDEO_INPUT
const std::string INPUT_FILE = "../boy.mp4";   /* Image or movie file */

// 'output.mp4' will be generated when this macro is defined and the input source is either VIDEO_INPUT or CAM_INPUT
#define VIDEO_OUTPUT
#define VIDEO_SIZE (400)   /* output video size = (VIDEO_SIZE, VIDEO_SIZE) */

// Device to use for inferencing. Possible options = "CPU", "GPU", "MYRIAD", "HDDL", "HETERO:FPGA,CPU", ...
const std::string DEVICE_PD = "CPU";
const std::string DEVICE_LM = "CPU";

// ** Define or undefine to control the items to display
//#define RENDER_ROI
#define RENDER_TIME
#define RENDER_POINTS
```
--------------------------------------------------------------------------------
/blazepose.h:
--------------------------------------------------------------------------------
#pragma once
/* ------------------------------------------------ *
 * The MIT License (MIT)
 * Copyright (c) 2020 terryky1220@gmail.com
 * ------------------------------------------------ */
#ifndef TFLITE_BLAZEPOSE_H_
#define TFLITE_BLAZEPOSE_H_

#ifdef __cplusplus
extern "C" {
#endif

#define MAX_POSE_NUM 100

#ifndef FULL_POSE
#define POSE_JOINT_NUM 25
#else
#define POSE_JOINT_NUM 33
#endif

#ifndef FULL_POSE
enum pose_detect_key_id {
    kMidHipCenter = 0,      // 0
    kFullBodySizeRot,       // 1
    kMidShoulderCenter,     // 2
    kUpperBodySizeRot,      // 3

    kPoseDetectKeyNum
};
#else
enum pose_detect_key_id {
    kMidHipCenter = 0,      // 0
    kFullBodySizeRot,       // 1

    kPoseDetectKeyNum
};
#endif

typedef struct _fvec2
{
    float x, y;
} fvec2;

typedef struct _fvec3
{
    float x, y, z;
} fvec3;

typedef struct _detect_region_t
{
    float score;
    fvec2 topleft;
    fvec2 btmright;
    fvec2 keys[kPoseDetectKeyNum];

    float rotation;
    fvec2 roi_center;
    fvec2 roi_size;
    fvec2 roi_coord[4];
} detect_region_t;

typedef struct _pose_detect_result_t
{
    int num;
    detect_region_t poses[MAX_POSE_NUM];
} pose_detect_result_t;

typedef struct _pose_landmark_result_t
{
    float score;
    fvec3 joint[POSE_JOINT_NUM];
} pose_landmark_result_t;


typedef struct _blazepose_config_t
{
    float score_thresh;
    float iou_thresh;
} blazepose_config_t;

int init_tflite_blazepose(int use_quantized_tflite, blazepose_config_t* config);

void* get_pose_detect_input_buf(int* w, int* h);
int   invoke_pose_detect(pose_detect_result_t* detect_result, blazepose_config_t* config);

void* get_pose_landmark_input_buf(int* w, int* h);
int   invoke_pose_landmark(pose_landmark_result_t* pose_landmark_result);

#ifdef __cplusplus
}

#include <vector>   // for the std::vector members below

struct Anchor
{
    float x_center, y_center, w, h;
};

typedef struct SsdAnchorsCalculatorOptions
{
    // Size of input images.
    int input_size_width;    // [required]
    int input_size_height;   // [required]

    // Min and max scales for generating anchor boxes on feature maps.
    float min_scale;   // [required]
    float max_scale;   // [required]

    // The offset for the center of anchors. The value is in the scale of stride.
    // E.g. 0.5 meaning 0.5 * |current_stride| in pixels.
    float anchor_offset_x;   // default = 0.5
    float anchor_offset_y;   // default = 0.5

    // Number of output feature maps to generate the anchors on.
    int num_layers;   // [required]

    // Sizes of output feature maps to create anchors. Either feature_map size or
    // stride should be provided.
    std::vector<int> feature_map_width;
    std::vector<int> feature_map_height;

    // Strides of each output feature map.
    std::vector<int> strides;

    // List of different aspect ratios to generate anchors.
    std::vector<float> aspect_ratios;

    // A boolean to indicate whether the fixed 3 boxes per location is used in the
    // lowest layer.
    bool reduce_boxes_in_lowest_layer;   // default = false

    // An additional anchor is added with this aspect ratio and a scale
    // interpolated between the scale for a layer and the scale for the next layer
    // (1.0 for the last layer). This anchor is not included if this value is 0.
    float interpolated_scale_aspect_ratio;   // default = 1.0

    // Whether to use fixed width and height (e.g. both 1.0f) for each anchor.
    // This option can be used when the predicted anchor width and height are in
    // pixels.
    bool fixed_anchor_size;   // default = false
} SsdAnchorsCalculatorOptions_t;

#endif

#endif /* TFLITE_BLAZEPOSE_H_ */
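Editorial aside: the `pose_detect_key_id` enum above fixes how wide each anchor's row in the detector's regressor output is, which is what `get_bbox_idx()` in `main.cpp` relies on. A minimal, hypothetical check (not part of the repository sources):

```c++
// Illustration only: relates kPoseDetectKeyNum to the regressor blob shape.
#define FULL_POSE            // build the header the same way main.cpp does
#include "blazepose.h"
#include <iostream>

int main() {
    // 4 box values (cx, cy, w, h) + 2 coordinates per detection keypoint
    int row_width = 4 + 2 * kPoseDetectKeyNum;
    std::cout << row_width << std::endl;   // 8 -> matches the [1,896,8] regressor
                                           // blob; an upper-body build gives 12
    return 0;
}
```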
--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
/* ------------------------------------------------ *
 * The MIT License (MIT)
 * Copyright (c) 2020 terryky1220@gmail.com
 * ------------------------------------------------ */

// main.cpp - blazepose for openvino - modified by Yasunori Shimura  twitter: @yassim0710

// Note: This program works with the BlazePose models for OpenVINO from the PINTO model zoo
//       https://github.com/PINTO0309/PINTO_model_zoo

#define _USE_MATH_DEFINES
#include <cmath>

#include <iostream>
#include <string>
#include <vector>
#include <list>
#include <chrono>

#include <inference_engine.hpp>
#include <opencv2/opencv.hpp>

#define FULL_POSE
#include "blazepose.h"

#define IMAGE_INPUT (1)
#define VIDEO_INPUT (2)
#define CAM_INPUT   (3)

#ifndef FULL_POSE
const std::string MODEL_POSE_DET = "../pose_detection/128x128/FP32/pose_detection";
const std::string MODEL_LM_DET   = "../pose_landmark_upper_body/256x256/FP32/pose_landmark_upper_body";
#else
const std::string MODEL_POSE_DET = "../pose_detection_full/128x128/FP16/pose_detection_full";
//Input Blob(s):
//  BlobName:input, Shape:[1, 3, 128, 128], Precision:FP32
//Output Blob(s):
//  BlobName:StatefulPartitionedCall/functional_1/classificators/concat, Shape:[1, 896, 1], Precision:FP32
//  BlobName:StatefulPartitionedCall/functional_1/regressors/concat, Shape:[1, 896, 8], Precision:FP32
const std::string MODEL_LM_DET = "../pose_landmark_full_body/256x256/FP16/pose_landmark_full_body";
//Input Blob(s):
//  BlobName:input, Shape:[1, 3, 256, 256], Precision:FP32
//Output Blob(s):
//  BlobName:StatefulPartitionedCall/functional_1/output_segmentation/BiasAdd/Add, Shape:[1, 1, 128, 128], Precision:FP32
//  BlobName:StatefulPartitionedCall/functional_1/tf_op_layer_Sigmoid/Sigmoid, Shape:[1, 1, 1, 1], Precision:FP32
//  BlobName:StatefulPartitionedCall/functional_1/tf_op_layer_ld_3d/ld_3d, Shape:[1, 156], Precision:FP32
#endif

// INPUT_TYPE = { IMAGE_INPUT | VIDEO_INPUT | CAM_INPUT }
#define INPUT_TYPE VIDEO_INPUT
const std::string INPUT_FILE = "../boy.mp4";                 /* Image or movie file */
//const std::string INPUT_FILE = "../capoeira.mp4";          /* Image or movie file */
//const std::string INPUT_FILE = "../people-detection.mp4";  /* Image or movie file */

// 'output.mp4' will be generated when this macro is defined and the input source is either VIDEO_INPUT or CAM_INPUT
#define VIDEO_OUTPUT
#define VIDEO_SIZE (400)   /* output video size = (VIDEO_SIZE, VIDEO_SIZE) */

// Device to use for inferencing. Possible options = "CPU", "GPU", "MYRIAD", "HDDL", "HETERO:FPGA,CPU", ...
const std::string DEVICE_PD = "CPU";
const std::string DEVICE_LM = "CPU";

// ** Define or undefine to control the items to display
//#define RENDER_ROI
#define RENDER_TIME
#define RENDER_POINTS



namespace ie = InferenceEngine;


float CalculateScale(float min_scale, float max_scale, int stride_index, int num_strides) {
    if (num_strides == 1)
        return (min_scale + max_scale) * 0.5f;
    else
        return min_scale + (max_scale - min_scale) * 1.0 * stride_index / (num_strides - 1.0f);
}

void GenerateAnchors(std::vector<Anchor>& anchors, const SsdAnchorsCalculatorOptions& options) {
    int layer_id = 0;
    while (layer_id < (int)options.strides.size()) {
        std::vector<float> anchor_height;
        std::vector<float> anchor_width;
        std::vector<float> aspect_ratios;
        std::vector<float> scales;

        // For same strides, we merge the anchors in the same order.
        int last_same_stride_layer = layer_id;
        while (last_same_stride_layer < (int)options.strides.size() &&
               options.strides[last_same_stride_layer] == options.strides[layer_id])
        {
            const float scale =
                CalculateScale(options.min_scale, options.max_scale,
                               last_same_stride_layer, options.strides.size());
            if (last_same_stride_layer == 0 && options.reduce_boxes_in_lowest_layer) {
                // For the first layer, it can be specified to use predefined anchors.
                aspect_ratios.push_back(1.0);
                aspect_ratios.push_back(2.0);
                aspect_ratios.push_back(0.5);
                scales.push_back(0.1);
                scales.push_back(scale);
                scales.push_back(scale);
            }
            else {
                for (int aspect_ratio_id = 0;
                     aspect_ratio_id < (int)options.aspect_ratios.size();
                     ++aspect_ratio_id) {
                    aspect_ratios.push_back(options.aspect_ratios[aspect_ratio_id]);
                    scales.push_back(scale);
                }
                if (options.interpolated_scale_aspect_ratio > 0.0) {
                    const float scale_next =
                        last_same_stride_layer == (int)options.strides.size() - 1
                        ? 1.0f
                        : CalculateScale(options.min_scale, options.max_scale,
                                         last_same_stride_layer + 1,
                                         options.strides.size());
                    scales.push_back(std::sqrt(scale * scale_next));
                    aspect_ratios.push_back(options.interpolated_scale_aspect_ratio);
                }
            }
            last_same_stride_layer++;
        }

        for (int i = 0; i < (int)aspect_ratios.size(); ++i) {
            const float ratio_sqrts = std::sqrt(aspect_ratios[i]);
            anchor_height.push_back(scales[i] / ratio_sqrts);
            anchor_width.push_back(scales[i] * ratio_sqrts);
        }

        int feature_map_height = 0;
        int feature_map_width = 0;
        if (options.feature_map_height.size()) {
            feature_map_height = options.feature_map_height[layer_id];
            feature_map_width = options.feature_map_width[layer_id];
        }
        else {
            const int stride = options.strides[layer_id];
            feature_map_height = std::ceil(1.0f * options.input_size_height / stride);
            feature_map_width = std::ceil(1.0f * options.input_size_width / stride);
        }

        for (int y = 0; y < feature_map_height; ++y) {
            for (int x = 0; x < feature_map_width; ++x) {
                for (int anchor_id = 0; anchor_id < (int)anchor_height.size(); ++anchor_id) {
                    // TODO: Support specifying anchor_offset_x, anchor_offset_y.
                    const float x_center = (x + options.anchor_offset_x) * 1.0f / feature_map_width;
                    const float y_center = (y + options.anchor_offset_y) * 1.0f / feature_map_height;

                    Anchor new_anchor;
                    new_anchor.x_center = x_center;
                    new_anchor.y_center = y_center;

                    if (options.fixed_anchor_size) {
                        new_anchor.w = 1.0f;
                        new_anchor.h = 1.0f;
                    }
                    else {
                        new_anchor.w = anchor_width[anchor_id];
                        new_anchor.h = anchor_height[anchor_id];
                    }
                    anchors.push_back(new_anchor);
                }
            }
        }
        layer_id = last_same_stride_layer;
    }
}
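/* (Editorial sketch, not part of the original program)
 * Sanity-check the anchor count that GenerateAnchors() produces for the
 * pose-detection configuration used below: 128x128 input, strides
 * {8,16,16,16}, one aspect ratio, interpolated_scale_aspect_ratio = 1.0. */
static int anchor_count_sketch() {
    // stride 8 -> ceil(128/8) = 16x16 cells; 1 aspect ratio + 1 interpolated
    // scale = 2 anchors per cell
    const int layer0     = 16 * 16 * 2;        // 512
    // the three stride-16 layers are merged: 8x8 cells x (3 layers x 2 anchors)
    const int layers1to3 = 8 * 8 * (3 * 2);    // 384
    return layer0 + layers1to3;                // 896 = the [1,896,1] score tensor
}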
void create_ssd_anchors(int input_w, int input_h, std::vector<Anchor>& anchors) {
    /*
     *  Anchor parameters are based on:
     *      mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt
     */
    SsdAnchorsCalculatorOptions anchor_options;
    anchor_options.num_layers = 4;
    anchor_options.min_scale = 0.1484375;
    anchor_options.max_scale = 0.75;
    anchor_options.input_size_height = 128;
    anchor_options.input_size_width = 128;
    anchor_options.anchor_offset_x = 0.5f;
    anchor_options.anchor_offset_y = 0.5f;
//    anchor_options.feature_map_width .push_back(0);
//    anchor_options.feature_map_height.push_back(0);
    anchor_options.strides.push_back(8);
    anchor_options.strides.push_back(16);
    anchor_options.strides.push_back(16);
    anchor_options.strides.push_back(16);
    anchor_options.aspect_ratios.push_back(1.0);
    anchor_options.reduce_boxes_in_lowest_layer = false;
    anchor_options.interpolated_scale_aspect_ratio = 1.0;
    anchor_options.fixed_anchor_size = true;

    GenerateAnchors(anchors, anchor_options);
}

int get_bbox_idx(int anchor_idx) {
    /*
     *  Layout of one anchor's regressor row:
     *    cx, cy, width, height
     *    key0_x, key0_y      kMidHipCenter
     *    key1_x, key1_y      kFullBodySizeRot
     *    key2_x, key2_y      kMidShoulderCenter - upper body only
     *    key3_x, key3_y      kUpperBodySizeRot  - upper body only
     */
    int numkey = kPoseDetectKeyNum;   // FullBody:2, UpperBody:4
    int idx = (4 + 2 * numkey) * anchor_idx;

    return idx;
}

int decode_bounds(std::list<detect_region_t>& region_list, float score_thresh, int input_img_w, int input_img_h, float* scores_ptr, float* bboxes_ptr, std::vector<Anchor>& anchors) {
    detect_region_t region;
    int i = 0;
    for (auto& anchor : anchors) {
        float score0 = scores_ptr[i];
        float score = 1.0f / (1.0f + std::exp(-score0));   // sigmoid: the raw output is a logit

        if (score > score_thresh)
        {
            float* p = bboxes_ptr + get_bbox_idx(i);

            /* boundary box */
            float cx = p[0] / input_img_w + anchor.x_center;
            float cy = p[1] / input_img_h + anchor.y_center;
            float w  = p[2] / input_img_w;
            float h  = p[3] / input_img_h;

            fvec2 topleft, btmright;
            topleft.x  = cx - w * 0.5f;
            topleft.y  = cy - h * 0.5f;
            btmright.x = cx + w * 0.5f;
            btmright.y = cy + h * 0.5f;

            region.score    = score;
            region.topleft  = topleft;
            region.btmright = btmright;

            /* landmark positions (2 or 4 keys, depending on FULL_POSE) */
            for (int j = 0; j < kPoseDetectKeyNum; j++)
            {
                float lx = p[4 + (2 * j) + 0];
                float ly = p[4 + (2 * j) + 1];
                lx += anchor.x_center * input_img_w;
                ly += anchor.y_center * input_img_h;
                lx /= (float)input_img_w;
                ly /= (float)input_img_h;

                region.keys[j].x = lx;
                region.keys[j].y = ly;
            }

            region_list.push_back(region);
        }
        i++;
    }
    return 0;
}
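/* (Editorial sketch, not part of the original program)
 * Decode one hypothetical regressor row the way decode_bounds() does: offsets
 * are in detector-input pixels relative to the anchor center, and the result
 * is a box in normalized [0,1] image coordinates. */
static fvec2 decode_one_sketch() {
    const int   input_w = 128, input_h = 128;
    const float anchor_cx = 0.5f, anchor_cy = 0.5f;     // a central anchor
    const float p[4] = { 8.f, -4.f, 32.f, 64.f };       // hypothetical cx,cy,w,h offsets

    float cx = p[0] / input_w + anchor_cx;              // 0.5625
    float cy = p[1] / input_h + anchor_cy;              // 0.46875
    float w  = p[2] / input_w;                          // 0.25
    float h  = p[3] / input_h;                          // 0.5
    fvec2 topleft = { cx - w * 0.5f, cy - h * 0.5f };   // (0.4375, 0.21875)
    return topleft;
}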
/* -------------------------------------------------- *
 *  Apply NonMaxSuppression:
 *      https://github.com/tensorflow/tfjs/blob/master/tfjs-core/src/ops/image_ops.ts
 * -------------------------------------------------- */
float calc_intersection_over_union(detect_region_t& region0, detect_region_t& region1) {
    float sx0 = region0.topleft.x;
    float sy0 = region0.topleft.y;
    float ex0 = region0.btmright.x;
    float ey0 = region0.btmright.y;
    float sx1 = region1.topleft.x;
    float sy1 = region1.topleft.y;
    float ex1 = region1.btmright.x;
    float ey1 = region1.btmright.y;

    float xmin0 = std::min(sx0, ex0);
    float ymin0 = std::min(sy0, ey0);
    float xmax0 = std::max(sx0, ex0);
    float ymax0 = std::max(sy0, ey0);
    float xmin1 = std::min(sx1, ex1);
    float ymin1 = std::min(sy1, ey1);
    float xmax1 = std::max(sx1, ex1);
    float ymax1 = std::max(sy1, ey1);

    float area0 = (ymax0 - ymin0) * (xmax0 - xmin0);
    float area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
    if (area0 <= 0 || area1 <= 0)
        return 0.0f;

    float intersect_xmin = std::max(xmin0, xmin1);
    float intersect_ymin = std::max(ymin0, ymin1);
    float intersect_xmax = std::min(xmax0, xmax1);
    float intersect_ymax = std::min(ymax0, ymax1);

    float intersect_area = std::max(intersect_ymax - intersect_ymin, 0.0f) *
                           std::max(intersect_xmax - intersect_xmin, 0.0f);

    return intersect_area / (area0 + area1 - intersect_area);
}
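/* (Editorial sketch, not part of the original program)
 * calc_intersection_over_union() on two concrete boxes: A=(0.0,0.0)-(0.4,0.4)
 * and B=(0.2,0.0)-(0.6,0.4) overlap by half a width: intersection 0.08,
 * union 0.24, IoU = 1/3, which non_max_suppression() below would suppress
 * at iou_thresh = 0.3. */
static float iou_sketch() {
    detect_region_t a = {}, b = {};
    a.topleft = { 0.0f, 0.0f };  a.btmright = { 0.4f, 0.4f };
    b.topleft = { 0.2f, 0.0f };  b.btmright = { 0.6f, 0.4f };
    return calc_intersection_over_union(a, b);   // 0.333...
}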
int non_max_suppression(std::list<detect_region_t>& region_list, std::list<detect_region_t>& region_nms_list, float iou_thresh) {
    region_list.sort([](detect_region_t& v1, detect_region_t& v2) { return v1.score > v2.score; });

    for (auto itr = region_list.begin(); itr != region_list.end(); itr++)
    {
        detect_region_t region_candidate = *itr;

        bool ignore_candidate = false;
        for (auto itr_nms = region_nms_list.rbegin(); itr_nms != region_nms_list.rend(); itr_nms++)
        {
            detect_region_t region_nms = *itr_nms;

            float iou = calc_intersection_over_union(region_candidate, region_nms);
            if (iou >= iou_thresh)
            {
                ignore_candidate = true;
                break;
            }
        }

        if (!ignore_candidate)
        {
            region_nms_list.push_back(region_candidate);
            if (region_nms_list.size() >= MAX_POSE_NUM)
                break;
        }
    }
    return 0;
}

float normalize_radians(float angle)
{
    // wrap to [-pi, pi)
    return angle - 2 * M_PI * std::floor((angle - (-M_PI)) / (2 * M_PI));
}

void compute_rotation(detect_region_t& region) {
#ifndef FULL_POSE
    float x0 = region.keys[kMidHipCenter].x;
    float y0 = region.keys[kMidHipCenter].y;
    float x1 = region.keys[kMidShoulderCenter].x;
    float y1 = region.keys[kMidShoulderCenter].y;
#else
    // The full-body detector outputs only two keypoints, so the box center
    // stands in for the mid-shoulder point.
    float x0 = region.keys[kMidHipCenter].x;
    float y0 = region.keys[kMidHipCenter].y;
    float x1 = (region.topleft.x + region.btmright.x) * 0.5f;
    float y1 = (region.topleft.y + region.btmright.y) * 0.5f;
#endif
    float target_angle = M_PI * 0.5f;
    float rotation = target_angle - std::atan2(-(y1 - y0), x1 - x0);

    region.rotation = normalize_radians(rotation);
}

void rot_vec(fvec2& vec, float rotation) {
    float sx = vec.x;
    float sy = vec.y;
    vec.x = sx * std::cos(rotation) - sy * std::sin(rotation);
    vec.y = sx * std::sin(rotation) + sy * std::cos(rotation);
}
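/* (Editorial sketch, not part of the original program)
 * compute_rotation() for an upright person: image y grows downward, so a body
 * axis pointing straight up gives atan2(0.3, 0) = pi/2 and a final rotation
 * of zero. */
static float rotation_sketch() {
    float x0 = 0.5f, y0 = 0.8f;   // hypothetical mid-hip point
    float x1 = 0.5f, y1 = 0.5f;   // point above it (shoulders / box center)
    float target_angle = M_PI * 0.5f;
    return normalize_radians(target_angle - std::atan2(-(y1 - y0), x1 - x0));   // 0.0
}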
void compute_detect_to_roi(detect_region_t& region, const ie::SizeVector& dims) {
    int input_img_w = dims[3];
    int input_img_h = dims[2];
#ifndef FULL_POSE
    float x_center = region.keys[kMidShoulderCenter].x * input_img_w;
    float y_center = region.keys[kMidShoulderCenter].y * input_img_h;
    float x_scale  = region.keys[kUpperBodySizeRot ].x * input_img_w;
    float y_scale  = region.keys[kUpperBodySizeRot ].y * input_img_h;
#else
    float x_center = region.keys[kMidHipCenter   ].x * input_img_w;
    float y_center = region.keys[kMidHipCenter   ].y * input_img_h;
    float x_scale  = region.keys[kFullBodySizeRot].x * input_img_w;
    float y_scale  = region.keys[kFullBodySizeRot].y * input_img_h;
#endif
    // Bounding box size as double distance from center to scale point.
    float box_size = std::sqrt((x_scale - x_center) * (x_scale - x_center) +
                               (y_scale - y_center) * (y_scale - y_center)) * 2.0;

    /* RectTransformationCalculator::TransformNormalizedRect() */
    float width    = box_size;
    float height   = box_size;
    float rotation = region.rotation;
    float shift_x  = 0.0f;
    float shift_y  = 0.0f;
    float roi_cx;
    float roi_cy;

    if (rotation == 0.0f) {
        roi_cx = x_center + (width  * shift_x);
        roi_cy = y_center + (height * shift_y);
    } else {
        float dx = (width  * shift_x) * std::cos(rotation) - (height * shift_y) * std::sin(rotation);
        float dy = (width  * shift_x) * std::sin(rotation) + (height * shift_y) * std::cos(rotation);
        roi_cx = x_center + dx;
        roi_cy = y_center + dy;
    }

    /*
     *  Calculate ROI width and height.
     *  The scale parameter is based on:
     *      "mediapipe/modules/pose_landmark/pose_detection_to_roi.pbtxt"
     */
    float scale_x = 1.5f;
    float scale_y = 1.5f;
    float long_side = std::max(width, height);
    float roi_w = long_side * scale_x;
    float roi_h = long_side * scale_y;

    region.roi_center.x = roi_cx / (float)input_img_w;
    region.roi_center.y = roi_cy / (float)input_img_h;
    region.roi_size.x   = roi_w  / (float)input_img_w;
    region.roi_size.y   = roi_h  / (float)input_img_h;

    /* calculate ROI coordinates */
    float dx = roi_w * 0.5f;
    float dy = roi_h * 0.5f;
    region.roi_coord[0].x = -dx;  region.roi_coord[0].y = -dy;
    region.roi_coord[1].x = +dx;  region.roi_coord[1].y = -dy;
    region.roi_coord[2].x = +dx;  region.roi_coord[2].y = +dy;
    region.roi_coord[3].x = -dx;  region.roi_coord[3].y = +dy;

    for (int i = 0; i < 4; i++)
    {
        rot_vec(region.roi_coord[i], rotation);
        region.roi_coord[i].x += roi_cx;
        region.roi_coord[i].y += roi_cy;

        region.roi_coord[i].x /= (float)input_img_w;   // normalize back to [0,1]
        region.roi_coord[i].y /= (float)input_img_h;
    }
}
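/* (Editorial sketch, not part of the original program)
 * ROI sizing in compute_detect_to_roi() with concrete numbers: a 40 px
 * center-to-scale-point distance on the 128x128 detector input doubles to an
 * 80 px box, and the 1.5x scale from pose_detection_to_roi.pbtxt grows it to
 * a 120 px square ROI before normalization. */
static float roi_size_sketch() {
    float dist     = 40.0f;            // hypothetical keypoint distance in pixels
    float box_size = dist * 2.0f;      // 80 px
    return box_size * 1.5f;            // 120 px
}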
static void pack_detect_result(std::vector<detect_region_t>& detect_results, std::list<detect_region_t>& region_list, const ie::SizeVector& dims) {
    for (auto& region : region_list) {
        compute_rotation(region);
        compute_detect_to_roi(region, dims);
        detect_results.push_back(region);
    }
}


void dumpPose(detect_region_t& pose) {
    std::cout << "Score :" << pose.score << ", topleft: (" << pose.topleft.x << ", " << pose.topleft.y << ") btmright(" << pose.btmright.x << ", " << pose.btmright.y << ")" << std::endl;
    std::cout << "keys: ";
    for (size_t i = 0; i < kPoseDetectKeyNum; i++) {
        std::cout << "(" << pose.keys[i].x << ", " << pose.keys[i].y << ") ";
    }
    std::cout << std::endl;
    std::cout << "rotation(" << pose.rotation << "), roi_center(" << pose.roi_center.x << ", " << pose.roi_center.y << "), roi_size(" << pose.roi_size.x << ", " << pose.roi_size.y << ")" << std::endl;
    for (size_t i = 0; i < 4; i++) {
        std::cout << "roi_coord[" << i << "]=(" << pose.roi_coord[i].x << ", " << pose.roi_coord[i].y << ") ";
    }
    std::cout << std::endl;
}


void renderROI(cv::Mat& img, const fvec2(&roi_coord)[4]) {
    size_t w = img.size().width;
    size_t h = img.size().height;
    std::vector<cv::Point> p;
    for (size_t i = 0; i < 4; i++) {
        cv::Point pt = cv::Point(roi_coord[i].x * w, roi_coord[i].y * h);
        p.push_back(pt);
        cv::putText(img, std::to_string(i), pt, cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(255, 255, 255), 1);
    }
    cv::polylines(img, p, true, cv::Scalar(255, 0, 0), 2, cv::LINE_AA);
}



void renderPose(cv::Mat& image_ocv, std::vector<detect_region_t>& detect_results, pose_landmark_result_t* landmarks) {
    const std::vector<cv::Scalar> colors = {
        cv::Scalar(255,   0,   0), cv::Scalar(255,  85,   0), cv::Scalar(255, 170,   0),
        cv::Scalar(255, 255,   0), cv::Scalar(170, 255,   0), cv::Scalar( 85, 255,   0),
        cv::Scalar(  0, 255,   0), cv::Scalar(  0, 255,  85), cv::Scalar(  0, 255, 170),
        cv::Scalar(  0, 255, 255), cv::Scalar(  0, 170, 255), cv::Scalar(  0,  85, 255),
        cv::Scalar(  0,   0, 255), cv::Scalar( 85,   0, 255), cv::Scalar(170,   0, 255),
        cv::Scalar(255,   0, 255), cv::Scalar(255,   0, 170), cv::Scalar(255,   0,  85)
    };
    const std::vector<std::vector<int>> bones = {
        {  0,  1,  2,  3,  7},            // 0 right eye and ear
        {  0,  4,  5,  6,  8},            // 1 left eye and ear
        {  9, 10},                        // 2 mouth
        { 11, 12, 24, 23, 11},            // 3 body
        { 11, 13, 15, 17, 19, 15, 21},    // 4 right arm
        { 12, 14, 16, 18, 20, 16, 22}     // 5 left arm
#ifdef FULL_POSE
       ,{ 23, 25, 27, 29, 31, 27},        // 6 right leg
        { 24, 26, 28, 30, 32, 28}         // 7 left leg
#endif
    };
    size_t W = image_ocv.size().width;
    size_t H = image_ocv.size().height;
    for (size_t pose_id = 0; pose_id < detect_results.size(); pose_id++) {
        detect_region_t pose = detect_results[pose_id];
        cv::Point2f mat_src[] = { cv::Point2f(0.f, 0.f), cv::Point2f(1.f, 0.f), cv::Point2f(1.f, 1.f) };
        cv::Point2f mat_dst[] = { cv::Point2f(pose.roi_coord[0].x * W, pose.roi_coord[0].y * H),
                                  cv::Point2f(pose.roi_coord[1].x * W, pose.roi_coord[1].y * H),
                                  cv::Point2f(pose.roi_coord[2].x * W, pose.roi_coord[2].y * H) };
        cv::Mat mat = cv::getAffineTransform(mat_src, mat_dst);   // 2x3 unit-square -> ROI
        mat.resize(3, cv::Scalar(0.));                            // extend to 3x3
        mat.at<double>(2, 2) = 1.;                                // homogeneous form

        // Apply the affine transform to project the joint points onto the original ROI in the input image
        double px, py, flag;
        std::vector<std::pair<cv::Point2f, float>> pts;
        for (size_t i = 0; i < POSE_JOINT_NUM; i++) {
            cv::Mat pt = (cv::Mat_<double>(3, 1) << landmarks[pose_id].joint[i].x, landmarks[pose_id].joint[i].y, 1.);   // 3x1 column vector
            pt = mat * pt;   // affine transform (0,0)-(1,1) => ROI
            px = pt.at<double>(0, 0);
            py = pt.at<double>(1, 0);
            flag = landmarks[pose_id].joint[i].z;
            pts.push_back(std::pair<cv::Point2f, float>(cv::Point2f(px, py), flag));
        }
        // render bones
        size_t bone_idx = 0;
        for (auto& bone : bones) {
            int prev_idx = -1;
            for (auto& idx : bone) {
                if (prev_idx == -1) {
                    prev_idx = idx;
                    continue;
                }
                cv::Point2f pt1 = pts[prev_idx].first;
                cv::Point2f pt2 = pts[     idx].first;
                float flag1 = pts[prev_idx].second;   // z values; not yet used for culling
                float flag2 = pts[     idx].second;
                cv::Scalar color = colors[bone_idx++ % colors.size()];
                cv::line(image_ocv, pt1, pt2, color, 2);
                prev_idx = idx;
            }
        }
#ifdef RENDER_POINTS
        for (auto& pt : pts) {
            cv::circle(image_ocv, pt.first, 4, cv::Scalar(255, 0, 0), -1);
        }
#endif
    }
}

void renderTime(cv::Mat& img, double time_pd, double time_lm, double time_ttl) {
    cv::putText(img, cv::format("PD : %6.2fms", time_pd  / 1000.f), cv::Point(0, 10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(255, 0, 0), 1);
    cv::putText(img, cv::format("LM : %6.2fms", time_lm  / 1000.f), cv::Point(0, 30), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(255, 0, 0), 1);
    cv::putText(img, cv::format("TTL: %6.2fms", time_ttl / 1000.f), cv::Point(0, 50), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(255, 0, 0), 1);
}
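/* (Editorial sketch, not part of the original program)
 * The unit-square -> ROI affine used in renderPose(): a landmark at
 * normalized (0.5, 0.5) must land at the center of the ROI. Here the ROI is
 * a hypothetical axis-aligned box (100,50)-(300,250). */
static cv::Point2f affine_sketch() {
    cv::Point2f src[] = { cv::Point2f(0.f, 0.f), cv::Point2f(1.f, 0.f), cv::Point2f(1.f, 1.f) };
    cv::Point2f dst[] = { cv::Point2f(100.f, 50.f), cv::Point2f(300.f, 50.f), cv::Point2f(300.f, 250.f) };
    cv::Mat mat = cv::getAffineTransform(src, dst);            // 2x3, CV_64F
    cv::Mat pt  = (cv::Mat_<double>(3, 1) << 0.5, 0.5, 1.0);   // homogeneous input point
    cv::Mat out = mat * pt;                                    // 2x1 result
    return cv::Point2f(out.at<double>(0, 0), out.at<double>(1, 0));   // (200, 150)
}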
int main(int argc, char *argv[]) {

    std::chrono::system_clock::time_point start, end, start_ttl, end_ttl;
    std::chrono::microseconds time_pd{ 0 }, time_lm{ 0 }, time_ttl{ 0 };

    ie::Core ie;

    // Initialization for the pose detection model
    ie::CNNNetwork net_pd;
    net_pd = ie.ReadNetwork(MODEL_POSE_DET + ".xml", MODEL_POSE_DET + ".bin");
    net_pd.setBatchSize(1);
    std::string input_name_pd = "input";   // [1,3,128,128]
    std::shared_ptr<ie::InputInfo> input_info_pd = net_pd.getInputsInfo()[input_name_pd];
    const ie::SizeVector idims_pd = input_info_pd->getTensorDesc().getDims();   // 0,1,2,3 = N,C,H,W
    input_info_pd->setLayout(ie::Layout::NHWC);
    input_info_pd->setPrecision(ie::Precision::FP32);

    std::vector<std::string> oname_pd = { "StatefulPartitionedCall/functional_1/classificators/concat",   // [1,896,1] scores
                                          "StatefulPartitionedCall/functional_1/regressors/concat" };     // [1,896,8] bboxes (FULL_POSE; [1,896,12] for upper body)
    for (auto& oname : oname_pd) {
        ie::DataPtr output_info = net_pd.getOutputsInfo()[oname];
        output_info->setPrecision(ie::Precision::FP32);
    }

    ie::ExecutableNetwork exenet_pd = ie.LoadNetwork(net_pd, DEVICE_PD);
    ie::InferRequest ireq_pd = exenet_pd.CreateInferRequest();



    // Initialization for the landmark model
    ie::CNNNetwork net_lm;
    net_lm = ie.ReadNetwork(MODEL_LM_DET + ".xml", MODEL_LM_DET + ".bin");
    net_lm.setBatchSize(1);
    std::string input_name_lm = "input";   // [1,3,256,256]
    std::shared_ptr<ie::InputInfo> input_info_lm = net_lm.getInputsInfo()[input_name_lm];
    const ie::SizeVector idims_lm = input_info_lm->getTensorDesc().getDims();   // 0,1,2,3 = N,C,H,W
    input_info_lm->setLayout(ie::Layout::NHWC);
    input_info_lm->setPrecision(ie::Precision::FP32);

    std::vector<std::string> oname_lm = { "StatefulPartitionedCall/functional_1/output_segmentation/BiasAdd/Add",   // [1, 1, 128, 128] segmentation map
                                          "StatefulPartitionedCall/functional_1/tf_op_layer_Sigmoid/Sigmoid",       // [1, 1, 1, 1]     landmark flag (score)
                                          "StatefulPartitionedCall/functional_1/tf_op_layer_ld_3d/ld_3d" };         // [1, 156]         landmarks (full body; [1,124] for upper body)
    for (auto& oname : oname_lm) {
        ie::DataPtr output_info = net_lm.getOutputsInfo()[oname];
        output_info->setPrecision(ie::Precision::FP32);
    }

    ie::ExecutableNetwork exenet_lm = ie.LoadNetwork(net_lm, DEVICE_LM);
    ie::InferRequest ireq_lm = exenet_lm.CreateInferRequest();

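    // (Editorial note, not in the original source) The blob names above are
    // hard-coded for the PINTO model zoo IRs. If a differently converted IR is
    // used, the actual names can be enumerated with the same API used above:
    //
    //   for (auto& item : net_pd.getInputsInfo())  std::cout << item.first << std::endl;
    //   for (auto& item : net_pd.getOutputsInfo()) std::cout << item.first << std::endl;
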
    std::vector<Anchor> anchors;
    create_ssd_anchors(idims_pd[3], idims_pd[2], anchors);

#if INPUT_TYPE == VIDEO_INPUT
    cv::VideoCapture cap(INPUT_FILE);
#elif INPUT_TYPE == CAM_INPUT
    cv::VideoCapture cap(0);
#endif

#if INPUT_TYPE == VIDEO_INPUT || INPUT_TYPE == CAM_INPUT
#ifdef VIDEO_OUTPUT
    cv::VideoWriter writer("output.mp4", cv::VideoWriter::fourcc('m', 'p', '4', 'v'), 30.0, cv::Size(VIDEO_SIZE, VIDEO_SIZE));
#endif
#endif


    int key = -1;
    while (key != 27) {   // ESC key to quit

        // read an image and trim it down to a square image
        const size_t _N = 0, _C = 1, _H = 2, _W = 3;
        cv::Mat image_ocv, image_pd, image_lm, image_f;
        cv::Mat image_org;
#if INPUT_TYPE == IMAGE_INPUT
        image_org = cv::imread(INPUT_FILE);
#elif INPUT_TYPE == VIDEO_INPUT || INPUT_TYPE == CAM_INPUT
        bool flg = cap.read(image_org);
        if (flg == false) break;   // end of the movie file
#endif
        start_ttl = std::chrono::system_clock::now();

        // center-crop the input image to a square, preserving the aspect ratio
        size_t iw = image_org.size().width;
        size_t ih = image_org.size().height;
        if (iw > ih) {
            image_ocv = image_org(cv::Rect(iw / 2 - ih / 2, 0, ih, ih));
        }
        else if (iw < ih) {
            image_ocv = image_org(cv::Rect(0, ih / 2 - iw / 2, iw, iw));
        }
        else {
            image_ocv = image_org;   // already square
        }

        // pose (ROI) detection
        cv::resize(image_ocv, image_pd, cv::Size(idims_pd[_W], idims_pd[_H]));
        image_pd.convertTo(image_f, CV_32F, (1.f / 255.f), 0.f);      // convert to FP32 and pre-process (scale to [0,1])
        //image_pd.convertTo(image_f, CV_32F, (1.f / 127.5f), -1.f);  // alternative pre-processing (scale and mean subtract, [-1,1])
        ie::TensorDesc tDesc(ie::Precision::FP32,
                             { 1, static_cast<size_t>(image_f.channels()), static_cast<size_t>(image_f.size().height), static_cast<size_t>(image_f.size().width) },
                             ie::Layout::NHWC);
        ireq_pd.SetBlob(input_name_pd, ie::make_shared_blob<float>(tDesc, (float*)(image_f.data)));

        start = std::chrono::system_clock::now();
        ireq_pd.Infer();   // pose detection
        end = std::chrono::system_clock::now();
        time_pd = std::chrono::duration_cast<std::chrono::microseconds>(end - start);

        // BlobName:StatefulPartitionedCall/functional_1/classificators/concat, Shape:[1, 896, 1], Precision:FP32
        // BlobName:StatefulPartitionedCall/functional_1/regressors/concat,     Shape:[1, 896, 8], Precision:FP32
        float* scores = ireq_pd.GetBlob(oname_pd[0])->buffer();
        float* bboxes = ireq_pd.GetBlob(oname_pd[1])->buffer();

        std::list<detect_region_t> region_list, region_nms_list;
        std::vector<detect_region_t> detect_results;
        decode_bounds(region_list, 0.5f, idims_pd[_W], idims_pd[_H], scores, bboxes, anchors);
        non_max_suppression(region_list, region_nms_list, 0.3f);
        pack_detect_result(detect_results, region_nms_list, idims_pd);

        // landmark detection
        cv::Mat outimg = image_ocv.clone();   // clone the input image for rendering
        size_t W = image_ocv.size().width;
        size_t H = image_ocv.size().height;
        pose_landmark_result_t landmarks[MAX_POSE_NUM] = {};
        for (size_t pose_id = 0; pose_id < detect_results.size(); pose_id++) {

            detect_region_t pose = detect_results[pose_id];

            cv::Point2f mat_src[] = { cv::Point2f(pose.roi_coord[0].x * W, pose.roi_coord[0].y * H),
                                      cv::Point2f(pose.roi_coord[1].x * W, pose.roi_coord[1].y * H),
                                      cv::Point2f(pose.roi_coord[2].x * W, pose.roi_coord[2].y * H) };
            cv::Point2f mat_dst[] = { cv::Point2f(0, 0), cv::Point2f(idims_lm[_W], 0), cv::Point2f(idims_lm[_W], idims_lm[_H]) };
            cv::Mat mat = cv::getAffineTransform(mat_src, mat_dst);
            cv::Mat img_affine = cv::Mat::zeros(idims_lm[_H], idims_lm[_W], CV_8UC3);   // (rows, cols)
            cv::warpAffine(image_ocv, img_affine, mat, img_affine.size());   // crop and rotate the ROI by warp affine transform

            img_affine.convertTo(image_f, CV_32F, (1.f / 255.f), 0.f);
            //img_affine.convertTo(image_f, CV_32F, (1.f / 127.5f), -1.f);
            tDesc = ie::TensorDesc(ie::Precision::FP32,
                                   { 1, static_cast<size_t>(image_f.channels()), static_cast<size_t>(image_f.size().height), static_cast<size_t>(image_f.size().width) },
                                   ie::Layout::NHWC);
            ireq_lm.SetBlob(input_name_lm, ie::make_shared_blob<float>(tDesc, (float*)(image_f.data)));

            start = std::chrono::system_clock::now();
            ireq_lm.Infer();   // landmark detection
            end = std::chrono::system_clock::now();
            time_lm = std::chrono::duration_cast<std::chrono::microseconds>(end - start);

            float* output_lm1 = ireq_lm.GetBlob(oname_lm[0])->buffer();   // segmentation map (not used here)
            float* lm_score   = ireq_lm.GetBlob(oname_lm[1])->buffer();   // landmark score (flag)
            float* lm         = ireq_lm.GetBlob(oname_lm[2])->buffer();   // landmarks

            landmarks[pose_id].score = *lm_score;
            for (size_t i = 0; i < POSE_JOINT_NUM; i++) {
                // ld_3d packs 4 values per joint: x, y (in landmark-input pixels), z, visibility
                landmarks[pose_id].joint[i].x = lm[4 * i + 0] / (float)idims_lm[_W];
                landmarks[pose_id].joint[i].y = lm[4 * i + 1] / (float)idims_lm[_H];
                landmarks[pose_id].joint[i].z = lm[4 * i + 2];
            }
#ifdef RENDER_ROI
            renderROI(outimg, pose.roi_coord);
#endif
        }
        renderPose(outimg, detect_results, landmarks);   // render all poses once, after every landmark set is filled in
        end_ttl = std::chrono::system_clock::now();
        time_ttl = std::chrono::duration_cast<std::chrono::microseconds>(end_ttl - start_ttl);

#ifdef RENDER_TIME
        renderTime(outimg, time_pd.count(), time_lm.count(), time_ttl.count());
#endif

#if INPUT_TYPE == IMAGE_INPUT
        cv::imwrite("output.jpg", outimg);
        cv::imshow("output", outimg);
        cv::waitKey(0);
        key = 27;   // force exit
#elif INPUT_TYPE == VIDEO_INPUT || INPUT_TYPE == CAM_INPUT
        cv::imshow("output", outimg);
        key = cv::waitKey(1);
#ifdef VIDEO_OUTPUT
        cv::resize(outimg, outimg, cv::Size(VIDEO_SIZE, VIDEO_SIZE));
        writer << outimg;
#endif
#endif
    }
    return 0;
}

--------------------------------------------------------------------------------
/resources/output-full.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yas-sim/blazepose_openvino/7cdcc10894cf5b01c8018c6b346e0dcc01e42209/resources/output-full.gif

--------------------------------------------------------------------------------
/resources/output.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yas-sim/blazepose_openvino/7cdcc10894cf5b01c8018c6b346e0dcc01e42209/resources/output.gif

--------------------------------------------------------------------------------
/resources/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yas-sim/blazepose_openvino/7cdcc10894cf5b01c8018c6b346e0dcc01e42209/resources/result.png
--------------------------------------------------------------------------------