├── HKD.gif
├── HKPD_NB.ipynb
├── README.md
├── convert_Oicsv_to_tfcsv.py
└── pipeline.jpg

/HKD.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rafik-rahoui/Hand-keypoints-detection/b7158fb5c434233c2b22887cbef9648a7d4f7457/HKD.gif
--------------------------------------------------------------------------------

/HKPD_NB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "%reload_ext autoreload\n",
10 |     "%autoreload 2\n",
11 |     "%matplotlib inline"
12 |    ]
13 |   },
14 |   {
15 |    "cell_type": "code",
16 |    "execution_count": 2,
17 |    "metadata": {},
18 |    "outputs": [],
19 |    "source": [
20 |     "from fastai import *\n",
21 |     "from fastai.vision import *\n",
22 |     "from fastai.vision import image as im\n",
23 |     "import torch.nn as nn\n",
24 |     "from torch.nn.functional import mse_loss\n",
25 |     "import json\n",
26 |     "import re"
27 |    ]
28 |   },
29 |   {
30 |    "cell_type": "code",
31 |    "execution_count": 3,
32 |    "metadata": {},
33 |    "outputs": [],
34 |    "source": [
35 |     "path = 'path_to_your_dataset'  # You need to specify the path to your dataset.\n",
36 |     "                               # You can find many datasets out there; the best would be to annotate your own,\n",
37 |     "                               # but that is very time-consuming. If, like me, you are interested in learning\n",
38 |     "                               # the dynamics at the heart of computer vision, you can pick any of the\n",
39 |     "                               # publicly available datasets. It won't matter."
40 |    ]
41 |   },
42 |   {
43 |    "cell_type": "code",
44 |    "execution_count": 4,
45 |    "metadata": {},
46 |    "outputs": [],
47 |    "source": [
48 |     "# Setting the transformations to perform on your dataset\n",
49 |     "transforms = get_transforms(do_flip=False, max_zoom=1.1, max_warp=0.01, max_rotate=45)\n",
50 |     "# This function gets the hand keypoint coordinates from a JSON file. Very important: in fastai, \"y\" is expected to come before \"x\".\n",
51 |     "def get_y_func(x):\n",
52 |     "    pre, ext = os.path.splitext(x)\n",
53 |     "    img = open_image(x)\n",
54 |     "    coords = []\n",
55 |     "    for k in json.load(open(pre + '.json'))['hand_keypoints']:\n",
56 |     "        coords.append([k[1], k[0]])  # inverting x and y\n",
57 |     "    return torch.tensor(coords, dtype=torch.float)"
58 |    ]
59 |   },
60 |   {
61 |    "cell_type": "code",
62 |    "execution_count": 6,
63 |    "metadata": {},
64 |    "outputs": [],
65 |    "source": [
66 |     "# Setting up your data block\n",
67 |     "data = (PointsItemList.from_folder(path=path, extensions=['.jpg'])\n",
68 |     "        .split_by_folder(train='train', valid='test')  # setting the training and validation folder paths\n",
69 |     "        .label_from_func(get_y_func)  # using get_y_func() to get the coordinates of each image\n",
70 |     "        .transform(transforms, size=224, tfm_y=True, remove_out=False,  # very important: setting remove_out to False\n",
71 |     "                                                                        # prevents discarding coordinates that may fall\n",
72 |     "                                                                        # outside the image after data augmentation\n",
73 |     "                   padding_mode='border', resize_method=ResizeMethod.PAD)\n",
74 |     "        .databunch(bs=8)  # Setting your batch size.\n",
75 |     "        .normalize(imagenet_stats))  # Normalizing the data to help the model converge faster"
76 |    ]
77 |   },
78 |   {
79 |    "cell_type": "code",
80 |    "execution_count": 7,
81 |    "metadata": {},
82 |    "outputs": [],
83 |    "source": [
84 |     "# This module reshapes the output tensor to (in our case) a 21x2 tensor, which corresponds to the 21 hand keypoints\n",
85 |     "class Reshape(nn.Module):\n",
86 |     "    def __init__(self, *args):\n",
87 |     "        super(Reshape, self).__init__()\n",
88 |     "        self.shape = args\n",
89 |     "\n",
90 |     "    def forward(self, x):\n",
91 |     "        return x.view(self.shape)"
92 |    ]
93 |   },
94 |   {
95 |    "cell_type": "code",
96 |    "execution_count": 9,
97 |    "metadata": {},
98 |    "outputs": [],
99 |    "source": [
100 |     "# Removing the classification head and setting a custom head for regression:\n",
101 |     "# a basic flattening and downsizing to get the 21 final coordinate pairs.\n",
102 |     "head_reg = nn.Sequential(\n",
103 |     "    Flatten(),\n",
104 |     "    nn.ReLU(),\n",
105 |     "    nn.Dropout(0.5),\n",
106 |     "    nn.Linear(512*7*7, 256),\n",
107 |     "    nn.ReLU(),\n",
108 |     "    nn.BatchNorm1d(256),\n",
109 |     "    nn.Dropout(0.5),\n",
110 |     "    nn.Linear(256, 42),\n",
111 |     "    Reshape(-1, 21, 2),\n",
112 |     "    nn.Tanh())  # I added the Tanh function to keep the output in the [-1, 1] range to help the model converge faster.\n",
113 |     "                # You can remove it; it won't prevent the model from converging."
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 10,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "# Setting a custom loss function using the MSELoss class. We need to flatten the output tensor\n",
123 |     "# to be able to compare it to the target (hence .view(-1)).\n",
124 |     "class MSELossFlat(nn.MSELoss):\n",
125 |     "    def forward(self, input: Tensor, target: Tensor):\n",
126 |     "        return super().forward(input.view(-1), target.view(-1))\n",
127 |     "\n",
128 |     "mse_loss_flat = MSELossFlat()  # very important: instantiating the class\n",
129 |     "\n",
130 |     "learn = cnn_learner(data, models.resnet34, custom_head=head_reg, loss_func=mse_loss_flat)"
131 |    ]
132 |   }
133 |  ],
134 |  "metadata": {
135 |   "kernelspec": {
136 |    "display_name": "Python 3",
137 |    "language": "python",
138 |    "name": "python3"
139 |   },
140 |   "language_info": {
141 |    "codemirror_mode": {
142 |     "name": "ipython",
143 |     "version": 3
144 |    },
145 |    "file_extension": ".py",
146 |    "mimetype": "text/x-python",
147 |    "name": "python",
148 |    "nbconvert_exporter": "python",
149 |    "pygments_lexer": "ipython3",
150 |    "version": "3.7.3"
151 |   }
152 |  },
153 |  "nbformat": 4,
154 |  "nbformat_minor": 1
155 | }
156 |
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
1 | # Hand-keypoints-detection
2 |
3 |
4 |
17 | ⁃ The image is grabbed by the camera;
18 | ⁃ A first deep learning model detects the hand in the image and estimates the coordinates of the bounding box around it (done by retraining the TensorFlow Object Detection API for hand detection; you could also achieve this by building a custom deep learning model);
19 | ⁃ A second deep learning regression model takes the image inside the box and estimates the coordinates of all hand keypoints (achieved by transfer learning from a ResNet34 with a custom head); a minimal sketch of the full two-stage inference is given below.
20 |
21 |
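22 | Below is a minimal, hypothetical sketch of how the two stages could be chained at inference time. It is an illustration added for clarity, not code from this repository: `detect_hand_box` is a placeholder for whatever hand detector you retrained, `learn` is the fastai learner built in `HKPD_NB.ipynb`, and the mapping from the Tanh output back to pixels assumes fastai's (y, x), [-1, 1] point convention and the 224-pixel training size used in the notebook.
23 |
24 | ```python
25 | import numpy as np
26 | import torch
27 | from PIL import Image
28 |
29 |
30 | def detect_hand_box(frame: np.ndarray):
31 |     """Placeholder for stage 1: return (x_min, y_min, x_max, y_max) in pixels."""
32 |     raise NotImplementedError
33 |
34 |
35 | def predict_keypoints(frame: np.ndarray, learn) -> np.ndarray:
36 |     """Run the two-stage pipeline on one RGB uint8 frame; return 21 (x, y) pixel coordinates."""
37 |     # Stage 1: locate the hand and crop it out of the frame.
38 |     x_min, y_min, x_max, y_max = map(int, detect_hand_box(frame))
39 |     crop = frame[y_min:y_max, x_min:x_max]
40 |
41 |     # Stage 2: resize to the training size and normalise with ImageNet statistics, matching the
42 |     # .normalize(imagenet_stats) step used at training time. For simplicity the crop is squashed
43 |     # to 224x224 rather than padded as the training transform did.
44 |     img = Image.fromarray(crop).resize((224, 224))
45 |     t = torch.tensor(np.asarray(img), dtype=torch.float32).permute(2, 0, 1) / 255.0
46 |     mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
47 |     std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
48 |     t = ((t - mean) / std).unsqueeze(0)
49 |
50 |     model = learn.model.eval()
51 |     with torch.no_grad():
52 |         pred = model(t.to(next(model.parameters()).device))[0].cpu().numpy()  # (21, 2) in [-1, 1]
53 |
54 |     # The custom head predicts (y, x) pairs in [-1, 1]; map them back to crop pixels,
55 |     # then offset by the box corner to get coordinates in the original frame.
56 |     h, w = crop.shape[:2]
57 |     ys = (pred[:, 0] + 1) / 2 * h + y_min
58 |     xs = (pred[:, 1] + 1) / 2 * w + x_min
59 |     return np.stack([xs, ys], axis=1)
60 | ```
61 |
62 | Any detector and any resizing strategy can be substituted here; what matters is that the crop fed to the regression model is preprocessed the same way as during training.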