├── .gitignore ├── LICENSE ├── README.md ├── ai_whiteboard.py ├── config.py ├── custom_dataset ├── .gitignore └── generate_data.py ├── fingertips_detector ├── net │ └── network.py └── unified_detector.py ├── gst_cam.py ├── h5_to_trt.py ├── hand_detector ├── detector.py └── yolo │ ├── __pycache__ │ ├── darknet.cpython-36.pyc │ └── darknet.cpython-37.pyc │ ├── darknet.py │ ├── generator.py │ ├── history.py │ ├── predict.py │ ├── preprocess │ ├── __pycache__ │ │ ├── yolo_flag.cpython-36.pyc │ │ └── yolo_flag.cpython-37.pyc │ ├── augmentation.py │ ├── labelgen.py │ └── yolo_flag.py │ ├── train.py │ └── utils │ ├── info.py │ └── utils.py ├── images ├── 1.jpg ├── 10.jpg ├── 2.jpg ├── 3.jpg ├── 4.jpg ├── 5.jpg ├── 6.jpg ├── 7.jpg ├── 8.jpg ├── 9.jpg ├── ai_whiteboard.gif ├── to_clean.jpg ├── to_erase.jpg ├── to_move.jpg ├── to_paint.jpg └── to_save.jpg ├── metrics.py ├── requirements.txt ├── saved └── .gitignore ├── trt_utils.py ├── weights ├── .gitignore └── engines │ └── .gitignore ├── yolo_test.py └── yolo_train.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.h5 3 | *.pb 4 | *.engine 5 | *.onnx 6 | *.pyc 7 | *.jpg 8 | !images/*.jpg 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 PRESTE 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AI_whiteboard 2 | 3 | ![](images/ai_whiteboard.gif) 4 | 5 | ## Idea 6 | 7 | The idea of this project is to transform any wall or surface into an interactive whiteboard just with an ordinary RGB camera and your hand. 8 | I hope you'll find it interesting ! 9 | 10 | ## Hardware 11 | 12 | - Jetson Xavier NX **JetPack 4.4** 13 | - Raspberry Pi Camera + ArduCam (8MP IMX219 Sensor Module) 14 | 15 | **Note:** The system works also on Jetson Nano, TX2 16 | 17 | ## Details 18 | 19 | To use AI whiteboard correctly you need to find a wall or flat surface and place a camera at a distance of about 1 meter. It can be any wall/surface but the system works more accurately with the dark or light monotonous walls/surfaces. 20 | We capture an image from a camera. Then we crop this image into a square. 
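A minimal sketch of that square crop, assuming an ordinary OpenCV capture (variable names are illustrative; `ai_whiteboard.py` uses the same center-crop arithmetic):

```python
import cv2

cap = cv2.VideoCapture(0)              # any ordinary RGB camera
ret, frame = cap.read()
if ret:
    h, w = frame.shape[:2]
    # keep the central h x h square of the frame
    x_start = w // 2 - h // 2
    x_end = w // 2 + h // 2
    square = frame[:, x_start:x_end, :]
cap.release()
```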
Next, we use **a hand detector [1] (YOLO [3], a deep neural network)** to find a hand in the image. If there is a hand in the image, we crop the hand region out of the image and feed it to **a fingertip detector [1] (a modified VGG16 deep neural network)**. Finally, if fingertips are detected, we use their coordinates to control the whiteboard (see the control section below).
21 | 
22 | ## Launch AI Whiteboard
23 | 
24 | ##### 1. Set up your NVIDIA Jetson device or use your PC.
25 | - [Jetson Xavier NX](https://developer.nvidia.com/embedded/learn/get-started-jetson-xavier-nx-devkit) with [JetPack 4.4](https://developer.nvidia.com/jetpack-sdk-44-archive) (CUDA 10.2, TensorRT 7.1.3, cuDNN 8.0)
26 | - [Install Tensorflow 1.15.3](https://docs.nvidia.com/deeplearning/frameworks/install-tf-jetson-platform/index.html)
27 | 
28 | ##### 2. Download the AI Whiteboard project: `$ git clone https://github.com/preste-ai/camera_ai_whiteboard.git`
29 | 
30 | ##### 3. Install packages
31 | 
32 | You can install the needed packages via pip using the `requirements.txt` file:
33 | 
34 | ```python
35 | pip3 install -r requirements.txt
36 | ```
37 | 
38 | ##### 4. Download the [weights or TensorRT engines](https://drive.google.com/drive/folders/1eDBqbZfoY7XJ3fYv8FEMJ5AZe_3n0sjU?usp=sharing) and put them in `weights` or `weights/engines`.
39 | 
40 | **Note:** The provided TensorRT engines work correctly **only** on Jetson Xavier NX devices, because TensorRT runs device-specific profiling during the optimization phase. If you want to use these models (engines) on other Jetson devices, please convert the .h5 models with the `h5_to_trt.py` script on your platform.
41 | 
42 | ##### 5. Set the power mode (ID=2, 15W, 6 cores): `$ sudo /usr/sbin/nvpmodel -m 2`
43 | 
44 | ##### 6. Turn on the device's fan: `$ sudo jetson_clocks --fan`
45 | 
46 | ##### 7. Run the AI whiteboard script.
47 | 
48 | Check the `config.py` file and set the needed parameters:
49 | - whiteboard_w : 200 - whiteboard width (px), displayed on the captured camera image
50 | - whiteboard_h : 200 - whiteboard height (px), displayed on the captured camera image
51 | - cam_w : 320 - width (px) of a captured image
52 | - cam_h : 240 - height (px) of a captured image
53 | - framerate : 60 - camera capture framerate (for the Raspberry Pi Camera)
54 | - zoom_koef : 2 - zoom coefficient used to resize whiteboard_w and whiteboard_h
55 | - confidence_ft_threshold : 0.5 - confidence threshold of the Fingertips detector
56 | - confidence_hd_threshold : 0.8 - confidence threshold of the Hand detector
57 | 
58 | ---
59 | Run from the project root directory:
60 | 
61 | **Jetson devices**
62 | ```python
63 | python3 ai_whiteboard.py --rpc --trt
64 | ```
65 | - rpc : use a Raspberry Pi Camera. Default: False
66 | - trt : use TensorRT engines. Default: False
67 | 
68 | **Laptop**
69 | ```python
70 | python3 ai_whiteboard.py
71 | ```
72 | 
73 | ###### Control gestures (combinations)
74 | | To draw | To move | To erase | To clean | To save |
75 | |:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|
76 | |![](images/to_paint.jpg)|![](images/to_move.jpg)|![](images/to_erase.jpg)|![](images/to_clean.jpg)|![](images/to_save.jpg)|
77 | 
78 | 
79 | ## Training the Hand detector
80 | 
81 | #### Custom Dataset
82 | 
83 | A [custom dataset](https://drive.google.com/drive/folders/1rFHtl6A4EKokuOQk-9vqvWV0WiKyRfco?usp=sharing) of 12,000 images was collected and labeled for training. For labeling, I used [CVAT](https://github.com/openvinotoolkit/cvat).
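The label files consumed by the training code (`hand_detector/yolo/preprocess/labelgen.py`) are plain CSVs produced by `custom_dataset/generate_data.py`: one row per frame with the hand bounding box normalized by the frame size, and an all-zero box for frames that contain no hand. The rows below only illustrate the expected format (the values are made up):

```
filename,xmin,ymin,xmax,ymax
v_1_frame_418,0.41,0.22,0.58,0.47
v_15_frame_102,0.0,0.0,0.0,0.0
```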
84 | 
85 | - Train: 9,500 images
86 | - Validation: 1,000 images
87 | - Test: 1,500 images
88 | 
89 | | | | | | |
90 | |:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|
91 | |![](images/1.jpg)|![](images/2.jpg)|![](images/3.jpg)|![](images/4.jpg)|![](images/5.jpg)|
92 | |![](images/6.jpg)|![](images/7.jpg)|![](images/8.jpg)|![](images/9.jpg)|![](images/10.jpg)|
93 | 
94 | 
95 | Run from the project root directory:
96 | 
97 | ```python
98 | python3 yolo_train.py
99 | ```
100 | 
101 | 
102 | ## Testing the Hand detector
103 | 
104 | Run from the project root directory:
105 | 
106 | ```python
107 | python3 yolo_test.py
108 | ```
109 | 
110 | ## Convert a .h5 model to a TensorRT engine [2]
111 | 
112 | The conversion takes place in 3 stages:
113 | 1. Freeze the graph and remove training nodes (.h5 -> .pb)
114 | 2. Convert the frozen graph to ONNX (.pb -> .onnx)
115 | 3. Convert the ONNX model to a TensorRT engine (.onnx -> .engine)
116 | 
117 | Run from the project root directory:
118 | 
119 | ```python
120 | python3 h5_to_trt.py --folder weights --weights_file yolo --fp 16
121 | ```
122 | 
123 | - folder : path to the folder with the .h5 model (here: `weights`)
124 | - weights_file : weights file name (**without the .h5 extension**)
125 | - fp : TensorRT engine precision (16 or 32)
126 | 
127 | Metrics for **hand detection** after model conversion.
128 | 
129 | To decide whether a detection is correct, we use the [IoU](https://medium.com/towards-artificial-intelligence/understanding-iou-metric-in-object-detection-1e5532f06a76) value: if the IoU is greater than 0.5, the detection is counted as correct; otherwise it is not. The results are given below.
130 | 
131 | 
132 | | | Keras model before training | Keras model after training | TensorRT engine (fp32) | TensorRT engine (fp16) |
133 | |:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|
134 | | Accuracy | 72.68 % | 89.14 % | 89.14 % | 89.07 % |
135 | | Precision | 84.80 % | 99.45 % | 99.45 % | 99.45 % |
136 | | Recall | 50.78 % | 77.24 % | 77.24 % | 77.10 % |
137 | 
138 | 
139 | ## Solution performance (Hand detector + Fingertips detector)
140 | 
141 | Captured image shape: 320x240
142 | Jetson Xavier NX, power mode ID 2 (15W, 6 cores)
143 | 
144 | | | Keras model | TensorRT engine (fp32) | TensorRT engine (fp16) |
145 | |:---------------:|:---------------:|:---------------:|:---------------:|
146 | | Average FPS | 12 | 33 | 60 |
147 | 
148 | 
149 | 
150 | ## References
151 | 1. Unified Gesture and Fingertip Detection: https://github.com/MahmudulAlam/Unified-Gesture-and-Fingertip-Detection
152 | 2. TensorRT guide: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
153 | 3. 
YOLO9000: Better, Faster, Stronger : https://arxiv.org/abs/1612.08242 154 | -------------------------------------------------------------------------------- /ai_whiteboard.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import time 4 | import copy 5 | import argparse 6 | 7 | import tensorflow as tf 8 | config = tf.compat.v1.ConfigProto() 9 | config.gpu_options.allow_growth = True 10 | session = tf.compat.v1.Session(config=config) 11 | 12 | from fingertips_detector.unified_detector import Fingertips 13 | from hand_detector.detector import YOLO 14 | from config import config 15 | from gst_cam import gstreamer_pipeline 16 | 17 | 18 | class AIWhiteboard(): 19 | """AI Whiteboard""" 20 | def __init__(self, args): 21 | """ 22 | Initialization of AI Whiteboard class 23 | 24 | args.trt :boolean : if True - use TensorRT engines for inference 25 | args.raspberry_pi_camera :boolean : if True - capture images from Raspberry Pi Camera 26 | """ 27 | 28 | super(AIWhiteboard, self).__init__() 29 | self.confidence_ft_threshold = config['confidence_ft_threshold'] 30 | self.confidence_hd_threshold = config['confidence_hd_threshold'] 31 | self.colors = [(15, 15, 240), 32 | (15, 240, 155), 33 | (240, 155, 15), 34 | (240, 15, 155), 35 | (240, 15, 240)] 36 | 37 | # init models 38 | self.hand_detector = YOLO(weights='weights/trained_yolo.h5', 39 | trt_engine = 'weights/engines/model_trained_yolo.fp16.engine', 40 | threshold=self.confidence_hd_threshold, 41 | trt = args.trt) 42 | 43 | self.fingertips_detector = Fingertips(weights='weights/classes8.h5', 44 | trt_engine = 'weights/engines/model_classes8.fp16.engine', 45 | trt = args.trt) 46 | if args.raspberry_pi_camera: 47 | self.cam = cv2.VideoCapture(gstreamer_pipeline(capture_width=config['cam_w'], 48 | capture_height=config['cam_h'], 49 | display_width=config['cam_w'], 50 | display_height=config['cam_h'], 51 | framerate=config['framerate']), 52 | cv2.CAP_GSTREAMER) 53 | else: 54 | self.cam = cv2.VideoCapture(0) 55 | self.cam.set(cv2.CAP_PROP_FRAME_WIDTH, config['cam_w']) 56 | self.cam.set(cv2.CAP_PROP_FRAME_HEIGHT, config['cam_h']) 57 | 58 | 59 | origin_w = int(self.cam.get(cv2.CAP_PROP_FRAME_WIDTH)) 60 | origin_h = int(self.cam.get(cv2.CAP_PROP_FRAME_HEIGHT)) 61 | 62 | # cropped coordinates (to get a square image) 63 | self.cropped_x_st = int(origin_w/2) - int(origin_h/2) 64 | self.cropped_x_end = int(origin_w/2) + int(origin_h/2) 65 | 66 | # whiteboard_tl - top left corner of whiteboard on cropped image 67 | # whiteboard_br - bottom right corner of whiteboard on cropped image 68 | self.whiteboard_tl = (int((self.cropped_x_end-self.cropped_x_st-config['whiteboard_w'])/2), int((origin_h-config['whiteboard_h'])/2)) 69 | self.whiteboard_br = (int((self.cropped_x_end-self.cropped_x_st+config['whiteboard_w'])/2), int((origin_h+config['whiteboard_h'])/2)) 70 | 71 | # Create a whiteboard 72 | self.whiteboard = np.zeros((config['zoom_koef']*config['whiteboard_h'], 73 | config['zoom_koef']*config['whiteboard_w'], 74 | 3), np.uint8) + 255 75 | # Create a info whiteboard for demonstration 76 | self.info_whiteboard = copy.deepcopy(self.whiteboard) 77 | 78 | 79 | def draw(self, prob, pos): 80 | """ 81 | Draw detected fingers on whiteboard 82 | 83 | prob :numpy array : array of confidance score of each finger according to Fingertips detector 84 | pos :numpy array : array of relative fingers position on whiteboard according to Fingertips detector 85 | """ 86 | 87 | # whiteboard shape 88 | width = 
config['whiteboard_w'] * config['zoom_koef'] 89 | height = config['whiteboard_h'] * config['zoom_koef'] 90 | 91 | # number of detected fingers 92 | n_fingers = int(np.sum(prob)) 93 | 94 | # one finger detected : INDEX | action: paint 95 | if n_fingers == 1 and prob[1] == 1.0: 96 | center = (int(pos[2]*width), int(pos[3]*height) ) 97 | cv2.circle(self.whiteboard, center, radius=5, color=(0,0,0), thickness=-1) 98 | 99 | self.info_whiteboard = copy.deepcopy(self.whiteboard) 100 | cv2.circle(self.info_whiteboard, center, radius=5, color=(0,20,200), thickness=2) 101 | 102 | # two fingers detected: THUMB + INDEX | action: show pointer 103 | elif n_fingers == 2 and prob[1] == 1.0 and prob[0] == 1.0: 104 | center = (int(pos[2]*width), int(pos[3]*height) ) 105 | 106 | self.info_whiteboard = copy.deepcopy(self.whiteboard) 107 | cv2.circle(self.info_whiteboard, center, radius=5, color=(255,0,0), thickness=2) 108 | 109 | # five fingers detected | action: erase 110 | elif n_fingers == 5 : 111 | center = (int(pos[2]*width), int(pos[3]*height) ) 112 | cv2.circle(self.whiteboard, center, radius=10, color=(255,255,255), thickness=-1) 113 | 114 | self.info_whiteboard = copy.deepcopy(self.whiteboard) 115 | cv2.circle(self.info_whiteboard, center, radius=12, color=(0,255,0), thickness=2) 116 | 117 | # two fingers detected: THUMB + PINKY | action: clean whiteboard 118 | elif n_fingers == 2 and prob[0] == 1.0 and prob[4] == 1.0: 119 | self.whiteboard = np.zeros((height,width,3), np.uint8) + 255 120 | self.info_whiteboard = copy.deepcopy(self.whiteboard) 121 | 122 | # three fingers detected: THUMB + MIDDLE + RING | action: save whiteboard 123 | elif n_fingers == 3 and prob[1] == 1.0 and prob[2] == 1.0 and prob[3] == 1.0: 124 | cv2.imwrite('saved/whiteboard.jpg', self.whiteboard) 125 | print('-- whiteboard.jpg saved! 
') 126 | self.info_whiteboard = copy.deepcopy(self.whiteboard) 127 | 128 | # three fingers detected: THUMB + INDEX + PINKY | action: exit 129 | # elif n_fingers == 3 and prob[0] == 1.0 and prob[1] == 1.0 and prob[4] == 1.0: 130 | # info_whiteboard = copy.deepcopy(whiteboard) 131 | # k = 1 132 | # print('=== EXIT ===') 133 | else: 134 | self.info_whiteboard = copy.deepcopy(self.whiteboard) 135 | 136 | 137 | def run(self): 138 | """ 139 | Run AI Whiteboard 140 | """ 141 | try: 142 | while True: 143 | ret, image = self.cam.read() 144 | image = image[:,self.cropped_x_st:self.cropped_x_end,:] 145 | 146 | if ret is False: 147 | break 148 | 149 | start = time.time() 150 | 151 | # hand detection 152 | # tl - top left corner of hand bbox on cropped image 153 | # br - bottom right corner of hand bbox on cropped image 154 | tl, br = self.hand_detector.detect(image=image) 155 | if tl and br is not None and br[0] - tl[0] >= 5 and br[1] - tl[1] >= 5: 156 | cropped_hand = image[tl[1]:br[1], tl[0]: br[0]] 157 | height_hand, width_hand, _ = cropped_hand.shape 158 | 159 | # gesture classification and fingertips regression 160 | prob, pos = self.fingertips_detector.classify(image=cropped_hand) 161 | pos = np.mean(pos, 0) 162 | 163 | # post-processing: absolute fingers position on an image 164 | prob = np.asarray([(p >= self.confidence_ft_threshold) * 1.0 for p in prob]) 165 | for i in range(0, len(pos), 2): 166 | pos[i] = pos[i] * width_hand + tl[0] 167 | pos[i + 1] = pos[i + 1] * height_hand + tl[1] 168 | 169 | # post-processing: relative fingers position on a whiteboard 170 | relative_pos = [] 171 | for i in range(0, len(pos), 2): 172 | tmp_x = max(-5, pos[i] - self.whiteboard_tl[0])/config['whiteboard_w'] 173 | tmp_y = max(-5, pos[i+1] - self.whiteboard_tl[1])/config['whiteboard_h'] 174 | relative_pos.append(tmp_x) 175 | relative_pos.append(tmp_y) 176 | relative_pos = np.array(relative_pos) 177 | # draw on whiteboard 178 | self.draw(prob, relative_pos) 179 | 180 | # drawing fingertips 181 | index = 0 182 | for c, p in enumerate(prob): 183 | if p >= self.confidence_ft_threshold: 184 | image = cv2.circle(image, (int(pos[index]), int(pos[index + 1])), radius=5, 185 | color=self.colors[c], thickness=-2) 186 | index += 2 187 | 188 | k = cv2.waitKey(1) 189 | if k==27: # Esc key to stop 190 | break 191 | 192 | end = time.time() 193 | 194 | str_fps = '{:.1f} fps'.format(1/(end-start)) 195 | # print(str_fps) 196 | cv2.putText(image, str_fps,(15,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5,(0,255,0),2,cv2.LINE_AA) 197 | image = cv2.rectangle(image, (self.whiteboard_tl[0], self.whiteboard_tl[1]), (self.whiteboard_br[0], self.whiteboard_br[1]), (255, 255, 255), 2) 198 | 199 | # display image 200 | cv2.imshow('Fingertips', cv2.resize(image, (config['zoom_koef']*config['whiteboard_h'],config['zoom_koef']*config['whiteboard_w']))) 201 | # display whiteboard 202 | cv2.imshow('AI_whiteboard', self.info_whiteboard) 203 | 204 | 205 | self.cam.release() 206 | cv2.destroyAllWindows() 207 | 208 | except Exception as e: 209 | self.cam.release() 210 | cv2.destroyAllWindows() 211 | print("Error: {}".format(e)) 212 | exit(1) 213 | 214 | 215 | def parse_args(): 216 | """ Parse input arguments """ 217 | parser = argparse.ArgumentParser(description='Whiteboard arguments') 218 | 219 | parser.add_argument('--rpc', dest='raspberry_pi_camera', action='store_true', help='Run AI whiteboard with Raspberry Pi Camera') 220 | parser.set_defaults(raspberry_pi_camera=False) 221 | parser.add_argument('--trt', dest='trt', action='store_true', help='Use 
TensoRT engine') 222 | parser.set_defaults(trt=False) 223 | 224 | return parser.parse_args() 225 | 226 | if __name__ == "__main__": 227 | args = parse_args() 228 | ai_w = AIWhiteboard(args) 229 | ai_w.run() 230 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | 'whiteboard_w': 200, # 640 # 1100 3 | 'whiteboard_h': 200, # 480 # 620 4 | 'cam_w' : 320, 5 | 'cam_h' : 240, 6 | 'framerate' : 60, 7 | 'zoom_koef' : 2, # zoom koef 8 | 'confidence_ft_threshold' : 0.5, # confidence threshold of Fingertips detector 9 | 'confidence_hd_threshold' : 0.8, # confidence threshold of Hand detector 10 | } 11 | -------------------------------------------------------------------------------- /custom_dataset/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | !generate_data.py 6 | -------------------------------------------------------------------------------- /custom_dataset/generate_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pandas as pd 4 | import xml.etree.ElementTree as ET 5 | import numpy as np 6 | import random 7 | import cv2 8 | 9 | # Function to extract list of frames 10 | def FrameCapture(path_to_video, video_name, train_frame_indices, valid_frame_indices, video_format = '.mp4'): 11 | 12 | # Path to video file 13 | print('Process : ', path_to_video+'/'+video_name + video_format) 14 | vidObj = cv2.VideoCapture(path_to_video+'/'+video_name + video_format) 15 | # Used as counter variable 16 | count = 0 17 | # checks whether frames were extracted 18 | success = 1 19 | max_nb_frame = max(max(train_frame_indices), max(valid_frame_indices)) 20 | while success: 21 | # vidObj object calls read 22 | # function extract frames 23 | success, image = vidObj.read() 24 | # Saves the frames with frame-count 25 | if count in train_frame_indices: 26 | print('to train') 27 | cv2.imwrite( 'train/v_{}_frame_{}.jpg'.format(video_name, count), image) 28 | 29 | elif count in valid_frame_indices: 30 | print('to valid') 31 | cv2.imwrite( 'valid/v_{}_frame_{}.jpg'.format(video_name, count), image) 32 | 33 | if count > max_nb_frame: 34 | break 35 | count += 1 36 | 37 | 38 | def xml_to_csv(video_name, total_train_labels, total_valid_labels, valid_prob): 39 | train_frame_indices, valid_frame_indices = [],[] 40 | if video_name == '15': 41 | for xml_file in glob.glob('annotations' + '/' + video_name + '/Annotations/*.xml'): 42 | tree = ET.parse(xml_file) 43 | root = tree.getroot() 44 | 45 | for member in root.findall('object'): 46 | frame_index = int(root.find('filename').text[6:]) 47 | value = ('v_{}_frame_{}'.format(video_name, frame_index), 48 | 0, 49 | 0, 50 | 0, 51 | 0 52 | ) 53 | 54 | if random.random() < valid_prob: 55 | total_valid_labels.append(value) 56 | valid_frame_indices.append(frame_index) 57 | else: 58 | total_train_labels.append(value) 59 | train_frame_indices.append(frame_index) 60 | else: 61 | for xml_file in glob.glob('annotations' + '/' + video_name + '/Annotations/*.xml'): 62 | tree = ET.parse(xml_file) 63 | root = tree.getroot() 64 | 65 | for member in root.findall('object'): 66 | frame_index = int(root.find('filename').text[6:]) 67 | value = ('v_{}_frame_{}'.format(video_name, frame_index), 68 | float(member[2][0].text)/int(root.find('size')[0].text), 69 
| float(member[2][1].text)/int(root.find('size')[1].text), 70 | float(member[2][2].text)/int(root.find('size')[0].text), 71 | float(member[2][3].text)/int(root.find('size')[1].text) 72 | ) 73 | 74 | if random.random() < valid_prob: 75 | total_valid_labels.append(value) 76 | valid_frame_indices.append(frame_index) 77 | else: 78 | total_train_labels.append(value) 79 | train_frame_indices.append(frame_index) 80 | 81 | FrameCapture('videos', video_name, train_frame_indices, valid_frame_indices) 82 | 83 | return total_train_labels, total_valid_labels 84 | 85 | 86 | # Function to extract list of frames 87 | def FrameCapture_t(path_to_video, video_name, test_frame_indices, video_format = '.mp4'): 88 | 89 | # Path to video file 90 | print('Process : ', path_to_video+'/'+video_name + video_format) 91 | vidObj = cv2.VideoCapture(path_to_video+'/'+video_name + video_format) 92 | # Used as counter variable 93 | count = 0 94 | # checks whether frames were extracted 95 | success = 1 96 | max_nb_frame = max(test_frame_indices) 97 | while success: 98 | # vidObj object calls read 99 | # function extract frames 100 | success, image = vidObj.read() 101 | # Saves the frames with frame-count 102 | if count in test_frame_indices: 103 | print('to test') 104 | cv2.imwrite( 'test/v_{}_frame_{}.jpg'.format(video_name, count), image) 105 | 106 | if count > max_nb_frame: 107 | break 108 | count += 1 109 | 110 | 111 | def xml_to_csv_t(video_name, total_test_labels): 112 | test_frame_indices = [] 113 | if video_name == 'test_3': 114 | for xml_file in glob.glob('annotations' + '/' + video_name + '/Annotations/*.xml'): 115 | tree = ET.parse(xml_file) 116 | root = tree.getroot() 117 | 118 | for member in root.findall('object'): 119 | frame_index = int(root.find('filename').text[6:]) 120 | value = ('v_{}_frame_{}'.format(video_name, frame_index), 121 | 0, 122 | 0, 123 | 0, 124 | 0 125 | ) 126 | 127 | 128 | total_test_labels.append(value) 129 | test_frame_indices.append(frame_index) 130 | 131 | else: 132 | for xml_file in glob.glob('annotations' + '/' + video_name + '/Annotations/*.xml'): 133 | tree = ET.parse(xml_file) 134 | root = tree.getroot() 135 | 136 | for member in root.findall('object'): 137 | frame_index = int(root.find('filename').text[6:]) 138 | value = ('v_{}_frame_{}'.format(video_name, frame_index), 139 | float(member[2][0].text)/int(root.find('size')[0].text), 140 | float(member[2][1].text)/int(root.find('size')[1].text), 141 | float(member[2][2].text)/int(root.find('size')[0].text), 142 | float(member[2][3].text)/int(root.find('size')[1].text) 143 | ) 144 | 145 | 146 | total_test_labels.append(value) 147 | test_frame_indices.append(frame_index) 148 | 149 | 150 | FrameCapture_t('videos', video_name, test_frame_indices) 151 | 152 | return total_test_labels 153 | 154 | 155 | 156 | if __name__ == '__main__': 157 | # TRAIN + VALIDATION DATASETs 158 | n_videos = 14 159 | video_names = ['{}.mp4'.format(i) for i in range(1,n_videos+1)] 160 | train_folder = '/train' 161 | valid_folder = '/valid' 162 | total_train_labels, total_valid_labels = [],[] 163 | valid_prob = 0.1 164 | for n in range(1,n_videos + 1): 165 | total_train_labels, total_valid_labels = xml_to_csv('{}'.format(n), total_train_labels, total_valid_labels, valid_prob) 166 | 167 | column_name = ['filename', 'xmin', 'ymin', 'xmax', 'ymax'] 168 | train_df = pd.DataFrame(total_train_labels, columns=column_name) 169 | valid_df = pd.DataFrame(total_valid_labels, columns=column_name) 170 | 171 | train_df.to_csv('train_labels.csv', index=None) 172 | 
valid_df.to_csv('valid_labels.csv', index=None) 173 | 174 | # # TEST DATASET 175 | # n_videos = 3 176 | # video_names = ['test_{}'.format(i) for i in range(1,n_videos+1)] 177 | # test_folder = '/test' 178 | # total_test_labels = [] 179 | # for video_name in video_names: 180 | # total_test_labels = xml_to_csv_t(video_name, total_test_labels) 181 | 182 | # column_name = ['filename', 'xmin', 'ymin', 'xmax', 'ymax'] 183 | # test_df = pd.DataFrame(total_test_labels, columns=column_name) 184 | # test_df.to_csv('test_labels.csv', index=None) 185 | 186 | -------------------------------------------------------------------------------- /fingertips_detector/net/network.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras import Model 2 | from tensorflow.keras.layers import Conv2D, Flatten, Dense, Dropout, Reshape, UpSampling2D, Activation 3 | from tensorflow.keras.applications import VGG16 4 | 5 | 6 | def model(): 7 | model = VGG16(include_top=False, input_shape=(128, 128, 3)) 8 | x = model.output 9 | 10 | y = x 11 | x = Flatten()(x) 12 | x = Dense(1024, activation='relu')(x) 13 | x = Dropout(0.5)(x) 14 | x = Dense(1024, activation='relu')(x) 15 | x = Dropout(0.5)(x) 16 | probability = Dense(5, activation='sigmoid', name='probabilistic_output')(x) 17 | 18 | y = UpSampling2D((3, 3))(y) 19 | y = Activation('relu')(y) 20 | y = Conv2D(1, (3, 3), activation='linear')(y) 21 | position = Reshape(target_shape=(10, 10), name='positional_output')(y) 22 | model = Model(inputs=model.input, outputs=[probability, position]) 23 | return model 24 | 25 | 26 | if __name__ == '__main__': 27 | model = model() 28 | model.summary() 29 | -------------------------------------------------------------------------------- /fingertips_detector/unified_detector.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from tensorflow.keras.models import load_model 4 | from fingertips_detector.net.network import model 5 | from trt_utils import * 6 | 7 | class Fingertips: 8 | def __init__(self, weights, trt_engine, trt = False): 9 | 10 | self.trt = trt 11 | if self.trt: 12 | self.engine = load_engine(trt_engine) 13 | self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.engine) 14 | self.context = self.engine.create_execution_context() 15 | else: 16 | # self.model = load_model(model) 17 | self.model = model() 18 | self.model.load_weights(weights) 19 | @staticmethod 20 | def class_finder(prob): 21 | cls = '' 22 | classes = [0, 1, 2, 3, 4, 5, 6, 7] 23 | 24 | if np.array_equal(prob, np.array([0, 1, 0, 0, 0])): 25 | cls = classes[0] 26 | elif np.array_equal(prob, np.array([0, 1, 1, 0, 0])): 27 | cls = classes[1] 28 | elif np.array_equal(prob, np.array([0, 1, 1, 1, 0])): 29 | cls = classes[2] 30 | elif np.array_equal(prob, np.array([0, 1, 1, 1, 1])): 31 | cls = classes[3] 32 | elif np.array_equal(prob, np.array([1, 1, 1, 1, 1])): 33 | cls = classes[4] 34 | elif np.array_equal(prob, np.array([1, 0, 0, 0, 1])): 35 | cls = classes[5] 36 | elif np.array_equal(prob, np.array([1, 1, 0, 0, 1])): 37 | cls = classes[6] 38 | elif np.array_equal(prob, np.array([1, 1, 0, 0, 0])): 39 | cls = classes[7] 40 | return cls 41 | 42 | def classify(self, image): 43 | image = np.asarray(image) 44 | image = cv2.resize(image, (128, 128)) 45 | image = image.astype('float32') 46 | image = image / 255.0 47 | image = np.expand_dims(image, axis=0) 48 | # TensorRT engine 49 | if self.trt: 50 | 
np.copyto(self.inputs[0].host, image.ravel()) 51 | position, probability = do_inference(self.context, 52 | bindings=self.bindings, 53 | inputs=self.inputs, 54 | outputs=self.outputs, 55 | stream=self.stream) 56 | 57 | position = position.reshape((1,10,10)) 58 | probability = probability.reshape((1,5)) 59 | else: 60 | probability, position = self.model.predict(image) 61 | 62 | probability = probability[0] 63 | position = position[0] 64 | return probability, position 65 | -------------------------------------------------------------------------------- /gst_cam.py: -------------------------------------------------------------------------------- 1 | 2 | # MIT License 3 | # Copyright (c) 2019 JetsonHacks 4 | # See license 5 | # Using a CSI camera (such as the Raspberry Pi Version 2) connected to a 6 | # NVIDIA Jetson Nano Developer Kit using OpenCV 7 | # Drivers for the camera and OpenCV are included in the base image 8 | 9 | import cv2 10 | 11 | # gstreamer_pipeline returns a GStreamer pipeline for capturing from the CSI camera 12 | # Defaults to 1280x720 @ 60fps 13 | # Flip the image by setting the flip_method (most common values: 0 and 2) 14 | # display_width and display_height determine the size of the window on the screen 15 | 16 | 17 | def gstreamer_pipeline( 18 | capture_width=1280, 19 | capture_height=720, 20 | display_width=1280, 21 | display_height=720, 22 | framerate=60 23 | ): 24 | return ( 25 | "nvarguscamerasrc ! " 26 | "video/x-raw(memory:NVMM), " 27 | "width=(int)%d, height=(int)%d, " 28 | "format=(string)NV12, framerate=(fraction)%d/1 ! " 29 | "nvvidconv ! " 30 | "video/x-raw, width=(int)%d, height=(int)%d, format=(string)BGRx ! " 31 | "videoconvert ! " 32 | "video/x-raw, format=(string)BGR ! appsink" 33 | % ( 34 | capture_width, 35 | capture_height, 36 | framerate, 37 | display_width, 38 | display_height, 39 | ) 40 | ) 41 | 42 | 43 | def show_camera(): 44 | # To flip the image, modify the flip_method parameter (0 and 2 are the most common) 45 | print(gstreamer_pipeline()) 46 | cap = cv2.VideoCapture(gstreamer_pipeline(), cv2.CAP_GSTREAMER) 47 | if cap.isOpened(): 48 | window_handle = cv2.namedWindow("CSI Camera", cv2.WINDOW_AUTOSIZE) 49 | # Window 50 | while cv2.getWindowProperty("CSI Camera", 0) >= 0: 51 | ret_val, img = cap.read() 52 | print(img) 53 | cv2.imshow("CSI Camera", img) 54 | # This also acts as 55 | keyCode = cv2.waitKey(30) & 0xFF 56 | # Stop the program on the ESC key 57 | if keyCode == 27: 58 | break 59 | cap.release() 60 | cv2.destroyAllWindows() 61 | else: 62 | print("Unable to open camera") 63 | 64 | 65 | if __name__ == "__main__": 66 | show_camera() 67 | -------------------------------------------------------------------------------- /h5_to_trt.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.models import load_model 3 | from fingertips_detector.net.network import model as fingertips_model 4 | from hand_detector.yolo.darknet import model as yolo_model 5 | from tensorflow.tools.graph_transforms import TransformGraph 6 | import os 7 | import argparse 8 | import tensorrt as trt 9 | import onnx 10 | import onnx.backend as backend 11 | import logging 12 | 13 | 14 | 15 | TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) 16 | logging.basicConfig(level=logging.DEBUG, 17 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 18 | datefmt="%Y-%m-%d %H:%M:%S") 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def freeze_and_optimize_session(session, 
keep_var_names=None, input_names=None, output_names=None, clear_devices=True): 23 | graph = session.graph 24 | with graph.as_default(): 25 | freeze_var_names = list(set(v.op.name for v in tf.compat.v1.global_variables()).difference(keep_var_names or [])) 26 | output_names = output_names or [] 27 | output_names += [v.op.name for v in tf.compat.v1.global_variables()] 28 | input_graph_def = graph.as_graph_def() 29 | if clear_devices: 30 | for node in input_graph_def.node: 31 | node.device = "" 32 | graph = tf.graph_util.remove_training_nodes( 33 | input_graph_def, protected_nodes=output_names) 34 | graph = tf.graph_util.convert_variables_to_constants( 35 | session, graph, output_names, freeze_var_names) 36 | transforms = [ 37 | 'remove_nodes(op=Identity)', 38 | 'merge_duplicate_nodes', 39 | 'strip_unused_nodes', 40 | 'fold_constants(ignore_errors=true)', 41 | 'fold_batch_norms', 42 | ] 43 | graph = TransformGraph( 44 | graph, input_names, output_names, transforms) 45 | return graph 46 | 47 | 48 | def h5_to_pb(folder , model_name): 49 | # freeze Keras session - converts all variables to constants 50 | tf.compat.v1.keras.backend.set_learning_phase(0) 51 | print('Model-path -> ', folder +'/'+ model_name + ".h5") 52 | # model = load_model(folder +'/'+ model_name + ".h5", custom_objects=None) 53 | if 'yolo' in model_name: 54 | model = yolo_model() 55 | elif 'classes' in model_name: 56 | model = fingertips_model() 57 | 58 | model.load_weights(folder +'/'+ model_name + ".h5") 59 | graph_before = tf.compat.v1.keras.backend.get_session().graph 60 | print('input : -> ', [inp.op.name for inp in model.inputs]) 61 | print('output: -> ', [out.op.name for out in model.outputs]) 62 | frozen_graph = freeze_and_optimize_session(tf.compat.v1.keras.backend.get_session(), 63 | input_names=[inp.op.name for inp in model.inputs], 64 | output_names=[out.op.name for out in model.outputs]) 65 | tf.io.write_graph(frozen_graph, 66 | logdir=folder, 67 | as_text=False, 68 | name= model_name + '.pb') 69 | 70 | # To check graph in text editor 71 | ### IF YOU WANT TO USE TENSORBOARD - SAVE AS TEXT IN FORMAT .PBTXT 72 | # tf.io.write_graph(frozen_graph, 73 | # logdir=folder, 74 | # as_text=True, 75 | # name=model_name+'.pbtxt') 76 | 77 | 78 | def pb_to_onnx(folder, model_name): 79 | # pb -> onnx 80 | if 'yolo' in model_name: 81 | os.system("python3 -m tf2onnx.convert --graphdef {}.pb --output {}.onnx --inputs input_1:0 --outputs output/Sigmoid:0 --opset=11 ".format(folder +'/'+ model_name, folder +'/' + model_name)) 82 | 83 | elif 'classes' in model_name: 84 | os.system("python3 -m tf2onnx.convert --graphdef {}.pb --output {}.onnx --inputs input_1:0 --outputs probabilistic_output/Sigmoid:0,positional_output/Reshape:0 --opset=11 ".format(folder +'/'+ model_name, folder +'/' + model_name)) 85 | 86 | 87 | 88 | def network_structure(args): 89 | model_path = args['model'] 90 | with tf.Session() as sess: 91 | tf.global_variables_initializer().run() 92 | output_graph_def = tf.GraphDef() 93 | # Get the default picture 94 | graph = tf.get_default_graph() 95 | with open(model_path, "rb") as f: 96 | output_graph_def.ParseFromString(f.read()) 97 | _ = tf.import_graph_def(output_graph_def, name="") 98 | # Get how many operation nodes in the current graph 99 | print("%d ops in the graph." 
% len(output_graph_def.node)) 100 | op_name = [tensor.name for tensor in output_graph_def.node] 101 | print(op_name) 102 | print('=======================================================') 103 | # Produce log files in the log_graph folder, you can visualize the model in tensorboard 104 | summaryWriter = tf.summary.FileWriter('log_graph_'+args['model'], graph) 105 | cnt = 0 106 | print("%d tensors in the graph." % len(graph.get_operations())) 107 | for tensor in graph.get_operations(): 108 | # print out the name and value of tensor 109 | print(tensor.name, tensor.values()) 110 | cnt += 1 111 | if args['n']: 112 | if cnt == args['n']: 113 | break 114 | 115 | 116 | def add_profiles(config, inputs, opt_profiles): 117 | logger.debug("=== Optimization Profiles ===") 118 | for i, profile in enumerate(opt_profiles): 119 | for inp in inputs: 120 | _min, _opt, _max = profile.get_shape(inp.name) 121 | logger.debug("{} - OptProfile {} - Min {} Opt {} Max {}".format(inp.name, i, _min, _opt, _max)) 122 | config.add_optimization_profile(profile) 123 | 124 | 125 | def create_optimization_profiles(builder, inputs, batch_sizes=[1]): 126 | # Check if all inputs are fixed explicit batch to create a single profile and avoid duplicates 127 | if all([inp.shape[0] > -1 for inp in inputs]): 128 | profile = builder.create_optimization_profile() 129 | for inp in inputs: 130 | fbs, shape = inp.shape[0], inp.shape[1:] 131 | profile.set_shape(inp.name, min=(fbs, *shape), opt=(fbs, *shape), max=(fbs, *shape)) 132 | return [profile] 133 | 134 | # create several profiles 135 | profiles = {} 136 | for bs in batch_sizes: 137 | if not profiles.get(bs): 138 | profiles[bs] = builder.create_optimization_profile() 139 | 140 | for inp in inputs: 141 | shape = inp.shape[1:] 142 | # Check if fixed explicit batch 143 | if inp.shape[0] > -1: 144 | bs = inp.shape[0] 145 | profiles[bs].set_shape(inp.name, min=(bs, *shape), opt=(bs, *shape), max=(bs, *shape)) 146 | 147 | return list(profiles.values()) 148 | 149 | 150 | def onnx_to_trt(folder, model_name, fp = 16): 151 | print('--- fp_{} ---'.format(fp)) 152 | 153 | EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 154 | F = EXPLICIT_BATCH 155 | 156 | NUM_IMAGES_PER_BATCH = 1 157 | 158 | with trt.Builder(TRT_LOGGER) as builder, builder.create_network(F) as network,trt.OnnxParser(network, TRT_LOGGER) as parser, builder.create_builder_config() as config: 159 | 160 | builder.max_batch_size = NUM_IMAGES_PER_BATCH 161 | builder.max_workspace_size = 1 << 30 162 | if fp == 16: 163 | builder.fp16_mode = True 164 | builder.strict_type_constraints = True 165 | 166 | config.max_workspace_size = 1 << 30 167 | if fp == 16: 168 | config.flags |= 1 << int(trt.BuilderFlag.FP16) 169 | 170 | config.flags |= 1 << int(trt.BuilderFlag.STRICT_TYPES) 171 | 172 | with open("./{}/{}.onnx".format(folder, model_name), 'rb') as model: 173 | PARSED = parser.parse(model.read()) 174 | if not PARSED: 175 | for error in range(parser.num_errors): 176 | print(parser.get_error(error)) 177 | else: 178 | for i in network: 179 | print(i.name) 180 | 181 | inputs = [network.get_input(i) for i in range(network.num_inputs)] 182 | #print('inputs => ', inputs) 183 | opt_profiles = create_optimization_profiles(builder, inputs) 184 | add_profiles(config, inputs, opt_profiles) 185 | 186 | engine = builder.build_engine(network, config) 187 | with open('./{}/{}.fp{}.TEST.engine'.format(folder, model_name, fp), "wb") as engine_file: 188 | engine_file.write(engine.serialize()) 189 | return engine 190 | 191 | 192 | 
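# ---------------------------------------------------------------------------
# Illustrative sketch (an addition, not part of the original script): how an
# engine built by onnx_to_trt() is typically consumed elsewhere in this
# repository (see hand_detector/detector.py and
# fingertips_detector/unified_detector.py). The helper name `run_engine_once`
# and its arguments are hypothetical.
def run_engine_once(engine_path, preprocessed_image):
    """Run a single inference with a serialized TensorRT engine.

    preprocessed_image: float32 numpy array already resized and normalized to
    the model input shape, e.g. (1, 224, 224, 3) for the YOLO hand detector.
    The outputs come back flattened and must be reshaped by the caller, as is
    done in detector.py and unified_detector.py.
    """
    import numpy as np
    from trt_utils import load_engine, allocate_buffers, do_inference

    engine = load_engine(engine_path)
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    context = engine.create_execution_context()

    # copy the flattened image into the pre-allocated host input buffer
    np.copyto(inputs[0].host, preprocessed_image.ravel())

    return do_inference(context, bindings=bindings, inputs=inputs,
                        outputs=outputs, stream=stream)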
def parse_args(): 193 | """ Parse input arguments """ 194 | parser = argparse.ArgumentParser(description='H5 to TensorRT converter arguments') 195 | 196 | parser.add_argument('--folder', dest='folder', help='Path to folder with h5 model', type=str, required=True) # default='weights/engines' 197 | parser.add_argument('--weights_file', dest='model_name', help='Model name (without .h5)', type=str, required=True ) # default='model_yolo' 198 | parser.add_argument('--fp', dest='fp', help='TensorRT engine precision', type=int, default=16 ) 199 | return parser.parse_args() 200 | 201 | if __name__ == "__main__": 202 | args = parse_args() 203 | 204 | # args = {'model':'converted/model_classes8.pb', 205 | # 'n' : 200} 206 | # network_structure(args) 207 | 208 | try: 209 | h5_to_pb(args.folder, args.model_name) 210 | except Exception as e: 211 | print('\n\nError: h5_to_pb') 212 | print(e) 213 | 214 | try: 215 | pb_to_onnx(args.folder, args.model_name) 216 | except Exception as e: 217 | print('\n\nError: pb_to_onnx') 218 | print(e) 219 | 220 | try: 221 | onnx_to_trt(args.folder, args.model_name, args.fp) 222 | except Exception as e: 223 | print('\n\nError: onnx_to_trt') 224 | print(e) 225 | -------------------------------------------------------------------------------- /hand_detector/detector.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from hand_detector.yolo.darknet import model as yolo_model 4 | from tensorflow.keras.models import load_model 5 | from hand_detector.yolo.preprocess.yolo_flag import Flag as yoloFlag 6 | from trt_utils import * 7 | 8 | 9 | class YOLO: 10 | def __init__(self, weights, trt_engine, threshold, trt = False): 11 | self.f = yoloFlag() 12 | self.threshold = threshold 13 | self.trt = trt 14 | 15 | if self.trt: 16 | self.engine = load_engine(trt_engine) 17 | self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.engine) 18 | self.context = self.engine.create_execution_context() 19 | else: 20 | # self.model = load_model(model) 21 | self.model = yolo_model() 22 | self.model.load_weights(weights) 23 | 24 | def detect(self, image): 25 | height, width, _ = image.shape 26 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 27 | image = cv2.resize(image, (self.f.target_size, self.f.target_size)) / 255.0 28 | image = np.expand_dims(image, axis=0) 29 | 30 | # TensorRT engine 31 | if self.trt: 32 | np.copyto(self.inputs[0].host, image.ravel()) 33 | yolo_out = np.array([do_inference(self.context, 34 | bindings=self.bindings, 35 | inputs=self.inputs, 36 | outputs=self.outputs, 37 | stream=self.stream) 38 | ]).reshape((1, 7, 7, 5)) 39 | else: 40 | yolo_out = self.model.predict(image) 41 | 42 | yolo_out = yolo_out[0] 43 | grid_pred = yolo_out[:, :, 0] 44 | i, j = np.squeeze(np.where(grid_pred == np.amax(grid_pred))) 45 | 46 | try: 47 | if i.shape[0] > 1 : 48 | i = i[0] 49 | j = j[0] 50 | except: 51 | pass 52 | 53 | if grid_pred[i, j] >= self.threshold: 54 | bbox = yolo_out[i, j, 1:] 55 | x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3] 56 | # size conversion 57 | x1 = int(x1 * width) 58 | y1 = int(y1 * height) 59 | x2 = int(x2 * width) 60 | y2 = int(y2 * height) 61 | return (x1, y1), (x2, y2) 62 | else: 63 | return None, None 64 | -------------------------------------------------------------------------------- /hand_detector/yolo/__pycache__/darknet.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/hand_detector/yolo/__pycache__/darknet.cpython-36.pyc -------------------------------------------------------------------------------- /hand_detector/yolo/__pycache__/darknet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/hand_detector/yolo/__pycache__/darknet.cpython-37.pyc -------------------------------------------------------------------------------- /hand_detector/yolo/darknet.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras import Input, Model 2 | from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Activation 3 | 4 | 5 | def conv_batch_norm_relu(x, n_filters, f, padding='same', activation='relu'): 6 | x = Conv2D(n_filters, f, padding=padding)(x) 7 | x = BatchNormalization()(x) 8 | x = Activation(activation)(x) 9 | return x 10 | 11 | 12 | def model(): 13 | input = Input(shape=(224, 224, 3)) 14 | x = conv_batch_norm_relu(input, 32, (3, 3), padding='same', activation='relu') 15 | x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) 16 | 17 | x = conv_batch_norm_relu(x, 64, (3, 3), padding='same', activation='relu') 18 | x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) 19 | 20 | x = conv_batch_norm_relu(x, 128, (3, 3), padding='same', activation='relu') 21 | x = conv_batch_norm_relu(x, 64, (1, 1), padding='same', activation='relu') 22 | x = conv_batch_norm_relu(x, 128, (3, 3), padding='same', activation='relu') 23 | x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) 24 | 25 | x = conv_batch_norm_relu(x, 256, (3, 3), padding='same', activation='relu') 26 | x = conv_batch_norm_relu(x, 128, (1, 1), padding='same', activation='relu') 27 | x = conv_batch_norm_relu(x, 256, (3, 3), padding='same', activation='relu') 28 | x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) 29 | 30 | x = conv_batch_norm_relu(x, 512, (3, 3), padding='same', activation='relu') 31 | x = conv_batch_norm_relu(x, 256, (1, 1), padding='same', activation='relu') 32 | x = conv_batch_norm_relu(x, 512, (3, 3), padding='same', activation='relu') 33 | x = conv_batch_norm_relu(x, 256, (1, 1), padding='same', activation='relu') 34 | x = conv_batch_norm_relu(x, 512, (3, 3), padding='same', activation='relu') 35 | x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) 36 | 37 | x = conv_batch_norm_relu(x, 1024, (3, 3), padding='same', activation='relu') 38 | x = conv_batch_norm_relu(x, 512, (1, 1), padding='same', activation='relu') 39 | x = conv_batch_norm_relu(x, 1024, (3, 3), padding='same', activation='relu') 40 | x = conv_batch_norm_relu(x, 512, (1, 1), padding='same', activation='relu') 41 | x = conv_batch_norm_relu(x, 1024, (3, 3), padding='same', activation='relu') 42 | x = Conv2D(5, (1, 1), padding='same')(x) 43 | x = BatchNormalization()(x) 44 | x = Activation('sigmoid', name='output')(x) 45 | return Model(inputs=input, outputs=x) 46 | 47 | 48 | if __name__ == '__main__': 49 | model = model() 50 | model.summary() 51 | -------------------------------------------------------------------------------- /hand_detector/yolo/generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | from hand_detector.yolo.utils.utils import visualize 5 | from hand_detector.yolo.preprocess.augmentation import 
augment, flip 6 | from hand_detector.yolo.preprocess.labelgen import label_generator, bbox_to_grid 7 | 8 | 9 | def batch_indices(batch_size, dataset_size): 10 | index_a = list(range(0, dataset_size, batch_size)) 11 | index_b = list(range(batch_size, dataset_size, batch_size)) 12 | index_b.append(dataset_size) 13 | indices = list(zip(index_a, index_b)) 14 | return indices 15 | 16 | 17 | def load_train_images(): 18 | folder_names = ['train'] 19 | train_image_files = [] 20 | dataset_directory = 'custom_dataset/' 21 | for folder in folder_names: 22 | train_image_files += os.listdir(dataset_directory + folder + '/') 23 | return train_image_files 24 | 25 | 26 | def load_valid_images(): 27 | folder_names = ['valid'] 28 | valid_image_files = [] 29 | dataset_directory = 'custom_dataset/' 30 | for folder in folder_names: 31 | valid_image_files += os.listdir(dataset_directory + folder + '/') 32 | return valid_image_files 33 | 34 | 35 | def load_test_images(): 36 | folder_names = ['test'] 37 | test_image_files = [] 38 | dataset_directory = 'custom_dataset/' 39 | for folder in folder_names: 40 | test_image_files += os.listdir(dataset_directory + folder + '/') 41 | return test_image_files 42 | 43 | 44 | def train_generator(batch_size, is_augment=True): 45 | if is_augment: 46 | batch_size = int(batch_size / 8) 47 | 48 | directory = 'custom_dataset/' 49 | train_image_files = load_train_images() 50 | dataset_size = len(train_image_files) 51 | indices = batch_indices(batch_size=batch_size, dataset_size=dataset_size) 52 | print('Training Dataset Size: ', dataset_size) 53 | 54 | while True: 55 | # for i in range(0, 2): 56 | # random.shuffle(train_image_files) 57 | 58 | for index in indices: 59 | x_batch = [] 60 | y_batch = [] 61 | 62 | for n in range(index[0], index[1]): 63 | image_name = train_image_files[n] 64 | image, bbox = label_generator(directory, 'train', image_name) 65 | yolo_out = bbox_to_grid(bbox) 66 | x_batch.append(image) 67 | y_batch.append(yolo_out) 68 | # visualize(image, yolo_out, RGB2BGR=True) 69 | 70 | # augment 71 | image_aug, bbox_aug = augment(image, bbox) 72 | yolo_out = bbox_to_grid(bbox_aug) 73 | x_batch.append(image_aug) 74 | y_batch.append(yolo_out) 75 | # visualize(image_aug, yolo_out, RGB2BGR=True) 76 | 77 | # augment 1 78 | image_aug, bbox_aug = augment(image, bbox) 79 | yolo_out = bbox_to_grid(bbox_aug) 80 | x_batch.append(image_aug) 81 | y_batch.append(yolo_out) 82 | 83 | # augment 2 84 | image_aug, bbox_aug = augment(image, bbox) 85 | yolo_out = bbox_to_grid(bbox_aug) 86 | x_batch.append(image_aug) 87 | y_batch.append(yolo_out) 88 | 89 | # augment 3 90 | image_aug, bbox_aug = augment(image, bbox) 91 | yolo_out = bbox_to_grid(bbox_aug) 92 | x_batch.append(image_aug) 93 | y_batch.append(yolo_out) 94 | 95 | 96 | # horizontal flip 97 | image_flip, bbox_flip = flip(image, bbox) 98 | yolo_out = bbox_to_grid(bbox_flip) 99 | x_batch.append(image_flip) 100 | y_batch.append(yolo_out) 101 | # visualize(image_flip, yolo_out, RGB2BGR=True) 102 | 103 | # horizontal flip + augment 104 | image_flip_aug, bbox_flip_aug = augment(image_flip, bbox_flip) 105 | yolo_out = bbox_to_grid(bbox_flip_aug) 106 | x_batch.append(image_flip_aug) 107 | y_batch.append(yolo_out) 108 | # visualize(image_flip_aug, yolo_out, RGB2BGR=True) 109 | 110 | # horizontal flip + augment 2 111 | image_flip_aug, bbox_flip_aug = augment(image_flip, bbox_flip) 112 | yolo_out = bbox_to_grid(bbox_flip_aug) 113 | x_batch.append(image_flip_aug) 114 | y_batch.append(yolo_out) 115 | 116 | x_batch = np.asarray(x_batch) / 255.0 117 | 
y_batch = np.asarray(y_batch) 118 | yield x_batch, y_batch 119 | 120 | 121 | def valid_generator(batch_size): 122 | directory = 'custom_dataset/' 123 | valid_image_files = load_valid_images() 124 | dataset_size = len(valid_image_files) 125 | indices = batch_indices(batch_size=batch_size, dataset_size=dataset_size) 126 | print('Validation Dataset Size: ', dataset_size) 127 | 128 | while True: 129 | # for i in range(0, 2): 130 | # random.shuffle(valid_image_files) 131 | 132 | for index in indices: 133 | x_batch = [] 134 | y_batch = [] 135 | 136 | for n in range(index[0], index[1]): 137 | image_name = valid_image_files[n] 138 | image, bbox = label_generator(directory, 'valid', image_name) 139 | yolo_out = bbox_to_grid(bbox) 140 | x_batch.append(image) 141 | y_batch.append(yolo_out) 142 | # visualize(image, yolo_out, RGB2BGR=True) 143 | 144 | x_batch = np.asarray(x_batch) / 255.0 145 | y_batch = np.asarray(y_batch) 146 | yield x_batch, y_batch 147 | 148 | 149 | def test_generator(batch_size): 150 | directory = 'custom_dataset/' 151 | test_image_files = load_test_images() 152 | dataset_size = len(test_image_files) 153 | indices = batch_indices(batch_size=batch_size, dataset_size=dataset_size) 154 | print('Test Dataset Size: ', dataset_size) 155 | 156 | while True: 157 | # for i in range(0, 2): 158 | # random.shuffle(valid_image_files) 159 | 160 | for index in indices: 161 | x_batch = [] 162 | y_batch = [] 163 | 164 | for n in range(index[0], index[1]): 165 | image_name = test_image_files[n] 166 | image, bbox = label_generator(directory, 'test', image_name) 167 | yolo_out = bbox_to_grid(bbox) 168 | x_batch.append(image) 169 | y_batch.append(yolo_out) 170 | # visualize(image, yolo_out, RGB2BGR=True) 171 | 172 | x_batch = np.asarray(x_batch) / 255.0 173 | y_batch = np.asarray(y_batch) 174 | yield x_batch, y_batch 175 | 176 | 177 | if __name__ == '__main__': 178 | gen = train_generator(batch_size=100) 179 | # gen = valid_generator(batch_size=100) 180 | batch_x, batch_y = next(gen) 181 | # print(batch_x) 182 | # print(batch_y) 183 | -------------------------------------------------------------------------------- /hand_detector/yolo/history.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | sns.set_style("darkgrid") 7 | 8 | f = open('../../weights/history.txt', 'r') 9 | losses = f.readlines() 10 | f.close() 11 | 12 | train, valid = [], [] 13 | 14 | for loss in losses: 15 | loss = yaml.load(loss) 16 | train = train + loss.get('loss') 17 | valid = valid + loss.get('val_loss') 18 | 19 | 20 | epoch = range(1, len(train) + 1) 21 | 22 | fig1 = plt.figure(1) 23 | plt.plot(epoch, np.log(train), 'C2', marker='X') 24 | plt.plot(epoch, np.log(valid), '--', marker='>') 25 | plt.legend(['Training Total Loss', 'Validation Total Loss'], loc=1, prop={'size': 18}) 26 | plt.xlabel('Epochs', fontsize=20) 27 | plt.ylabel(r'$\mathit{log}_{e}\:(Total \;\: Loss \;\: \mathcal{L})$', fontsize=20) 28 | plt.xticks(fontsize=16) 29 | plt.yticks(fontsize=16) 30 | plt.savefig('loss_curve.jpg') 31 | plt.show() 32 | -------------------------------------------------------------------------------- /hand_detector/yolo/predict.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from hand_detector.yolo.darknet import model 4 | from hand_detector.yolo.preprocess.yolo_flag import Flag 5 | from hand_detector.yolo.utils.utils 
import visualize 6 | 7 | f = Flag() 8 | model = model() 9 | model.load_weights('../../weights/yolo.h5') 10 | image = cv2.imread('../../data/sample.jpg', cv2.COLOR_BGR2RGB) 11 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 12 | image = cv2.resize(image, (f.target_size, f.target_size)) 13 | processed_image = np.expand_dims(image, axis=0) / 255.0 14 | yolo_output = model.predict(processed_image) 15 | yolo_output = yolo_output[0] 16 | visualize(image, yolo_output, title='yolo prediction', RGB2BGR=True) 17 | -------------------------------------------------------------------------------- /hand_detector/yolo/preprocess/__pycache__/yolo_flag.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/hand_detector/yolo/preprocess/__pycache__/yolo_flag.cpython-36.pyc -------------------------------------------------------------------------------- /hand_detector/yolo/preprocess/__pycache__/yolo_flag.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/hand_detector/yolo/preprocess/__pycache__/yolo_flag.cpython-37.pyc -------------------------------------------------------------------------------- /hand_detector/yolo/preprocess/augmentation.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | import imgaug as ia 5 | import imgaug.augmenters as iaa 6 | from hand_detector.yolo.utils.utils import visualize 7 | from hand_detector.yolo.preprocess.yolo_flag import Flag 8 | from hand_detector.yolo.preprocess.labelgen import label_generator 9 | 10 | f = Flag() 11 | size = f.target_size 12 | 13 | 14 | def augment(image, bbox): 15 | x = random.randint(-60, 60) 16 | y = random.randint(-60, 60) 17 | aug = iaa.Sequential([iaa.AdditiveGaussianNoise(scale=random.uniform(.001, .01) * 255), # gaussian noise 18 | iaa.Multiply(random.uniform(0.5, 1.5)), # brightness 19 | iaa.Affine(translate_px={"x": x, "y": y}, # translation 20 | scale=random.uniform(0.5, 1.5), # zoom in and out 21 | rotate=random.uniform(-25, 25), # rotation 22 | shear=random.uniform(-5, 5), # shear transformation 23 | cval=(0, 255))]) # fill the empty space with color 24 | 25 | aug.add(iaa.Salt(.001)) 26 | bbs = ia.BoundingBoxesOnImage([ia.BoundingBox(x1=bbox[0], y1=bbox[1], x2=bbox[2], y2=bbox[3])], shape=image.shape) 27 | aug = aug.to_deterministic() 28 | image_aug = aug.augment_image(image) 29 | bbs_aug = aug.augment_bounding_boxes([bbs])[0] 30 | b = bbs_aug.bounding_boxes 31 | bbs_aug = [b[0].x1, b[0].y1, b[0].x2, b[0].y2] 32 | bbs_aug = np.asarray(bbs_aug) 33 | 34 | bbs_aug[0] = bbs_aug[0] if bbs_aug[0] > 0 else 0 35 | bbs_aug[1] = bbs_aug[1] if bbs_aug[1] > 0 else 0 36 | bbs_aug[2] = bbs_aug[2] if bbs_aug[2] < size else size 37 | bbs_aug[3] = bbs_aug[3] if bbs_aug[3] < size else size 38 | return image_aug, bbs_aug 39 | 40 | 41 | def flip(image, bbox): 42 | aug = iaa.Sequential([iaa.Fliplr(1.0)]) 43 | 44 | bbs = ia.BoundingBoxesOnImage([ 45 | ia.BoundingBox(x1=bbox[0], y1=bbox[1], x2=bbox[2], y2=bbox[3])], shape=image.shape) 46 | 47 | aug = aug.to_deterministic() 48 | image_aug = aug.augment_image(image) 49 | image_aug = image_aug.copy() 50 | bbs_aug = aug.augment_bounding_boxes([bbs])[0] 51 | b = bbs_aug.bounding_boxes 52 | bbs_aug = [b[0].x1, b[0].y1, b[0].x2, b[0].y2] 53 | 
bbs_aug = np.asarray(bbs_aug) 54 | 55 | bbs_aug[0] = bbs_aug[0] if bbs_aug[0] > 0 else 0 56 | bbs_aug[1] = bbs_aug[1] if bbs_aug[1] > 0 else 0 57 | bbs_aug[2] = bbs_aug[2] if bbs_aug[2] < size else size 58 | bbs_aug[3] = bbs_aug[3] if bbs_aug[3] < size else size 59 | return image_aug, bbs_aug 60 | 61 | 62 | if __name__ == '__main__': 63 | dir = '../../../../EgoGesture Dataset/' 64 | img_name = 'BasketballField_Single_Eight_color_91.jpg' 65 | img, box = label_generator(dir, img_name, type='') 66 | 67 | # image_aug, bbox_aug = img, box 68 | img_aug, bbox_aug = augment(image=img, bbox=box) 69 | # image_aug, bbox_aug = flip(image=img, bbox=box) 70 | # image_aug, bbox_aug = augment(image=image_aug, bbox=bbox_aug) 71 | bbox_aug = [int(b) for b in bbox_aug] 72 | 73 | x1, y1, x2, y2 = bbox_aug[0], bbox_aug[1], bbox_aug[2], bbox_aug[3] 74 | img_aug = cv2.rectangle(img_aug, (x1, y1), (x2, y2), f.box_color, 3) 75 | visualize(image=img_aug, title='visualize', RGB2BGR=True) 76 | -------------------------------------------------------------------------------- /hand_detector/yolo/preprocess/labelgen.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | from hand_detector.yolo.utils.utils import visualize 6 | from hand_detector.yolo.preprocess.yolo_flag import Flag 7 | 8 | f = Flag() 9 | 10 | grid = f.grid 11 | grid_size = f.grid_size 12 | target_size = f.target_size 13 | threshold = f.threshold 14 | 15 | df_train = pd.read_csv('custom_dataset/train_labels.csv') 16 | df_valid = pd.read_csv('custom_dataset/valid_labels.csv') 17 | df_test = pd.read_csv('custom_dataset/test_labels.csv') 18 | # def label_generator(directory, image_name, type=''): 19 | # folder_name = find_folder(image_name) 20 | # image = plt.imread(directory + folder_name + type + '/' + image_name) 21 | # image = cv2.resize(image, (target_size, target_size)) 22 | 23 | # file = open(directory + 'label/' + folder_name + '.txt') 24 | # lines = file.readlines() 25 | # file.close() 26 | 27 | # label = [] 28 | # for line in lines: 29 | # line = line.strip().split() 30 | # name = line[0].split('/')[3] 31 | # if image_name == name: 32 | # label = line[1:] 33 | # break 34 | 35 | # """ bbox: top-left and bottom-right coordinate of the bounding box """ 36 | # label = label[0:4] 37 | # bbox = [float(element) * target_size for element in label] 38 | # bbox = np.array(bbox) 39 | # return image, bbox 40 | 41 | def label_generator(directory, folder, image_name): 42 | image = plt.imread(directory + folder + '/' + image_name) 43 | #image = cv2.resize(image, (target_size, target_size)) 44 | 45 | if folder == 'train': 46 | df = df_train 47 | elif folder == 'valid': 48 | df = df_valid 49 | elif folder == 'test': 50 | df = df_test 51 | else: 52 | exit(1) 53 | """ bbox: top-left and bottom-right coordinate of the bounding box """ 54 | label = df[df.filename == image_name[:-4]].iloc[0][1:].tolist() 55 | bbox = [float(element) * target_size for element in label] 56 | bbox = np.array(bbox) 57 | return image, bbox 58 | 59 | 60 | def bbox_to_grid(bbox): 61 | if bbox[0] == 0 and bbox[1] == 0 and bbox[2] == 0 and bbox[3] == 0: 62 | output = np.zeros(shape=(grid, grid, 5)) 63 | else: 64 | output = np.zeros(shape=(grid, grid, 5)) 65 | center = ((bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2) 66 | i, j = int(np.floor(center[0] / grid_size)), int(np.floor(center[1] / grid_size)) 67 | i = i if i < f.grid else f.grid - 1 68 | j = j if j < 
f.grid else f.grid - 1 69 | output[i, j, 0] = 1 70 | output[i, j, 1:] = bbox / f.target_size 71 | return output 72 | 73 | 74 | if __name__ == '__main__': 75 | img_name = 'v_1_frame_000418.jpg' 76 | dir = 'custom_dataset/' 77 | img, box = label_generator(directory=dir, folder= 'valid', image_name=img_name) 78 | yolo_out = bbox_to_grid(box) 79 | visualize(img, yolo_out, title='', RGB2BGR=True) 80 | -------------------------------------------------------------------------------- /hand_detector/yolo/preprocess/yolo_flag.py: -------------------------------------------------------------------------------- 1 | class Flag: 2 | def __init__(self): 3 | self.grid = 7 4 | self.grid_size = 32 5 | self.target_size = 224 6 | self.threshold = 0.5 7 | self.alpha = 0.5 8 | self.line_color = (18, 203, 227) 9 | self.grid_color = (81, 189, 42) 10 | self.box_color = (235, 26, 158) 11 | -------------------------------------------------------------------------------- /hand_detector/yolo/train.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | import tensorflow as tf 3 | from keras.optimizers import Adam 4 | from keras.callbacks import ModelCheckpoint 5 | from hand_detector.yolo.darknet import model 6 | from hand_detector.yolo.utils.info import data_info 7 | from hand_detector.yolo.generator import train_generator, valid_generator 8 | 9 | 10 | def loss_function(y_true, y_pred): 11 | # binary cross entropy loss 12 | cross_entropy_loss = tf.keras.losses.binary_crossentropy(y_true[:, :, :, 0:1], y_pred[:, :, :, 0:1]) 13 | cross_entropy_loss = tf.reduce_mean(cross_entropy_loss) 14 | # mean square loss 15 | square_diff = tf.math.squared_difference(y_true[:, :, :, 1:5], y_pred[:, :, :, 1:5]) 16 | mask = tf.not_equal(y_true[:, :, :, 1:5], 0) 17 | mask = tf.cast(mask, tf.float32) 18 | coordinate_loss = tf.multiply(square_diff, mask) 19 | coordinate_loss = tf.reduce_sum(coordinate_loss) 20 | loss = cross_entropy_loss + coordinate_loss 21 | return loss 22 | 23 | 24 | # create the model 25 | model = model() 26 | model.summary() 27 | 28 | # compile 29 | adam = Adam(lr=1e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-10, decay=0.0) 30 | model.compile(optimizer=adam, loss={"output": loss_function}, metrics={"output": loss_function}) 31 | 32 | # train 33 | epochs = 10 34 | batch_size = 32 35 | train_set_size = data_info('train') 36 | valid_set_size = data_info('valid') 37 | training_steps_per_epoch = ceil(train_set_size / batch_size) 38 | validation_steps_per_epoch = ceil(valid_set_size / 256) 39 | train_gen = train_generator(batch_size=batch_size) 40 | valid_gen = valid_generator(batch_size=batch_size) 41 | 42 | checkpoints = ModelCheckpoint('weights/weights_{epoch:03d}.h5', save_weights_only=True, period=1) 43 | history = model.fit_generator(train_gen, steps_per_epoch=training_steps_per_epoch, epochs=epochs, verbose=1, 44 | validation_data=valid_gen, validation_steps=validation_steps_per_epoch, 45 | callbacks=[checkpoints], shuffle=True, max_queue_size=128) 46 | 47 | with open('weights/history.txt', 'a+') as f: 48 | print(history.history, file=f) 49 | 50 | print('All Done!') 51 | -------------------------------------------------------------------------------- /hand_detector/yolo/utils/info.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def data_info(type='train'): 5 | dataset_directory = 'custom_dataset/' 6 | folder_names = ['train'] 7 | 8 | if type is 'train': 9 | folder_names = ['train'] 10 | nTrain = 0 
11 | for folder in folder_names: 12 | nTrain = nTrain + len(os.listdir(dataset_directory + folder + '/')) 13 | return nTrain 14 | 15 | elif type is 'valid': 16 | folder_names = ['valid'] 17 | nValid = 0 18 | for folder in folder_names: 19 | nValid = nValid + len(os.listdir(dataset_directory + folder + '/')) 20 | return nValid 21 | 22 | elif type is 'test': 23 | folder_names = ['test'] 24 | nTest = 0 25 | for folder in folder_names: 26 | nTest = nTest + len(os.listdir(dataset_directory + folder + '/')) 27 | return nTest 28 | -------------------------------------------------------------------------------- /hand_detector/yolo/utils/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from hand_detector.yolo.preprocess.yolo_flag import Flag 4 | 5 | f = Flag() 6 | grid = f.grid 7 | grid_size = f.grid_size 8 | alpha = f.alpha 9 | 10 | 11 | def draw_grid(image, bbox): 12 | for i in range(0, grid + 1): 13 | image = cv2.line(image, (0, i * grid_size), (grid * grid_size, i * grid_size), f.line_color, 2) 14 | image = cv2.line(image, (i * grid_size, 0), (i * grid_size, grid * grid_size), f.line_color, 2) 15 | 16 | center = ((bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2) 17 | image = cv2.circle(image, (int(center[0]), int(center[1])), 5, (255, 0, 0), -1) 18 | i, j = int(np.floor(center[0] / grid_size)), int(np.floor(center[1] / grid_size)) 19 | glassy_image = image.copy() 20 | i, j = int(i), int(j) 21 | x = i * grid_size 22 | y = j * grid_size 23 | glassy_image = cv2.rectangle(glassy_image, (x, y), (x + grid_size, y + grid_size), f.grid_color, -1) 24 | image = cv2.addWeighted(glassy_image, alpha, image, 1 - alpha, 0) 25 | return image 26 | 27 | 28 | def visualize(image, yolo_out=None, title='output', RGB2BGR=False): 29 | if yolo_out is not None: 30 | predicting_boxes = yolo_out[:, :, 0] 31 | i, j = np.squeeze(np.where(predicting_boxes == np.amax(predicting_boxes))) 32 | 33 | if predicting_boxes[i, j] >= f.threshold: 34 | bbox = yolo_out[i, j, 1:] * f.target_size 35 | image = draw_grid(image, bbox) 36 | x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) 37 | image = cv2.rectangle(image, (x1, y1), (x2, y2), f.box_color, 2) 38 | 39 | if RGB2BGR: 40 | image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) 41 | 42 | cv2.imshow(title, image) 43 | 44 | if cv2.waitKey(0) & 0xff == 27: 45 | cv2.destroyAllWindows() 46 | -------------------------------------------------------------------------------- /images/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/1.jpg -------------------------------------------------------------------------------- /images/10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/10.jpg -------------------------------------------------------------------------------- /images/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/2.jpg -------------------------------------------------------------------------------- /images/3.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/3.jpg -------------------------------------------------------------------------------- /images/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/4.jpg -------------------------------------------------------------------------------- /images/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/5.jpg -------------------------------------------------------------------------------- /images/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/6.jpg -------------------------------------------------------------------------------- /images/7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/7.jpg -------------------------------------------------------------------------------- /images/8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/8.jpg -------------------------------------------------------------------------------- /images/9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/9.jpg -------------------------------------------------------------------------------- /images/ai_whiteboard.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/ai_whiteboard.gif -------------------------------------------------------------------------------- /images/to_clean.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/to_clean.jpg -------------------------------------------------------------------------------- /images/to_erase.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/to_erase.jpg -------------------------------------------------------------------------------- /images/to_move.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/to_move.jpg -------------------------------------------------------------------------------- /images/to_paint.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/to_paint.jpg -------------------------------------------------------------------------------- /images/to_save.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/preste-ai/AI_whiteboard/edb89ec3621f23de3b3b27133419a72f630c0b1e/images/to_save.jpg -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | def iou(box1, box2): 2 | """Implement the intersection over union (IoU) between box1 and box2 3 | 4 | Arguments: 5 | box1 -- first box, list object with coordinates (x1, y1, x2, y2) 6 | box2 -- second box, list object with coordinates (x1, y1, x2, y2) 7 | """ 8 | 9 | # Calculate the (y1, x1, y2, x2) coordinates of the intersection of box1 and box2. Calculate its Area. 10 | xi1 = max(box1[0], box2[0]) 11 | yi1 = max(box1[1], box2[1]) 12 | xi2 = min(box1[2], box2[2]) 13 | yi2 = min(box1[3], box2[3]) 14 | inter_area = (xi2 - xi1) * (yi2 - yi1) 15 | 16 | # Calculate the Union area by using Formula: Union(A,B) = A + B - Inter(A,B) 17 | box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1]) 18 | box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1]) 19 | union_area = box1_area + box2_area - inter_area 20 | 21 | # compute the IoU 22 | iou = inter_area / union_area 23 | 24 | return iou 25 | 26 | 27 | def get_stat(gt, pr): 28 | n = len(gt) 29 | smth = 0.00000001 30 | tp,tn,fp,fn = 0,0,0,0 31 | for i in range(n): 32 | l1, l2 = gt[i], pr[i] 33 | if (l1 == 1) and (l2 == 1): 34 | tp +=1 35 | elif (l1 == 1) and (l2 == 0): 36 | fp += 1 37 | elif (l1 == 0) and (l2 == 1): 38 | fn += 1 39 | elif (l1 == 0) and (l2 == 0): 40 | tn += 1 41 | 42 | print('\ntp - {},tn - {},fp - {},fn - {}\n'.format(tp,tn,fp,fn)) 43 | acc = (tp + tn) / (tp + tn + fp + fn+smth) 44 | recall = tp / (tp + fp + smth) 45 | precision = tp / (tp + fn + smth) 46 | 47 | return acc, recall, precision, [tp,tn,fp,fn] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | imgaug==0.2.6 2 | matplotlib==3.3.2 3 | numpy==1.18.0 4 | pandas==1.1.4 5 | onnx==1.7.0 6 | tf2onnx==1.7.0 7 | -------------------------------------------------------------------------------- /saved/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /trt_utils.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as cuda 3 | import tensorrt as trt 4 | import numpy as np 5 | 6 | TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) 7 | 8 | class HostDeviceMem(object): 9 | def __init__(self, host_mem, device_mem): 10 | self.host = host_mem 11 | self.device = device_mem 12 | 13 | def __str__(self): 14 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 15 | 16 | def __repr__(self): 17 | return self.__str__() 18 | 19 | def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): 20 | # Transfer input data to the GPU. 21 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 22 | # Run inference. 23 | context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) 24 | # Transfer predictions back from the GPU. 25 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 26 | # Synchronize the stream 27 | stream.synchronize() 28 | # Return only the host outputs. 
29 | return [out.host for out in outputs] 30 | 31 | def allocate_buffers(engine): 32 | inputs = [] 33 | outputs = [] 34 | bindings = [] 35 | stream = cuda.Stream() 36 | for binding in engine: 37 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 38 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 39 | # Allocate host and device buffers 40 | host_mem = cuda.pagelocked_empty(size, dtype) 41 | device_mem = cuda.mem_alloc(host_mem.nbytes) 42 | # Append the device buffer to device bindings. 43 | bindings.append(int(device_mem)) 44 | # Append to the appropriate list. 45 | if engine.binding_is_input(binding): 46 | inputs.append(HostDeviceMem(host_mem, device_mem)) 47 | else: 48 | outputs.append(HostDeviceMem(host_mem, device_mem)) 49 | return inputs, outputs, bindings, stream 50 | 51 | def load_engine(engine_path:str): 52 | with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime: 53 | # Deserialize ICudaEngine 54 | engine = runtime.deserialize_cuda_engine(f.read()) 55 | return engine -------------------------------------------------------------------------------- /weights/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | !/engines -------------------------------------------------------------------------------- /weights/engines/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /yolo_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import tensorflow as tf 4 | config = tf.compat.v1.ConfigProto() 5 | config.gpu_options.allow_growth = True 6 | session = tf.compat.v1.Session(config=config) 7 | 8 | import cv2 9 | from trt_utils import * 10 | from tensorflow.keras.models import load_model 11 | from hand_detector.yolo.darknet import model as yolo_model 12 | from hand_detector.yolo.generator import load_test_images 13 | from hand_detector.yolo.preprocess.yolo_flag import Flag 14 | from metrics import iou, get_stat 15 | 16 | f = Flag() 17 | # TEST DATASET LABELS 18 | df_test = pd.read_csv('custom_dataset/test_labels.csv') 19 | 20 | 21 | def get_test_image(image_name, directory = 'custom_dataset/'): 22 | image = cv2.imread(directory + 'test/' + image_name, cv2.COLOR_BGR2RGB) 23 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 24 | image = cv2.resize(image, (f.target_size, f.target_size)) 25 | processed_image = np.expand_dims(image, axis=0) / 255.0 26 | return processed_image 27 | 28 | 29 | def get_test_bbox(image_name): 30 | 31 | label = df_test[df_test.filename == image_name[:-4]].iloc[0][1:].tolist() 32 | bbox = [float(element) * f.target_size for element in label] 33 | bbox = tuple(bbox) 34 | if bbox[0] == 0 and bbox[1] == 0 and bbox[2] == 0 and bbox[3] == 0: 35 | return None 36 | return bbox 37 | 38 | 39 | def convert_anchor_to_bbox(yolo_out, threshold = 0.8, width=224, height=224): 40 | 41 | grid_pred = yolo_out[:, :, 0] 42 | i, j = np.squeeze(np.where(grid_pred == np.amax(grid_pred))) 43 | 44 | try: 45 | if i.shape[0] > 1 : 46 | i = i[0] 47 | j = j[0] 48 | except: 49 | pass 50 | 51 | if grid_pred[i, j] >= threshold: 52 | bbox = yolo_out[i, j, 1:] 53 | x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3] 54 | # size conversion 
55 | x1 = float(x1 * width) 56 | y1 = float(y1 * height) 57 | x2 = float(x2 * width) 58 | y2 = float(y2 * height) 59 | return (x1, y1, x2, y2) 60 | else: 61 | return None 62 | 63 | 64 | # def show_result(preprocess, pr_bbox, gt_bbox, tmp_iou): 65 | # image = preprocess.astype(np.float32) 66 | # if pr_bbox is not None: 67 | # x1, y1, x2, y2 = int(pr_bbox[0]), int(pr_bbox[1]), int(pr_bbox[2]), int(pr_bbox[3]) 68 | # image = cv2.rectangle(image, (x1, y1), (x2, y2), (0,0,0), 2) 69 | # if gt_bbox is not None: 70 | # x1, y1, x2, y2 = int(gt_bbox[0]), int(gt_bbox[1]), int(gt_bbox[2]), int(gt_bbox[3]) 71 | # image = cv2.rectangle(image, (x1, y1), (x2, y2), (0,255,0), 2) 72 | # 73 | # image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) 74 | # cv2.putText(image, '{:.2f}'.format(tmp_iou), (25,25), cv2.FONT_HERSHEY_SIMPLEX, 1,(0,255,0),2,cv2.LINE_AA) 75 | # cv2.imshow('test_image', image) 76 | 77 | 78 | def run_test(weights = 'weights/yolo.h5', trt_engine = 'weights/engines/model_trained_yolo.fp16.engine', iou_threshold = 0.5, confidence_threshold = 0.8, trt = False, show = True): 79 | 80 | 81 | if trt: 82 | engine = load_engine(trt_engine) 83 | inputs, outputs, bindings, stream = allocate_buffers(engine) 84 | context = engine.create_execution_context() 85 | else: 86 | # create the model 87 | model = yolo_model() 88 | model.load_weights(weights) 89 | 90 | # model.summary() 91 | 92 | # test 93 | list_test_images = load_test_images() 94 | test_set_size = len(list_test_images) 95 | print('Test_set_size : ', test_set_size) 96 | 97 | iou_list = [] 98 | pr_list = [] 99 | gt_list = [] 100 | 101 | for i in range(test_set_size): 102 | print(i) 103 | image_name = list_test_images[i] 104 | preprocess = get_test_image(image_name) 105 | np.copyto(inputs[0].host, preprocess.ravel()) 106 | if trt: 107 | yolo_out = np.array([do_inference(context, 108 | bindings=bindings, 109 | inputs=inputs, 110 | outputs=outputs, 111 | stream=stream) 112 | ]).reshape((1, 7, 7, 5)) 113 | yolo_output = yolo_out[0] 114 | else: 115 | yolo_output = model.predict(preprocess)[0] 116 | 117 | pr_bbox = convert_anchor_to_bbox(yolo_output, threshold = confidence_threshold, width=f.target_size, height=f.target_size) 118 | gt_bbox = get_test_bbox(image_name) 119 | 120 | 121 | if gt_bbox is None and pr_bbox is None: 122 | pr_list.append(0) 123 | gt_list.append(0) 124 | tmp_iou = -1 125 | 126 | elif gt_bbox is None and pr_bbox is not None: 127 | pr_list.append(1) 128 | gt_list.append(0) 129 | iou_list.append(0) 130 | tmp_iou = 0 131 | 132 | elif gt_bbox is not None and pr_bbox is None: 133 | pr_list.append(0) 134 | gt_list.append(1) 135 | iou_list.append(0) 136 | tmp_iou = 0 137 | elif gt_bbox is not None and pr_bbox is not None: 138 | gt_list.append(1) 139 | tmp_iou = iou(gt_bbox, pr_bbox) 140 | 141 | if tmp_iou > iou_threshold: 142 | pr_list.append(1) 143 | else: 144 | pr_list.append(0) 145 | 146 | iou_list.append(tmp_iou) 147 | 148 | #if show: 149 | # show_result(preprocess[0], pr_bbox, gt_bbox, tmp_iou) 150 | # if cv2.waitKey(60) & 0xff == 27: 151 | # cv2.destroyAllWindows() 152 | # break 153 | avg_iou = sum(iou_list)/len(iou_list) 154 | acc, recall, precision, _ = get_stat(gt_list, pr_list) 155 | 156 | print('Avg iou : {:.2f}'.format(avg_iou*100)) 157 | print('Accuracy : {:.2f} %'.format(acc*100)) 158 | print('Recall : {:.2f} %'.format(recall*100)) 159 | print('Precision : {:.2f} %'.format(precision*100)) 160 | 161 | if __name__ == '__main__': 162 | print('\n\n --------- yolo -----------') 163 | run_test(weights = 'weights/yolo.h5', trt_engine = 
'weights/engines/model_trained_yolo.fp32.engine', iou_threshold = 0.5, confidence_threshold = 0.8, trt = True) 164 | -------------------------------------------------------------------------------- /yolo_train.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | import tensorflow as tf 3 | config = tf.compat.v1.ConfigProto() 4 | config.gpu_options.allow_growth = True 5 | session = tf.compat.v1.Session(config=config) 6 | 7 | 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.callbacks import ModelCheckpoint 10 | 11 | from hand_detector.yolo.darknet import model as yolo_model 12 | from hand_detector.yolo.utils.info import data_info 13 | from hand_detector.yolo.generator import train_generator, valid_generator 14 | 15 | 16 | def loss_function(y_true, y_pred): 17 | 18 | # binary cross entropy loss 19 | cross_entropy_loss = tf.keras.losses.binary_crossentropy(y_true[:, :, :, 0:1], y_pred[:, :, :, 0:1]) 20 | cross_entropy_loss = tf.reduce_mean(cross_entropy_loss) 21 | # mean square loss 22 | square_diff = tf.math.squared_difference(y_true[:, :, :, 1:5], y_pred[:, :, :, 1:5]) 23 | mask = tf.not_equal(y_true[:, :, :, 1:5], 0) 24 | mask = tf.cast(mask, tf.float32) 25 | coordinate_loss = tf.multiply(square_diff, mask) 26 | coordinate_loss = tf.reduce_sum(coordinate_loss) 27 | loss = cross_entropy_loss + coordinate_loss 28 | return loss 29 | 30 | 31 | # create the model 32 | model = yolo_model() 33 | model.load_weights('weights/yolo.h5') 34 | model.summary() 35 | # compile 36 | adam = Adam(lr=1e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-10) 37 | model.compile(optimizer=adam, loss={"output": loss_function}, metrics={"output": loss_function}) 38 | 39 | # train 40 | epochs = 100 41 | batch_size = 32 42 | train_set_size = data_info('train') 43 | valid_set_size = data_info('valid') 44 | training_steps_per_epoch = ceil(train_set_size / batch_size) 45 | validation_steps_per_epoch = ceil(valid_set_size / batch_size) 46 | print('training_steps_per_epoch: ', training_steps_per_epoch) 47 | print('validation_steps_per_epoch: ', validation_steps_per_epoch) 48 | train_gen = train_generator(batch_size=batch_size) 49 | valid_gen = valid_generator(batch_size=batch_size) 50 | 51 | checkpoints = ModelCheckpoint('weights/yolo_train_best_{epoch:03d}.h5', save_weights_only=True, monitor='val_loss_function', 52 | mode='min', save_best_only=True) 53 | history = model.fit_generator(train_gen, steps_per_epoch=training_steps_per_epoch, epochs=epochs, verbose=1, 54 | validation_data=valid_gen, validation_steps=validation_steps_per_epoch, 55 | callbacks=[checkpoints], shuffle=True, max_queue_size=10) 56 | 57 | with open('weights/history.txt', 'a+') as f: 58 | print(history.history, file=f) 59 | 60 | print('All Done!') 61 | --------------------------------------------------------------------------------
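
For reference, below is a minimal, self-contained sketch (not a file from the repository) of how the helpers in `trt_utils.py` fit together to run the hand-detector engine on a single frame, mirroring the flow used in `yolo_test.py`. It assumes a serialized engine exists at `weights/engines/model_trained_yolo.fp16.engine` (the default path used in `yolo_test.py`) and that the detector outputs the 7x7x5 grid used throughout the training and test scripts; the image path and threshold are illustrative.

```python
# Minimal single-image TensorRT inference sketch using the trt_utils helpers.
# Assumption: the engine path and input image below are illustrative examples,
# not values fixed by the repository.
import cv2
import numpy as np

from trt_utils import load_engine, allocate_buffers, do_inference
from hand_detector.yolo.preprocess.yolo_flag import Flag

f = Flag()  # grid=7, grid_size=32, target_size=224, threshold=0.5

# Deserialize the engine once and allocate host/device buffers for its bindings.
engine = load_engine('weights/engines/model_trained_yolo.fp16.engine')
inputs, outputs, bindings, stream = allocate_buffers(engine)
context = engine.create_execution_context()

# Preprocess the frame the same way the Keras model was trained:
# RGB, 224x224, values scaled to [0, 1].
image = cv2.imread('images/1.jpg')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = cv2.resize(image, (f.target_size, f.target_size))
processed = np.expand_dims(image, axis=0).astype(np.float32) / 255.0

# Copy the flattened input into the page-locked host buffer and run inference.
np.copyto(inputs[0].host, processed.ravel())
yolo_out = do_inference(context, bindings=bindings, inputs=inputs,
                        outputs=outputs, stream=stream)
yolo_out = np.asarray(yolo_out).reshape((f.grid, f.grid, 5))

# Decode the 7x7x5 grid: channel 0 is the cell confidence, channels 1:5 hold
# the normalized (x1, y1, x2, y2) box predicted by the responsible cell.
conf = yolo_out[:, :, 0]
i, j = np.unravel_index(np.argmax(conf), conf.shape)
if conf[i, j] >= f.threshold:
    x1, y1, x2, y2 = (yolo_out[i, j, 1:] * f.target_size).astype(int)
    print('hand at', (x1, y1, x2, y2), 'confidence', float(conf[i, j]))
else:
    print('no hand detected')
```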