├── .ipynb_checkpoints └── Untitled-checkpoint.ipynb ├── .vscode └── settings.json ├── DetectionToolKit.py ├── FaceToolKit.py ├── IC_checkpoints.keras ├── IC_logs ├── events.out.tfevents.1552908242.ghost-pc └── events.out.tfevents.1554734574.ghost-pc ├── MAIN_RUN.py ├── README.md ├── Sample Videos ├── lol1.mp4 ├── lol2.mp4 ├── lol3.avi ├── lol4.mp4 └── test.mp4 ├── Untitled.ipynb ├── __pycache__ ├── DetectionToolKit.cpython-35.pyc ├── DetectionToolKit.cpython-36.pyc ├── FaceToolKit.cpython-35.pyc ├── FaceToolKit.cpython-36.pyc ├── cache.cpython-35.pyc ├── cache.cpython-36.pyc ├── caption_tune.cpython-35.pyc ├── caption_tune.cpython-36.pyc ├── coco.cpython-35.pyc ├── coco.cpython-36.pyc ├── download.cpython-35.pyc ├── download.cpython-36.pyc ├── f_part.cpython-36.pyc ├── faceadd.cpython-36.pyc ├── fr_utils.cpython-36.pyc ├── gensound.cpython-35.pyc ├── gensound.cpython-36.pyc ├── inception_blocks_v2.cpython-36.pyc └── p_part.cpython-36.pyc ├── cache.py ├── caption_tune.py ├── coco.py ├── d2.mp4 ├── detection ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ └── __init__.cpython-36.pyc └── mtcnn │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── __init__.cpython-36.pyc │ ├── detect_face.cpython-35.pyc │ └── detect_face.cpython-36.pyc │ ├── det1.npy │ ├── det2.npy │ ├── det3.npy │ └── detect_face.py ├── digivision.py ├── digivision2.py ├── download.py ├── f_part.py ├── faceadd.py ├── facenet ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── __init__.cpython-36.pyc │ ├── face.cpython-35.pyc │ └── face.cpython-36.pyc └── face.py ├── gensound.py ├── gensoundgtts.py ├── haarcascade_frontalface_default.xml ├── images ├── Andrew.jpg ├── Capture.JPG ├── Capture1.JPG ├── Capture2.JPG └── andrew.jpg ├── models └── 20180204-160909 │ ├── 20180204-16090.pb │ ├── checkpoint │ ├── model-20180204-160909.ckpt-264000.data-00000-of-00001 │ ├── model-20180204-160909.ckpt-264000.index │ ├── model-20180204-160909.ckpt-265000.data-00000-of-00001 │ ├── model-20180204-160909.ckpt-265000.index │ ├── model-20180204-160909.ckpt-266000.data-00000-of-00001 │ ├── model-20180204-160909.ckpt-266000.index │ └── model-20180204-160909.meta └── p_part.py /.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "- Data loaded from cache-file: C:\\Users\\User\\Desktop\\image-cap\\data\\coco\\records_train.pkl\n", 13 | "- Data loaded from cache-file: C:\\Users\\User\\Desktop\\image-cap\\data\\coco\\records_val.pkl\n", 14 | "Processing 118287 images in training-set. \n", 15 | "- Data loaded from cache-file: C:\\Users\\User\\Desktop\\image-cap\\data\\coco\\transfer_values_train.pkl\n", 16 | "Processing 5000 images in validation-set. 
\n", 17 | "- Data loaded from cache-file: C:\\Users\\User\\Desktop\\image-cap\\data\\coco\\transfer_values_val.pkl\n", 18 | "Model directory: ./models/20180204-160909/\n", 19 | "Metagraph file: model-20180204-160909.meta\n", 20 | "Checkpoint file: model-20180204-160909.ckpt-266000\n", 21 | "WARNING:tensorflow:The saved meta_graph is possibly from an older release:\n", 22 | "'model_variables' collection should be of type 'byte_list', but instead is of type 'node_list'.\n", 23 | "INFO:tensorflow:Restoring parameters from ./models/20180204-160909/model-20180204-160909.ckpt-266000\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import cv2 as cv\n", 29 | "import warnings\n", 30 | "warnings.filterwarnings(\"ignore\")\n", 31 | "import p_part\n", 32 | "import f_part\n", 33 | "from caption_tune import modcap, face_found_cap, face_not_found_cap\n", 34 | "from gensound import generate_sound\n", 35 | "import tkinter as tk\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "\n", 45 | "def saveface():\n", 46 | " # generate_sound(\"Tell me the name\")\n", 47 | " x1 = speech(\"Tell me the name\")\n", 48 | " print(x1 + ' face saved')\n", 49 | " cv.imwrite(r\"images//\" +\n", 50 | " str(x1) + \".jpg\", save)\n", 51 | " data = {x1: f_part.img_to_encoding(\n", 52 | " \"images//\" + str(x1) + \".jpg\").tolist()}\n", 53 | " f_part.digi_db.insert_one(data)\n", 54 | "\n", 55 | "\n", 56 | "def ignoreface():\n", 57 | " generate_sound(\"Not saved\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 16, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "0.17878464559657153,Tanmay\n", 70 | "known: 1\n", 71 | "i=0\n", 72 | "Tanmay at dist of: 0.17878464559657153\n", 73 | "0.6959873898612923,unknown\n", 74 | "Pls say something....\n", 75 | "Google Audio:no\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "%reload_ext autoreload\n", 81 | "%autoreload 2\n", 82 | "from faceadd import addn,speech\n", 83 | "cap = cv.VideoCapture(0)\n", 84 | "\n", 85 | "while True:\n", 86 | " ret, frame = cap.read()\n", 87 | " facedetect = cv.CascadeClassifier(r'haarcascade_frontalface_default.xml')\n", 88 | " if ret:\n", 89 | " # font = cv.FONT_HERSHEY_SIMPLEX\n", 90 | " cv.imshow(\"Video\", frame)\n", 91 | "\n", 92 | " if cv.waitKey(1) == ord('p'):\n", 93 | "\n", 94 | " cv.imwrite('./test.jpg', frame)\n", 95 | " final_caption = p_part.generate_caption(\n", 96 | " './test.jpg') # create caption\n", 97 | " final_caption = modcap(final_caption) # remove tags\n", 98 | " print(final_caption)\n", 99 | " generate_sound(final_caption) # convert to audio\n", 100 | "\n", 101 | " if cv.waitKey(1) == ord('f'):\n", 102 | " gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)\n", 103 | " faces = facedetect.detectMultiScale(gray, 1.3, 5)\n", 104 | " cv.imwrite('./test.jpg', frame)\n", 105 | " known_detected = 0\n", 106 | " unknown_detected = 0\n", 107 | " known_face_list = []\n", 108 | " known_face_dist = []\n", 109 | " try:\n", 110 | " for x, y, w, h in faces:\n", 111 | " #cv2.imwrite(\"dset//User.\"+str(user)+\".\"+str(sample)+\".jpg\",gray[y:y+h,x:x+w])\n", 112 | " save = frame[y:y+h, x:x+w]\n", 113 | " cv.imwrite('./test.jpg', save)\n", 114 | " dis, name = f_part.who_is_it('./test.jpg')\n", 115 | " print(str(dis)+\",\"+name)\n", 116 | " if name != 'unknown':\n", 117 | " known_face_list.append(name)\n", 118 | " known_face_dist.append(dis)\n", 119 | " known_detected += 1\n", 120 | 
"\n", 121 | " else:\n", 122 | " unknown_detected += 1\n", 123 | "\n", 124 | " if known_detected > 0:\n", 125 | " print(\"known: \" + str(known_detected))\n", 126 | " for i in range(known_detected):\n", 127 | " print('i=' + str(i))\n", 128 | " print(\n", 129 | " known_face_list[i] + \" at dist of: \" + str(known_face_dist[i]))\n", 130 | " temp = face_found_cap(str(known_face_list[i]))\n", 131 | " generate_sound(temp)\n", 132 | " elif unknown_detected == 1:\n", 133 | " temp = face_not_found_cap()\n", 134 | " generate_sound(temp)\n", 135 | " generate_sound(\"Do you want to add this face in your database\")\n", 136 | " addn(save)\n", 137 | "\n", 138 | " elif known_detected == 0 and unknown_detected == 0:\n", 139 | " print(\"No person found\")\n", 140 | " generate_sound(\"No person found!\")\n", 141 | "\n", 142 | " else:\n", 143 | " print(\"Too many people\")\n", 144 | " generate_sound(\"Too many people.\")\n", 145 | " except Exception as e:\n", 146 | " generate_sound(\"No recognisable face found!\")\n", 147 | " print(e)\n", 148 | "\n", 149 | " if cv.waitKey(1) & 0xFF == 27: # ASCII for Esc Key\n", 150 | " break\n", 151 | " else:\n", 152 | " break\n", 153 | "cap.release()\n", 154 | "cv.destroyAllWindows()\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 9, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "cap.release()\n", 164 | "cv.destroyAllWindows()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 6, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "100,unknown\n", 177 | "Pls say something....\n", 178 | "Google Audio:yes\n", 179 | "Pls say something....\n", 180 | "Google Audio:Mayank\n", 181 | "Mayank face saved\n", 182 | "0.0,Mayank\n", 183 | "known: 1\n", 184 | "i=0\n", 185 | "Mayank at dist of: 0.0\n", 186 | "a person is there in front of you\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "'''This part is for testing only\n", 192 | "I repeat this part is only for testing'''\n", 193 | "\n", 194 | "\n", 195 | "\n", 196 | "\n", 197 | "facedetect = cv.CascadeClassifier(r'haarcascade_frontalface_default.xml')\n", 198 | "frame = cv.imread(r'C:\\Users\\User\\Desktop\\projects\\face-recognition-attendance-system-master\\training-data\\s2\\13.jpg')\n", 199 | " # font = cv.FONT_HERSHEY_SIMPLEX\n", 200 | "while True:\n", 201 | " cv.imshow(\"Video\", frame)\n", 202 | " if cv.waitKey(0) == ord('p'):\n", 203 | "\n", 204 | " cv.imwrite('./test.jpg', frame)\n", 205 | " final_caption = p_part.generate_caption(\n", 206 | " './test.jpg') # create caption\n", 207 | " final_caption = modcap(final_caption) # remove tags\n", 208 | " print(final_caption)\n", 209 | " generate_sound(final_caption) # convert to audio\n", 210 | "\n", 211 | " if cv.waitKey(0) == ord('f'):\n", 212 | " gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)\n", 213 | " faces = facedetect.detectMultiScale(gray, 1.3, 5)\n", 214 | " cv.imwrite('./test.jpg', frame)\n", 215 | " known_detected = 0\n", 216 | " unknown_detected = 0\n", 217 | " known_face_list = []\n", 218 | " known_face_dist = []\n", 219 | " try:\n", 220 | " for x, y, w, h in faces:\n", 221 | " #cv2.imwrite(\"dset//User.\"+str(user)+\".\"+str(sample)+\".jpg\",gray[y:y+h,x:x+w])\n", 222 | " save = frame[y:y+h, x:x+w]\n", 223 | " cv.imwrite('./test.jpg', save)\n", 224 | " dis, name = f_part.who_is_it('./test.jpg')\n", 225 | " print(str(dis)+\",\"+name)\n", 226 | " if name != 'unknown':\n", 227 | " known_face_list.append(name)\n", 
228 | " known_face_dist.append(dis)\n", 229 | " known_detected += 1\n", 230 | "\n", 231 | " else:\n", 232 | " unknown_detected += 1\n", 233 | "\n", 234 | " if known_detected > 0:\n", 235 | " print(\"known: \" + str(known_detected))\n", 236 | " for i in range(known_detected):\n", 237 | " print('i=' + str(i))\n", 238 | " print(\n", 239 | " known_face_list[i] + \" at dist of: \" + str(known_face_dist[i]))\n", 240 | " temp = face_found_cap(str(known_face_list[i]))\n", 241 | " generate_sound(temp)\n", 242 | " elif unknown_detected == 1:\n", 243 | " temp = face_not_found_cap()\n", 244 | " generate_sound(temp)\n", 245 | " generate_sound(\"Do you want to add this face in your database\")\n", 246 | " addn(save)\n", 247 | "\n", 248 | " elif known_detected == 0 and unknown_detected == 0:\n", 249 | " print(\"No person found\")\n", 250 | " generate_sound(\"No person found!\")\n", 251 | "\n", 252 | " else:\n", 253 | " print(\"Too many people\")\n", 254 | " generate_sound(\"Too many people.\")\n", 255 | " except Exception as e:\n", 256 | " generate_sound(\"No recognisable face found!\")\n", 257 | " print(e)\n", 258 | "\n", 259 | " if cv.waitKey(0) & 0xFF == 27:\n", 260 | " break# ASCII for Esc Key\n", 261 | "cv.destroyAllWindows()\n" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 10, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "# addn()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 15, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "%reload_ext autoreload\n", 280 | "%autoreload 2\n", 281 | "from faceadd import addn,speech" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | " root = tk.Tk()\n", 291 | " \n", 292 | " large_font = ('Times New Roman', 14)\n", 293 | "\n", 294 | " canvas1 = tk.Canvas(root, width=300, height=200)\n", 295 | " canvas1.pack()\n", 296 | " label = tk.Label(root, text='Enter the Name')\n", 297 | " canvas1.create_window(140, 50, window=label)\n", 298 | " entry1Var = tk.StringVar(value='')\n", 299 | " entry1 = tk.Entry(\n", 300 | " root, textvariable=entry1Var, font=large_font)\n", 301 | " canvas1.create_window(150, 90, window=entry1)\n", 302 | " button1 = tk.Button(text='SAVE', command=saveface)\n", 303 | " button2 = tk.Button(text='IGNORE', command=ignoreface)\n", 304 | " canvas1.create_window(100, 150, window=button1)\n", 305 | " canvas1.create_window(180, 150, window=button2)\n", 306 | "\n", 307 | " root.mainloop()" 308 | ] 309 | } 310 | ], 311 | "metadata": { 312 | "kernelspec": { 313 | "display_name": "Python 3", 314 | "language": "python", 315 | "name": "python3" 316 | }, 317 | "language_info": { 318 | "codemirror_mode": { 319 | "name": "ipython", 320 | "version": 3 321 | }, 322 | "file_extension": ".py", 323 | "mimetype": "text/x-python", 324 | "name": "python", 325 | "nbconvert_exporter": "python", 326 | "pygments_lexer": "ipython3", 327 | "version": "3.6.6" 328 | } 329 | }, 330 | "nbformat": 4, 331 | "nbformat_minor": 2 332 | } 333 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/darkghost/anaconda3/envs/pro/bin/python" 3 | } -------------------------------------------------------------------------------- /DetectionToolKit.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from detection.mtcnn import detect_face 4 | from scipy import misc 5 | 6 | default_color = (0, 255, 0) #BGR 7 | default_thickness = 2 8 | minsize = 20 # minimum size of face 9 | threshold = [ 0.6, 0.7, 0.7 ] # three steps's threshold 10 | factor = 0.709 # scale factor 11 | 12 | margin = 44 13 | image_size = 160 14 | 15 | class Detection: 16 | def __init__(self): 17 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2) 18 | self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 19 | self.pnet, self.rnet, self.onet = detect_face.create_mtcnn(self.session, None) 20 | 21 | def detect(self, img, detect_multiple_faces = True): 22 | bboxes = [] 23 | bounding_boxes, points = detect_face.detect_face( 24 | img, minsize, self.pnet, self.rnet, self.onet, threshold, factor) 25 | nrof_faces = bounding_boxes.shape[0] 26 | if nrof_faces > 0: 27 | det = bounding_boxes[:, 0:4] 28 | det_arr = [] 29 | img_size = np.asarray(img.shape)[0:2] 30 | if nrof_faces > 1: 31 | if detect_multiple_faces: 32 | for i in range(nrof_faces): 33 | det_arr.append(np.squeeze(det[i])) 34 | else: 35 | bounding_box_size = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1]) 36 | img_center = img_size / 2 37 | offsets = np.vstack( 38 | [(det[:, 0] + det[:, 2]) / 2 - img_center[1], (det[:, 1] + det[:, 3]) / 2 - img_center[0]]) 39 | offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) 40 | index = np.argmax( 41 | bounding_box_size - offset_dist_squared * 2.0) # some extra weight on the centering 42 | det_arr.append(det[index, :]) 43 | else: 44 | det_arr.append(np.squeeze(det)) 45 | for i, det in enumerate(det_arr): 46 | det = np.squeeze(det) 47 | bb = np.zeros(4, dtype=np.int32) 48 | bb[0] = np.maximum(det[0] - margin / 2, 0) 49 | bb[1] = np.maximum(det[1] - margin / 2, 0) 50 | bb[2] = np.minimum(det[2] + margin / 2, img_size[1]) 51 | bb[3] = np.minimum(det[3] + margin / 2, img_size[0]) 52 | bboxes.append(bb) 53 | return bboxes 54 | 55 | 56 | 57 | def align(self, img, detect_multiple_faces = True): 58 | faces = [] 59 | bboxes = self.detect(img,False) 60 | for bb in bboxes: 61 | cropped = img[bb[1]:bb[3], bb[0]:bb[2], :] 62 | scaled = misc.imresize(cropped, (image_size, image_size), interp='bilinear') 63 | faces.append(scaled) 64 | return faces 65 | 66 | def crop_detected_face(self, img, bb): 67 | cropped = img[bb[1]:bb[3], bb[0]:bb[2], :] 68 | scaled = misc.imresize(cropped, (image_size, image_size), interp='bilinear') 69 | return scaled 70 | -------------------------------------------------------------------------------- /FaceToolKit.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from facenet import face 4 | 5 | class Verification: 6 | 7 | def __init__(self): 8 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333) 9 | self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 10 | self.images_placeholder = '' 11 | self.embeddings = '' 12 | self.phase_train_placeholder = '' 13 | self.embedding_size = '' 14 | self.session_closed = False 15 | 16 | def __del__(self): 17 | if not self.session_closed: 18 | self.session.close() 19 | 20 | def kill_session(self): 21 | self.session_closed = True 22 | self.session.close() 23 | 24 | def load_model(self, model): 25 | 26 | face.load_model(model, self.session) 27 | 28 | def initial_input_output_tensors(self): 29 
| 30 | self.images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0") 31 | self.embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0") 32 | self.phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0") 33 | self.embedding_size = self.embeddings.get_shape()[1] 34 | 35 | 36 | def img_to_encoding(self, img, image_size): 37 | 38 | image = face.make_image_tensor(img, image_size) 39 | 40 | feed_dict = {self.images_placeholder: image, self.phase_train_placeholder:False } 41 | emb_array = np.zeros((1, self.embedding_size)) 42 | emb_array[0, :] = self.session.run(self.embeddings, feed_dict=feed_dict) 43 | 44 | return np.squeeze(emb_array) 45 | 46 | -------------------------------------------------------------------------------- /IC_checkpoints.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/IC_checkpoints.keras -------------------------------------------------------------------------------- /IC_logs/events.out.tfevents.1552908242.ghost-pc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/IC_logs/events.out.tfevents.1552908242.ghost-pc -------------------------------------------------------------------------------- /IC_logs/events.out.tfevents.1554734574.ghost-pc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/IC_logs/events.out.tfevents.1554734574.ghost-pc -------------------------------------------------------------------------------- /MAIN_RUN.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import p_part 5 | import f_part 6 | from caption_tune import modcap, face_found_cap, face_not_found_cap 7 | from gensoundgtts import generate_sound 8 | import tkinter as tk 9 | 10 | 11 | def saveface(): 12 | x1 = entry1.get() 13 | print(x1 + ' face saved') 14 | root.destroy() 15 | cv.imwrite(r"images//" + 16 | str(x1) + ".jpg", frame) 17 | data = {x1: f_part.img_to_encoding( 18 | "images//" + str(x1) + ".jpg").tolist()} 19 | f_part.digi_db.insert_one(data) 20 | 21 | 22 | def ignoreface(): 23 | print("Not saved") 24 | root.destroy() 25 | 26 | 27 | cap = cv.VideoCapture('Sample Videos/test.mp4') 28 | 29 | while True: 30 | ret, frame = cap.read() 31 | 32 | if ret: 33 | font = cv.FONT_HERSHEY_SIMPLEX 34 | cv.imshow("Video", frame) 35 | 36 | if cv.waitKey(5) == ord('p'): 37 | # print(K.image_data_format()) 38 | cv.imwrite('./test.jpg', frame) 39 | final_caption = p_part.generate_caption('./test.jpg') # create caption 40 | final_caption = modcap(final_caption) # remove tags 41 | print(final_caption) 42 | generate_sound(final_caption) # convert to audio 43 | 44 | if cv.waitKey(5) == ord('f'): 45 | cv.imwrite('./test.jpg', frame) 46 | try: 47 | dis, name = f_part.who_is_it('./test.jpg') 48 | print(str(dis)+","+name) 49 | temp = face_found_cap(name) 50 | generate_sound(temp) 51 | if(name == 'unknown'): 52 | temp = face_not_found_cap() 53 | generate_sound(temp) 54 | 55 | root = tk.Tk() 56 | 57 | large_font = ('Times New Roman', 14) 58 | 59 | canvas1 = tk.Canvas(root, width=300, height=200) 60 | canvas1.pack() 61 | label = tk.Label(root, text='Enter the Name') 
62 | canvas1.create_window(140, 50, window=label) 63 | entry1Var = tk.StringVar(value='') 64 | entry1 = tk.Entry(root, textvariable=entry1Var, font=large_font) 65 | canvas1.create_window(150, 90, window=entry1) 66 | button1 = tk.Button(text='SAVE', command=saveface) 67 | button2 = tk.Button(text='IGNORE', command=ignoreface) 68 | canvas1.create_window(100, 150, window=button1) 69 | canvas1.create_window(180, 150, window=button2) 70 | 71 | root.mainloop() 72 | except: 73 | print("No recognizable face detected") 74 | generate_sound("No recognizable face detected") 75 | 76 | if cv.waitKey(1) & 0xFF == 27: # ASCII for Esc Key 77 | break 78 | else: 79 | break 80 | cap.release() 81 | cv.destroyAllWindows() 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DigiVision 2 | A deep learning based application intended to help visually impaired people. The application automatically generates a textual description of what is happening in front of the camera and conveys it to the person through audio. It can also recognise faces and tell the user whether a known person is standing in front of them. 3 | 4 | ![logo](images/Capture.JPG) 5 | 6 | 7 | # Requirements 8 | * Tensorflow (>1.9) 9 | * Keras 10 | * OpenCV 11 | * Python 3.5+ 12 | * gTTS 13 | * pygame 14 | * pymongo 15 | 16 | # Dataset used 17 | MS COCO 2017 for image processing and captioning. 18 | 19 | The dataset for face recognition was collected manually. 20 | 21 | # Features 22 | 23 | ![logo](images/Capture1.JPG) 24 | ![logo](images/Capture2.JPG) 25 | 26 | # Setup 27 | - Install all the required frameworks, libraries and dependencies as mentioned in Requirements above. 28 | - Download the COCO dataset, if it is not already available, in order to train the model: 29 | - [Train images](http://images.cocodataset.org/zips/train2017.zip) 30 | - [Test images](http://images.cocodataset.org/zips/test2017.zip) 31 | - [Annotations](http://images.cocodataset.org/annotations/annotations_trainval2017.zip) 32 | 33 | Or run: 34 | ``` 35 | python download.py 36 | ``` 37 | - Create your own MongoDB cluster and replace MONGO_URI in line 16 of f_part.py with your own MongoDB connection URI (see the sketch at the end of this section). 38 | - Run the project using one of: 39 | - MAIN_RUN.py (for gTTS audio, with new names added through a Tkinter canvas GUI) 40 | - digivision.py (for single-face detection, with new faces added through a Python GUI) 41 | - digivision2.py (for multi-face detection, with all input/output handled through audio) 42 | 43 | ``` 44 | python <script_name>.py 45 | ``` 46 | - It takes around 90 minutes to process all training images and approximately 5 minutes to process the validation images. 47 | - Training takes around 22 minutes per epoch with a batch size of 256 on an NVIDIA GTX 960M. 48 | - There is no need to re-train on every run; once trained, the weights are loaded automatically.
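For reference, a minimal sketch of the MongoDB configuration that f_part.py is expected to contain (pymongo is already listed in Requirements). Only MONGO_URI and the digi_db collection object are referenced by the rest of the code; the URI placeholder and the database/collection names below are assumptions, so adjust them to match the actual file:

```
# Hypothetical sketch of the MongoDB setup in f_part.py -- adapt names to the real file.
from pymongo import MongoClient

# Line 16 of f_part.py: replace with your own MongoDB connection string.
MONGO_URI = "mongodb+srv://<user>:<password>@<cluster-url>/"

client = MongoClient(MONGO_URI)

# digi_db is used elsewhere as f_part.digi_db.insert_one({name: encoding}),
# so it must be a collection object; database/collection names here are assumptions.
digi_db = client["digivision_db"]["faces"]
```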
49 | 50 | # Demo 51 | [Click here for demo for MAIN_RUN.py](d2.mp4) 52 | -------------------------------------------------------------------------------- /Sample Videos/lol1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/Sample Videos/lol1.mp4 -------------------------------------------------------------------------------- /Sample Videos/lol2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/Sample Videos/lol2.mp4 -------------------------------------------------------------------------------- /Sample Videos/lol3.avi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/Sample Videos/lol3.avi -------------------------------------------------------------------------------- /Sample Videos/lol4.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/Sample Videos/lol4.mp4 -------------------------------------------------------------------------------- /Sample Videos/test.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/Sample Videos/test.mp4 -------------------------------------------------------------------------------- /Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "- Data loaded from cache-file: /mnt/MyDrive/Datasets/image-cap/data/coco/records_train.pkl\n", 13 | "- Data loaded from cache-file: /mnt/MyDrive/Datasets/image-cap/data/coco/records_val.pkl\n", 14 | "Processing 118287 images in training-set. \n", 15 | "- Data loaded from cache-file: /mnt/MyDrive/Datasets/image-cap/data/coco/transfer_values_train.pkl\n", 16 | "Processing 5000 images in validation-set. 
\n", 17 | "- Data loaded from cache-file: /mnt/MyDrive/Datasets/image-cap/data/coco/transfer_values_val.pkl\n", 18 | "Model directory: ./models/20180204-160909/\n", 19 | "Metagraph file: model-20180204-160909.meta\n", 20 | "Checkpoint file: model-20180204-160909.ckpt-266000\n", 21 | "WARNING:tensorflow:The saved meta_graph is possibly from an older release:\n", 22 | "'model_variables' collection should be of type 'byte_list', but instead is of type 'node_list'.\n", 23 | "INFO:tensorflow:Restoring parameters from ./models/20180204-160909/model-20180204-160909.ckpt-266000\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import cv2 as cv\n", 29 | "import warnings\n", 30 | "warnings.filterwarnings(\"ignore\")\n", 31 | "import p_part\n", 32 | "import f_part\n", 33 | "from caption_tune import modcap, face_found_cap, face_not_found_cap\n", 34 | "from gensound import generate_sound\n", 35 | "import tkinter as tk\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "\n", 45 | "def saveface():\n", 46 | " # generate_sound(\"Tell me the name\")\n", 47 | " x1 = speech(\"Tell me the name\")\n", 48 | " print(x1 + ' face saved')\n", 49 | " cv.imwrite(r\"images//\" +\n", 50 | " str(x1) + \".jpg\", save)\n", 51 | " data = {x1: f_part.img_to_encoding(\n", 52 | " \"images//\" + str(x1) + \".jpg\").tolist()}\n", 53 | " f_part.digi_db.insert_one(data)\n", 54 | "\n", 55 | "\n", 56 | "def ignoreface():\n", 57 | " generate_sound(\"Not saved\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "1.1346577982680615,unknown\n", 70 | "Pls say something....\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "%reload_ext autoreload\n", 76 | "%autoreload 2\n", 77 | "from faceadd import addn,speech\n", 78 | "cap = cv.VideoCapture(0)\n", 79 | "\n", 80 | "while True:\n", 81 | " ret, frame = cap.read()\n", 82 | " facedetect = cv.CascadeClassifier(r'haarcascade_frontalface_default.xml')\n", 83 | " if ret:\n", 84 | " # font = cv.FONT_HERSHEY_SIMPLEX\n", 85 | " cv.imshow(\"Video\", frame)\n", 86 | "\n", 87 | " if cv.waitKey(1) == ord('p'):\n", 88 | "\n", 89 | " cv.imwrite('./test.jpg', frame)\n", 90 | " final_caption = p_part.generate_caption(\n", 91 | " './test.jpg') # create caption\n", 92 | " final_caption = modcap(final_caption) # remove tags\n", 93 | " print(final_caption)\n", 94 | " generate_sound(final_caption) # convert to audio\n", 95 | "\n", 96 | " if cv.waitKey(1) == ord('f'):\n", 97 | " gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)\n", 98 | " faces = facedetect.detectMultiScale(gray, 1.3, 5)\n", 99 | " cv.imwrite('./test.jpg', frame)\n", 100 | " known_detected = 0\n", 101 | " unknown_detected = 0\n", 102 | " known_face_list = []\n", 103 | " known_face_dist = []\n", 104 | " try:\n", 105 | " for x, y, w, h in faces:\n", 106 | " #cv2.imwrite(\"dset//User.\"+str(user)+\".\"+str(sample)+\".jpg\",gray[y:y+h,x:x+w])\n", 107 | " save = frame[y:y+h, x:x+w]\n", 108 | " cv.imwrite('./test.jpg', save)\n", 109 | " dis, name = f_part.who_is_it('./test.jpg')\n", 110 | " print(str(dis)+\",\"+name)\n", 111 | " if name != 'unknown':\n", 112 | " known_face_list.append(name)\n", 113 | " known_face_dist.append(dis)\n", 114 | " known_detected += 1\n", 115 | "\n", 116 | " else:\n", 117 | " unknown_detected += 1\n", 118 | "\n", 119 | " if known_detected > 0:\n", 120 | " print(\"known: \" + str(known_detected))\n", 
121 | " for i in range(known_detected):\n", 122 | " print('i=' + str(i))\n", 123 | " print(\n", 124 | " known_face_list[i] + \" at dist of: \" + str(known_face_dist[i]))\n", 125 | " temp = face_found_cap(str(known_face_list[i]))\n", 126 | " generate_sound(temp)\n", 127 | " elif unknown_detected == 1:\n", 128 | " temp = face_not_found_cap()\n", 129 | " generate_sound(temp)\n", 130 | " generate_sound(\"Do you want to add this face in your database\")\n", 131 | " addn(save)\n", 132 | "\n", 133 | " elif known_detected == 0 and unknown_detected == 0:\n", 134 | " print(\"No person found\")\n", 135 | " generate_sound(\"No person found!\")\n", 136 | "\n", 137 | " else:\n", 138 | " print(\"Too many people\")\n", 139 | " generate_sound(\"Too many people.\")\n", 140 | " except Exception as e:\n", 141 | " generate_sound(\"No recognisable face found!\")\n", 142 | " print(e)\n", 143 | "\n", 144 | " if cv.waitKey(1) & 0xFF == 27: # ASCII for Esc Key\n", 145 | " break\n", 146 | " else:\n", 147 | " break\n", 148 | "cap.release()\n", 149 | "cv.destroyAllWindows()\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "cap.release()\n", 159 | "cv.destroyAllWindows()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 6, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "100,unknown\n", 172 | "Pls say something....\n", 173 | "Google Audio:yes\n", 174 | "Pls say something....\n", 175 | "Google Audio:Mayank\n", 176 | "Mayank face saved\n", 177 | "0.0,Mayank\n", 178 | "known: 1\n", 179 | "i=0\n", 180 | "Mayank at dist of: 0.0\n", 181 | "a person is there in front of you\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "'''This part is for testing only\n", 187 | "I repeat this part is only for testing'''\n", 188 | "\n", 189 | "\n", 190 | "\n", 191 | "\n", 192 | "facedetect = cv.CascadeClassifier(r'haarcascade_frontalface_default.xml')\n", 193 | "frame = cv.imread(r'C:\\Users\\User\\Desktop\\projects\\face-recognition-attendance-system-master\\training-data\\s2\\13.jpg')\n", 194 | " # font = cv.FONT_HERSHEY_SIMPLEX\n", 195 | "while True:\n", 196 | " cv.imshow(\"Video\", frame)\n", 197 | " if cv.waitKey(0) == ord('p'):\n", 198 | "\n", 199 | " cv.imwrite('./test.jpg', frame)\n", 200 | " final_caption = p_part.generate_caption(\n", 201 | " './test.jpg') # create caption\n", 202 | " final_caption = modcap(final_caption) # remove tags\n", 203 | " print(final_caption)\n", 204 | " generate_sound(final_caption) # convert to audio\n", 205 | "\n", 206 | " if cv.waitKey(0) == ord('f'):\n", 207 | " gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)\n", 208 | " faces = facedetect.detectMultiScale(gray, 1.3, 5)\n", 209 | " cv.imwrite('./test.jpg', frame)\n", 210 | " known_detected = 0\n", 211 | " unknown_detected = 0\n", 212 | " known_face_list = []\n", 213 | " known_face_dist = []\n", 214 | " try:\n", 215 | " for x, y, w, h in faces:\n", 216 | " #cv2.imwrite(\"dset//User.\"+str(user)+\".\"+str(sample)+\".jpg\",gray[y:y+h,x:x+w])\n", 217 | " save = frame[y:y+h, x:x+w]\n", 218 | " cv.imwrite('./test.jpg', save)\n", 219 | " dis, name = f_part.who_is_it('./test.jpg')\n", 220 | " print(str(dis)+\",\"+name)\n", 221 | " if name != 'unknown':\n", 222 | " known_face_list.append(name)\n", 223 | " known_face_dist.append(dis)\n", 224 | " known_detected += 1\n", 225 | "\n", 226 | " else:\n", 227 | " unknown_detected += 1\n", 228 | "\n", 229 | " if 
known_detected > 0:\n", 230 | " print(\"known: \" + str(known_detected))\n", 231 | " for i in range(known_detected):\n", 232 | " print('i=' + str(i))\n", 233 | " print(\n", 234 | " known_face_list[i] + \" at dist of: \" + str(known_face_dist[i]))\n", 235 | " temp = face_found_cap(str(known_face_list[i]))\n", 236 | " generate_sound(temp)\n", 237 | " elif unknown_detected == 1:\n", 238 | " temp = face_not_found_cap()\n", 239 | " generate_sound(temp)\n", 240 | " generate_sound(\"Do you want to add this face in your database\")\n", 241 | " addn(save)\n", 242 | "\n", 243 | " elif known_detected == 0 and unknown_detected == 0:\n", 244 | " print(\"No person found\")\n", 245 | " generate_sound(\"No person found!\")\n", 246 | "\n", 247 | " else:\n", 248 | " print(\"Too many people\")\n", 249 | " generate_sound(\"Too many people.\")\n", 250 | " except Exception as e:\n", 251 | " generate_sound(\"No recognisable face found!\")\n", 252 | " print(e)\n", 253 | "\n", 254 | " if cv.waitKey(0) & 0xFF == 27:\n", 255 | " break# ASCII for Esc Key\n", 256 | "cv.destroyAllWindows()\n" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 10, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# addn()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 15, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "%reload_ext autoreload\n", 275 | "%autoreload 2\n", 276 | "from faceadd import addn,speech" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | " root = tk.Tk()\n", 286 | " \n", 287 | " large_font = ('Times New Roman', 14)\n", 288 | "\n", 289 | " canvas1 = tk.Canvas(root, width=300, height=200)\n", 290 | " canvas1.pack()\n", 291 | " label = tk.Label(root, text='Enter the Name')\n", 292 | " canvas1.create_window(140, 50, window=label)\n", 293 | " entry1Var = tk.StringVar(value='')\n", 294 | " entry1 = tk.Entry(\n", 295 | " root, textvariable=entry1Var, font=large_font)\n", 296 | " canvas1.create_window(150, 90, window=entry1)\n", 297 | " button1 = tk.Button(text='SAVE', command=saveface)\n", 298 | " button2 = tk.Button(text='IGNORE', command=ignoreface)\n", 299 | " canvas1.create_window(100, 150, window=button1)\n", 300 | " canvas1.create_window(180, 150, window=button2)\n", 301 | "\n", 302 | " root.mainloop()" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 3", 309 | "language": "python", 310 | "name": "python3" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 3 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython3", 322 | "version": "3.6.6" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 2 327 | } 328 | -------------------------------------------------------------------------------- /__pycache__/DetectionToolKit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/DetectionToolKit.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/DetectionToolKit.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/DetectionToolKit.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/FaceToolKit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/FaceToolKit.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/FaceToolKit.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/FaceToolKit.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/cache.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/cache.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/cache.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/cache.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/caption_tune.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/caption_tune.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/caption_tune.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/caption_tune.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/coco.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/coco.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/coco.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/coco.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/download.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/download.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/download.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/download.cpython-36.pyc -------------------------------------------------------------------------------- 
/__pycache__/f_part.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/f_part.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/faceadd.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/faceadd.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/fr_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/fr_utils.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/gensound.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/gensound.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/gensound.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/gensound.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/inception_blocks_v2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/inception_blocks_v2.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/p_part.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/__pycache__/p_part.cpython-36.pyc -------------------------------------------------------------------------------- /cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | 5 | ######################################################################## 6 | 7 | 8 | def cache(cache_path, fn, *args, **kwargs): 9 | """ 10 | Cache-wrapper for a function or class. If the cache-file exists 11 | then the data is reloaded and returned, otherwise the function 12 | is called and the result is saved to cache. The fn-argument can 13 | also be a class instead, in which case an object-instance is 14 | created and saved to the cache-file. 15 | :param cache_path: 16 | File-path for the cache-file. 17 | :param fn: 18 | Function or class to be called. 19 | :param args: 20 | Arguments to the function or class-init. 21 | :param kwargs: 22 | Keyword arguments to the function or class-init. 23 | :return: 24 | The result of calling the function or creating the object-instance. 25 | """ 26 | 27 | # If the cache-file exists. 28 | if os.path.exists(cache_path): 29 | # Load the cached data from the file. 
30 | with open(cache_path, mode='rb') as file: 31 | obj = pickle.load(file) 32 | 33 | print("- Data loaded from cache-file: " + cache_path) 34 | else: 35 | # The cache-file does not exist. 36 | 37 | # Call the function / class-init with the supplied arguments. 38 | obj = fn(*args, **kwargs) 39 | 40 | # Save the data to a cache-file. 41 | with open(cache_path, mode='wb') as file: 42 | pickle.dump(obj, file) 43 | 44 | print("- Data saved to cache-file: " + cache_path) 45 | 46 | return obj 47 | 48 | 49 | ######################################################################## 50 | 51 | 52 | def convert_numpy2pickle(in_path, out_path): 53 | """ 54 | Convert a numpy-file to pickle-file. 55 | The first version of the cache-function used numpy for saving the data. 56 | Instead of re-calculating all the data, you can just convert the 57 | cache-file using this function. 58 | :param in_path: 59 | Input file in numpy-format written using numpy.save(). 60 | :param out_path: 61 | Output file written as a pickle-file. 62 | :return: 63 | Nothing. 64 | """ 65 | 66 | # Load the data using numpy. 67 | data = np.load(in_path) 68 | 69 | # Save the data using pickle. 70 | with open(out_path, mode='wb') as file: 71 | pickle.dump(data, file) 72 | 73 | 74 | ######################################################################## 75 | 76 | if __name__ == '__main__': 77 | # This is a short example of using a cache-file. 78 | 79 | # This is the function that will only get called if the result 80 | # is not already saved in the cache-file. This would normally 81 | # be a function that takes a long time to compute, or if you 82 | # need persistent data for some other reason. 83 | def expensive_function(a, b): 84 | return a * b 85 | 86 | print('Computing expensive_function() ...') 87 | 88 | # Either load the result from a cache-file if it already exists, 89 | # otherwise calculate expensive_function(a=123, b=456) and 90 | # save the result to the cache-file for next time. 91 | result = cache(cache_path='cache_expensive_function.pkl', 92 | fn=expensive_function, a=123, b=456) 93 | 94 | print('result =', result) 95 | 96 | # Newline. 97 | print() 98 | 99 | # This is another example which saves an object to a cache-file. 100 | 101 | # We want to cache an object-instance of this class. 102 | # The motivation is to do an expensive computation only once, 103 | # or if we need to persist the data for some other reason. 104 | class ExpensiveClass: 105 | def __init__(self, c, d): 106 | self.c = c 107 | self.d = d 108 | self.result = c * d 109 | 110 | def print_result(self): 111 | print('c =', self.c) 112 | print('d =', self.d) 113 | print('result = c * d =', self.result) 114 | 115 | print('Creating object from ExpensiveClass() ...') 116 | 117 | # Either load the object from a cache-file if it already exists, 118 | # otherwise make an object-instance ExpensiveClass(c=123, d=456) 119 | # and save the object to the cache-file for the next time. 120 | obj = cache(cache_path='cache_ExpensiveClass.pkl', 121 | fn=ExpensiveClass, c=123, d=456) 122 | 123 | obj.print_result() 124 | 125 | ######################################################################## -------------------------------------------------------------------------------- /caption_tune.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def face_found_cap(text): 4 | cap_list = [ 5 | 'Hey, it is ' + text + '. Say Hello!', 6 | 'I see ' + text + '. Go and say Hello!', 7 | 'I see a familiar face. 
It seems to be ' + text 8 | ] 9 | 10 | return random.choice(cap_list) 11 | 12 | def face_not_found_cap(): 13 | cap_list = [ 14 | 'I bet I have never seen this person in my life before', 15 | 'No, I do not know who this person is.', 16 | 'Unknown person alert.' 17 | ] 18 | 19 | return random.choice(cap_list) 20 | 21 | def modcap(text): 22 | text = text[:-4] 23 | text = text.strip() 24 | if text == "a man in a suit and tie holding a glass of wine": 25 | return "a person standing just in front." 26 | else: 27 | return text 28 | -------------------------------------------------------------------------------- /coco.py: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | # 3 | # Functions for downloading the COCO data-set from the internet 4 | # and loading it into memory. This data-set contains images and 5 | # various associated data such as text-captions describing the images. 6 | # 7 | # http://cocodataset.org 8 | # 9 | # Implemented in Python 3.6 10 | # 11 | # Usage: 12 | # 1) Call set_data_dir() to set the desired storage directory. 13 | # 2) Call maybe_download_and_extract() to download the data-set 14 | # if it is not already located in the given data_dir. 15 | # 3) Call load_records(train=True) and load_records(train=False) 16 | # to load the data-records for the training- and validation sets. 17 | # 5) Use the returned data in your own program. 18 | # 19 | # Format: 20 | # The COCO data-set contains a large number of images and various 21 | # data for each image stored in a JSON-file. 22 | # Functionality is provided for getting a list of image-filenames 23 | # (but not actually loading the images) along with their associated 24 | # data such as text-captions describing the contents of the images. 25 | # 26 | ######################################################################## 27 | # 28 | # This file is part of the TensorFlow Tutorials available at: 29 | # 30 | # https://github.com/Hvass-Labs/TensorFlow-Tutorials 31 | # 32 | # Published under the MIT License. See the file LICENSE for details. 33 | # 34 | # Copyright 2018 by Magnus Erik Hvass Pedersen 35 | # 36 | ######################################################################## 37 | 38 | import json 39 | import os 40 | import download 41 | from cache import cache 42 | 43 | ######################################################################## 44 | 45 | # Directory where you want to download and save the data-set. 46 | # Set this before you start calling any of the functions below. 47 | # Use the function set_data_dir() to also update train_dir and val_dir. 48 | data_dir = "data/coco/" 49 | 50 | # Sub-directories for the training- and validation-sets. 51 | train_dir = "data/coco/train2017" 52 | val_dir = "data/coco/val2017" 53 | 54 | # Base-URL for the data-sets on the internet. 55 | data_url = "http://images.cocodataset.org/" 56 | 57 | 58 | ######################################################################## 59 | # Private helper-functions. 60 | 61 | def _load_records(train=True): 62 | """ 63 | Load the image-filenames and captions 64 | for either the training-set or the validation-set. 65 | """ 66 | 67 | if train: 68 | # Training-set. 69 | filename = "captions_train2017.json" 70 | else: 71 | # Validation-set. 72 | filename = "captions_val2017.json" 73 | 74 | # Full path for the data-file. 75 | path = os.path.join(data_dir, "annotations", filename) 76 | 77 | # Load the file. 
78 | with open(path, "r", encoding="utf-8") as file: 79 | data_raw = json.load(file) 80 | 81 | # Convenience variables. 82 | images = data_raw['images'] 83 | annotations = data_raw['annotations'] 84 | 85 | # Initialize the dict for holding our data. 86 | # The lookup-key is the image-id. 87 | records = dict() 88 | 89 | # Collect all the filenames for the images. 90 | for image in images: 91 | # Get the id and filename for this image. 92 | image_id = image['id'] 93 | filename = image['file_name'] 94 | 95 | # Initialize a new data-record. 96 | record = dict() 97 | 98 | # Set the image-filename in the data-record. 99 | record['filename'] = filename 100 | 101 | # Initialize an empty list of image-captions 102 | # which will be filled further below. 103 | record['captions'] = list() 104 | 105 | # Save the record using the the image-id as the lookup-key. 106 | records[image_id] = record 107 | 108 | # Collect all the captions for the images. 109 | for ann in annotations: 110 | # Get the id and caption for an image. 111 | image_id = ann['image_id'] 112 | caption = ann['caption'] 113 | 114 | # Lookup the data-record for this image-id. 115 | # This data-record should already exist from the loop above. 116 | record = records[image_id] 117 | 118 | # Append the current caption to the list of captions in the 119 | # data-record that was initialized in the loop above. 120 | record['captions'].append(caption) 121 | 122 | # Convert the records-dict to a list of tuples. 123 | records_list = [(key, record['filename'], record['captions']) 124 | for key, record in sorted(records.items())] 125 | 126 | # Convert the list of tuples to separate tuples with the data. 127 | ids, filenames, captions = zip(*records_list) 128 | 129 | return ids, filenames, captions 130 | 131 | 132 | ######################################################################## 133 | # Public functions that you may call to download the data-set from 134 | # the internet and load the data into memory. 135 | 136 | 137 | def set_data_dir(new_data_dir): 138 | """ 139 | Set the base-directory for data-files and then 140 | set the sub-dirs for training and validation data. 141 | """ 142 | 143 | # Ensure we update the global variables. 144 | global data_dir, train_dir, val_dir 145 | 146 | data_dir = new_data_dir 147 | train_dir = os.path.join(new_data_dir, "train2017") 148 | val_dir = os.path.join(new_data_dir, "val2017") 149 | 150 | 151 | def maybe_download_and_extract(): 152 | """ 153 | Download and extract the COCO data-set if the data-files don't 154 | already exist in data_dir. 155 | """ 156 | 157 | # Filenames to download from the internet. 158 | filenames = ["zips/train2017.zip", "zips/val2017.zip", 159 | "annotations/annotations_trainval2017.zip"] 160 | 161 | # Download these files. 162 | for filename in filenames: 163 | # Create the full URL for the given file. 164 | url = data_url + filename 165 | 166 | print("Downloading " + url) 167 | 168 | download.maybe_download_and_extract(url=url, download_dir=data_dir) 169 | 170 | 171 | def load_records(train=True): 172 | """ 173 | Load the data-records for the data-set. This returns the image ids, 174 | filenames and text-captions for either the training-set or validation-set. 175 | 176 | This wraps _load_records() above with a cache, so if the cache-file already 177 | exists then it is loaded instead of processing the original data-file. 178 | 179 | :param train: 180 | Bool whether to load the training-set (True) or validation-set (False). 
181 | :return: 182 | ids, filenames, captions for the images in the data-set. 183 | """ 184 | 185 | if train: 186 | # Cache-file for the training-set data. 187 | cache_filename = "records_train.pkl" 188 | else: 189 | # Cache-file for the validation-set data. 190 | cache_filename = "records_val.pkl" 191 | 192 | # Path for the cache-file. 193 | cache_path = os.path.join(data_dir, cache_filename) 194 | 195 | # If the data-records already exist in a cache-file then load it, 196 | # otherwise call the _load_records() function and save its 197 | # return-values to the cache-file so it can be loaded the next time. 198 | records = cache(cache_path=cache_path, 199 | fn=_load_records, 200 | train=train) 201 | 202 | return records 203 | 204 | ######################################################################## 205 | -------------------------------------------------------------------------------- /d2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/d2.mp4 -------------------------------------------------------------------------------- /detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/__init__.py -------------------------------------------------------------------------------- /detection/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /detection/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /detection/mtcnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/mtcnn/__init__.py -------------------------------------------------------------------------------- /detection/mtcnn/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/mtcnn/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /detection/mtcnn/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/mtcnn/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /detection/mtcnn/__pycache__/detect_face.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/mtcnn/__pycache__/detect_face.cpython-35.pyc 
-------------------------------------------------------------------------------- /detection/mtcnn/__pycache__/detect_face.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/mtcnn/__pycache__/detect_face.cpython-36.pyc -------------------------------------------------------------------------------- /detection/mtcnn/det1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/mtcnn/det1.npy -------------------------------------------------------------------------------- /detection/mtcnn/det2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/mtcnn/det2.npy -------------------------------------------------------------------------------- /detection/mtcnn/det3.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/detection/mtcnn/det3.npy -------------------------------------------------------------------------------- /detection/mtcnn/detect_face.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from six import string_types, iteritems 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | #from math import floor 9 | import cv2 10 | import os 11 | 12 | def layer(op): 13 | """Decorator for composable network layers.""" 14 | 15 | def layer_decorated(self, *args, **kwargs): 16 | # Automatically set a name if not provided. 17 | name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) 18 | # Figure out the layer inputs. 19 | if len(self.terminals) == 0: 20 | raise RuntimeError('No input variables found for layer %s.' % name) 21 | elif len(self.terminals) == 1: 22 | layer_input = self.terminals[0] 23 | else: 24 | layer_input = list(self.terminals) 25 | # Perform the operation and get the output. 26 | layer_output = op(self, layer_input, *args, **kwargs) 27 | # Add to layer LUT. 28 | self.layers[name] = layer_output 29 | # This output is now the input for the next layer. 30 | self.feed(layer_output) 31 | # Return self for chained calls. 32 | return self 33 | 34 | return layer_decorated 35 | 36 | class Network(object): 37 | 38 | def __init__(self, inputs, trainable=True): 39 | # The input nodes for this network 40 | self.inputs = inputs 41 | # The current list of terminal nodes 42 | self.terminals = [] 43 | # Mapping from layer names to layers 44 | self.layers = dict(inputs) 45 | # If true, the resulting variables are set as trainable 46 | self.trainable = trainable 47 | 48 | self.setup() 49 | 50 | def setup(self): 51 | """Construct the network. """ 52 | raise NotImplementedError('Must be implemented by the subclass.') 53 | 54 | def load(self, data_path, session, ignore_missing=False): 55 | """Load network weights. 56 | data_path: The path to the numpy-serialized network weights 57 | session: The current TensorFlow session 58 | ignore_missing: If true, serialized weights for missing layers are ignored. 
59 | """ 60 | data_dict = np.load(data_path, encoding='latin1').item() #pylint: disable=no-member 61 | 62 | for op_name in data_dict: 63 | with tf.variable_scope(op_name, reuse=True): 64 | for param_name, data in iteritems(data_dict[op_name]): 65 | try: 66 | var = tf.get_variable(param_name) 67 | session.run(var.assign(data)) 68 | except ValueError: 69 | if not ignore_missing: 70 | raise 71 | 72 | def feed(self, *args): 73 | """Set the input(s) for the next operation by replacing the terminal nodes. 74 | The arguments can be either layer names or the actual layers. 75 | """ 76 | assert len(args) != 0 77 | self.terminals = [] 78 | for fed_layer in args: 79 | if isinstance(fed_layer, string_types): 80 | try: 81 | fed_layer = self.layers[fed_layer] 82 | except KeyError: 83 | raise KeyError('Unknown layer name fed: %s' % fed_layer) 84 | self.terminals.append(fed_layer) 85 | return self 86 | 87 | def get_output(self): 88 | """Returns the current network output.""" 89 | return self.terminals[-1] 90 | 91 | def get_unique_name(self, prefix): 92 | """Returns an index-suffixed unique name for the given prefix. 93 | This is used for auto-generating layer names based on the type-prefix. 94 | """ 95 | ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1 96 | return '%s_%d' % (prefix, ident) 97 | 98 | def make_var(self, name, shape): 99 | """Creates a new TensorFlow variable.""" 100 | return tf.get_variable(name, shape, trainable=self.trainable) 101 | 102 | def validate_padding(self, padding): 103 | """Verifies that the padding is one of the supported ones.""" 104 | assert padding in ('SAME', 'VALID') 105 | 106 | @layer 107 | def conv(self, 108 | inp, 109 | k_h, 110 | k_w, 111 | c_o, 112 | s_h, 113 | s_w, 114 | name, 115 | relu=True, 116 | padding='SAME', 117 | group=1, 118 | biased=True): 119 | # Verify that the padding is acceptable 120 | self.validate_padding(padding) 121 | # Get the number of channels in the input 122 | c_i = int(inp.get_shape()[-1]) 123 | # Verify that the grouping parameter is valid 124 | assert c_i % group == 0 125 | assert c_o % group == 0 126 | # Convolution for a given input and kernel 127 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 128 | with tf.variable_scope(name) as scope: 129 | kernel = self.make_var('weights', shape=[k_h, k_w, c_i // group, c_o]) 130 | # This is the common-case. Convolve the input without any further complications. 131 | output = convolve(inp, kernel) 132 | # Add the biases 133 | if biased: 134 | biases = self.make_var('biases', [c_o]) 135 | output = tf.nn.bias_add(output, biases) 136 | if relu: 137 | # ReLU non-linearity 138 | output = tf.nn.relu(output, name=scope.name) 139 | return output 140 | 141 | @layer 142 | def prelu(self, inp, name): 143 | with tf.variable_scope(name): 144 | i = int(inp.get_shape()[-1]) 145 | alpha = self.make_var('alpha', shape=(i,)) 146 | output = tf.nn.relu(inp) + tf.multiply(alpha, -tf.nn.relu(-inp)) 147 | return output 148 | 149 | @layer 150 | def max_pool(self, inp, k_h, k_w, s_h, s_w, name, padding='SAME'): 151 | self.validate_padding(padding) 152 | return tf.nn.max_pool(inp, 153 | ksize=[1, k_h, k_w, 1], 154 | strides=[1, s_h, s_w, 1], 155 | padding=padding, 156 | name=name) 157 | 158 | @layer 159 | def fc(self, inp, num_out, name, relu=True): 160 | with tf.variable_scope(name): 161 | input_shape = inp.get_shape() 162 | if input_shape.ndims == 4: 163 | # The input is spatial. Vectorize it first. 
164 | dim = 1 165 | for d in input_shape[1:].as_list(): 166 | dim *= int(d) 167 | feed_in = tf.reshape(inp, [-1, dim]) 168 | else: 169 | feed_in, dim = (inp, input_shape[-1].value) 170 | weights = self.make_var('weights', shape=[dim, num_out]) 171 | biases = self.make_var('biases', [num_out]) 172 | op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b 173 | fc = op(feed_in, weights, biases, name=name) 174 | return fc 175 | 176 | 177 | """ 178 | Multi dimensional softmax, 179 | refer to https://github.com/tensorflow/tensorflow/issues/210 180 | compute softmax along the dimension of target 181 | the native softmax only supports batch_size x dimension 182 | """ 183 | @layer 184 | def softmax(self, target, axis, name=None): 185 | max_axis = tf.reduce_max(target, axis, keepdims=True) 186 | target_exp = tf.exp(target-max_axis) 187 | normalize = tf.reduce_sum(target_exp, axis, keepdims=True) 188 | softmax = tf.div(target_exp, normalize, name) 189 | return softmax 190 | 191 | class PNet(Network): 192 | def setup(self): 193 | (self.feed('data') #pylint: disable=no-value-for-parameter, no-member 194 | .conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1') 195 | .prelu(name='PReLU1') 196 | .max_pool(2, 2, 2, 2, name='pool1') 197 | .conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2') 198 | .prelu(name='PReLU2') 199 | .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3') 200 | .prelu(name='PReLU3') 201 | .conv(1, 1, 2, 1, 1, relu=False, name='conv4-1') 202 | .softmax(3,name='prob1')) 203 | 204 | (self.feed('PReLU3') #pylint: disable=no-value-for-parameter 205 | .conv(1, 1, 4, 1, 1, relu=False, name='conv4-2')) 206 | 207 | class RNet(Network): 208 | def setup(self): 209 | (self.feed('data') #pylint: disable=no-value-for-parameter, no-member 210 | .conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1') 211 | .prelu(name='prelu1') 212 | .max_pool(3, 3, 2, 2, name='pool1') 213 | .conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2') 214 | .prelu(name='prelu2') 215 | .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') 216 | .conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3') 217 | .prelu(name='prelu3') 218 | .fc(128, relu=False, name='conv4') 219 | .prelu(name='prelu4') 220 | .fc(2, relu=False, name='conv5-1') 221 | .softmax(1,name='prob1')) 222 | 223 | (self.feed('prelu4') #pylint: disable=no-value-for-parameter 224 | .fc(4, relu=False, name='conv5-2')) 225 | 226 | class ONet(Network): 227 | def setup(self): 228 | (self.feed('data') #pylint: disable=no-value-for-parameter, no-member 229 | .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1') 230 | .prelu(name='prelu1') 231 | .max_pool(3, 3, 2, 2, name='pool1') 232 | .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2') 233 | .prelu(name='prelu2') 234 | .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') 235 | .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3') 236 | .prelu(name='prelu3') 237 | .max_pool(2, 2, 2, 2, name='pool3') 238 | .conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4') 239 | .prelu(name='prelu4') 240 | .fc(256, relu=False, name='conv5') 241 | .prelu(name='prelu5') 242 | .fc(2, relu=False, name='conv6-1') 243 | .softmax(1, name='prob1')) 244 | 245 | (self.feed('prelu5') #pylint: disable=no-value-for-parameter 246 | .fc(4, relu=False, name='conv6-2')) 247 | 248 | (self.feed('prelu5') #pylint: disable=no-value-for-parameter 249 | .fc(10, relu=False, name='conv6-3')) 250 | 251 | def create_mtcnn(sess, model_path): 252 | if not 
model_path: 253 | model_path,_ = os.path.split(os.path.realpath(__file__)) 254 | 255 | with tf.variable_scope('pnet'): 256 | data = tf.placeholder(tf.float32, (None,None,None,3), 'input') 257 | pnet = PNet({'data':data}) 258 | pnet.load(os.path.join(model_path, 'det1.npy'), sess) 259 | with tf.variable_scope('rnet'): 260 | data = tf.placeholder(tf.float32, (None,24,24,3), 'input') 261 | rnet = RNet({'data':data}) 262 | rnet.load(os.path.join(model_path, 'det2.npy'), sess) 263 | with tf.variable_scope('onet'): 264 | data = tf.placeholder(tf.float32, (None,48,48,3), 'input') 265 | onet = ONet({'data':data}) 266 | onet.load(os.path.join(model_path, 'det3.npy'), sess) 267 | 268 | pnet_fun = lambda img : sess.run(('pnet/conv4-2/BiasAdd:0', 'pnet/prob1:0'), feed_dict={'pnet/input:0':img}) 269 | rnet_fun = lambda img : sess.run(('rnet/conv5-2/conv5-2:0', 'rnet/prob1:0'), feed_dict={'rnet/input:0':img}) 270 | onet_fun = lambda img : sess.run(('onet/conv6-2/conv6-2:0', 'onet/conv6-3/conv6-3:0', 'onet/prob1:0'), feed_dict={'onet/input:0':img}) 271 | return pnet_fun, rnet_fun, onet_fun 272 | 273 | def detect_face(img, minsize, pnet, rnet, onet, threshold, factor): 274 | """Detects faces in an image, and returns bounding boxes and points for them. 275 | img: input image 276 | minsize: minimum faces' size 277 | pnet, rnet, onet: caffemodel 278 | threshold: threshold=[th1, th2, th3], th1-3 are three steps's threshold 279 | factor: the factor used to create a scaling pyramid of face sizes to detect in the image. 280 | """ 281 | factor_count=0 282 | total_boxes=np.empty((0,9)) 283 | points=np.empty(0) 284 | h=img.shape[0] 285 | w=img.shape[1] 286 | minl=np.amin([h, w]) 287 | m=12.0/minsize 288 | minl=minl*m 289 | # create scale pyramid 290 | scales=[] 291 | while minl>=12: 292 | scales += [m*np.power(factor, factor_count)] 293 | minl = minl*factor 294 | factor_count += 1 295 | 296 | # first stage 297 | for scale in scales: 298 | hs=int(np.ceil(h*scale)) 299 | ws=int(np.ceil(w*scale)) 300 | im_data = imresample(img, (hs, ws)) 301 | im_data = (im_data-127.5)*0.0078125 302 | img_x = np.expand_dims(im_data, 0) 303 | img_y = np.transpose(img_x, (0,2,1,3)) 304 | out = pnet(img_y) 305 | out0 = np.transpose(out[0], (0,2,1,3)) 306 | out1 = np.transpose(out[1], (0,2,1,3)) 307 | 308 | boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0]) 309 | 310 | # inter-scale nms 311 | pick = nms(boxes.copy(), 0.5, 'Union') 312 | if boxes.size>0 and pick.size>0: 313 | boxes = boxes[pick,:] 314 | total_boxes = np.append(total_boxes, boxes, axis=0) 315 | 316 | numbox = total_boxes.shape[0] 317 | if numbox>0: 318 | pick = nms(total_boxes.copy(), 0.7, 'Union') 319 | total_boxes = total_boxes[pick,:] 320 | regw = total_boxes[:,2]-total_boxes[:,0] 321 | regh = total_boxes[:,3]-total_boxes[:,1] 322 | qq1 = total_boxes[:,0]+total_boxes[:,5]*regw 323 | qq2 = total_boxes[:,1]+total_boxes[:,6]*regh 324 | qq3 = total_boxes[:,2]+total_boxes[:,7]*regw 325 | qq4 = total_boxes[:,3]+total_boxes[:,8]*regh 326 | total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:,4]])) 327 | total_boxes = rerec(total_boxes.copy()) 328 | total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32) 329 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) 330 | 331 | numbox = total_boxes.shape[0] 332 | if numbox>0: 333 | # second stage 334 | tempimg = np.zeros((24,24,3,numbox)) 335 | for k in range(0,numbox): 336 | tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) 337 | 
tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] 338 | if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: 339 | tempimg[:,:,:,k] = imresample(tmp, (24, 24)) 340 | else: 341 | return np.empty() 342 | tempimg = (tempimg-127.5)*0.0078125 343 | tempimg1 = np.transpose(tempimg, (3,1,0,2)) 344 | out = rnet(tempimg1) 345 | out0 = np.transpose(out[0]) 346 | out1 = np.transpose(out[1]) 347 | score = out1[1,:] 348 | ipass = np.where(score>threshold[1]) 349 | total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) 350 | mv = out0[:,ipass[0]] 351 | if total_boxes.shape[0]>0: 352 | pick = nms(total_boxes, 0.7, 'Union') 353 | total_boxes = total_boxes[pick,:] 354 | total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick])) 355 | total_boxes = rerec(total_boxes.copy()) 356 | 357 | numbox = total_boxes.shape[0] 358 | if numbox>0: 359 | # third stage 360 | total_boxes = np.fix(total_boxes).astype(np.int32) 361 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) 362 | tempimg = np.zeros((48,48,3,numbox)) 363 | for k in range(0,numbox): 364 | tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) 365 | tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] 366 | if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: 367 | tempimg[:,:,:,k] = imresample(tmp, (48, 48)) 368 | else: 369 | return np.empty() 370 | tempimg = (tempimg-127.5)*0.0078125 371 | tempimg1 = np.transpose(tempimg, (3,1,0,2)) 372 | out = onet(tempimg1) 373 | out0 = np.transpose(out[0]) 374 | out1 = np.transpose(out[1]) 375 | out2 = np.transpose(out[2]) 376 | score = out2[1,:] 377 | points = out1 378 | ipass = np.where(score>threshold[2]) 379 | points = points[:,ipass[0]] 380 | total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) 381 | mv = out0[:,ipass[0]] 382 | 383 | w = total_boxes[:,2]-total_boxes[:,0]+1 384 | h = total_boxes[:,3]-total_boxes[:,1]+1 385 | points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1 386 | points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1 387 | if total_boxes.shape[0]>0: 388 | total_boxes = bbreg(total_boxes.copy(), np.transpose(mv)) 389 | pick = nms(total_boxes.copy(), 0.7, 'Min') 390 | total_boxes = total_boxes[pick,:] 391 | points = points[:,pick] 392 | 393 | return total_boxes, points 394 | 395 | 396 | def bulk_detect_face(images, detection_window_size_ratio, pnet, rnet, onet, threshold, factor): 397 | """Detects faces in a list of images 398 | images: list containing input images 399 | detection_window_size_ratio: ratio of minimum face size to smallest image dimension 400 | pnet, rnet, onet: caffemodel 401 | threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold [0-1] 402 | factor: the factor used to create a scaling pyramid of face sizes to detect in the image. 
403 | """ 404 | all_scales = [None] * len(images) 405 | images_with_boxes = [None] * len(images) 406 | 407 | for i in range(len(images)): 408 | images_with_boxes[i] = {'total_boxes': np.empty((0, 9))} 409 | 410 | # create scale pyramid 411 | for index, img in enumerate(images): 412 | all_scales[index] = [] 413 | h = img.shape[0] 414 | w = img.shape[1] 415 | minsize = int(detection_window_size_ratio * np.minimum(w, h)) 416 | factor_count = 0 417 | minl = np.amin([h, w]) 418 | if minsize <= 12: 419 | minsize = 12 420 | 421 | m = 12.0 / minsize 422 | minl = minl * m 423 | while minl >= 12: 424 | all_scales[index].append(m * np.power(factor, factor_count)) 425 | minl = minl * factor 426 | factor_count += 1 427 | 428 | # # # # # # # # # # # # # 429 | # first stage - fast proposal network (pnet) to obtain face candidates 430 | # # # # # # # # # # # # # 431 | 432 | images_obj_per_resolution = {} 433 | 434 | # TODO: use some type of rounding to number module 8 to increase probability that pyramid images will have the same resolution across input images 435 | 436 | for index, scales in enumerate(all_scales): 437 | h = images[index].shape[0] 438 | w = images[index].shape[1] 439 | 440 | for scale in scales: 441 | hs = int(np.ceil(h * scale)) 442 | ws = int(np.ceil(w * scale)) 443 | 444 | if (ws, hs) not in images_obj_per_resolution: 445 | images_obj_per_resolution[(ws, hs)] = [] 446 | 447 | im_data = imresample(images[index], (hs, ws)) 448 | im_data = (im_data - 127.5) * 0.0078125 449 | img_y = np.transpose(im_data, (1, 0, 2)) # caffe uses different dimensions ordering 450 | images_obj_per_resolution[(ws, hs)].append({'scale': scale, 'image': img_y, 'index': index}) 451 | 452 | for resolution in images_obj_per_resolution: 453 | images_per_resolution = [i['image'] for i in images_obj_per_resolution[resolution]] 454 | outs = pnet(images_per_resolution) 455 | 456 | for index in range(len(outs[0])): 457 | scale = images_obj_per_resolution[resolution][index]['scale'] 458 | image_index = images_obj_per_resolution[resolution][index]['index'] 459 | out0 = np.transpose(outs[0][index], (1, 0, 2)) 460 | out1 = np.transpose(outs[1][index], (1, 0, 2)) 461 | 462 | boxes, _ = generateBoundingBox(out1[:, :, 1].copy(), out0[:, :, :].copy(), scale, threshold[0]) 463 | 464 | # inter-scale nms 465 | pick = nms(boxes.copy(), 0.5, 'Union') 466 | if boxes.size > 0 and pick.size > 0: 467 | boxes = boxes[pick, :] 468 | images_with_boxes[image_index]['total_boxes'] = np.append(images_with_boxes[image_index]['total_boxes'], 469 | boxes, 470 | axis=0) 471 | 472 | for index, image_obj in enumerate(images_with_boxes): 473 | numbox = image_obj['total_boxes'].shape[0] 474 | if numbox > 0: 475 | h = images[index].shape[0] 476 | w = images[index].shape[1] 477 | pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Union') 478 | image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] 479 | regw = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] 480 | regh = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] 481 | qq1 = image_obj['total_boxes'][:, 0] + image_obj['total_boxes'][:, 5] * regw 482 | qq2 = image_obj['total_boxes'][:, 1] + image_obj['total_boxes'][:, 6] * regh 483 | qq3 = image_obj['total_boxes'][:, 2] + image_obj['total_boxes'][:, 7] * regw 484 | qq4 = image_obj['total_boxes'][:, 3] + image_obj['total_boxes'][:, 8] * regh 485 | image_obj['total_boxes'] = np.transpose(np.vstack([qq1, qq2, qq3, qq4, image_obj['total_boxes'][:, 4]])) 486 | image_obj['total_boxes'] = 
rerec(image_obj['total_boxes'].copy()) 487 | image_obj['total_boxes'][:, 0:4] = np.fix(image_obj['total_boxes'][:, 0:4]).astype(np.int32) 488 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h) 489 | 490 | numbox = image_obj['total_boxes'].shape[0] 491 | tempimg = np.zeros((24, 24, 3, numbox)) 492 | 493 | if numbox > 0: 494 | for k in range(0, numbox): 495 | tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) 496 | tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :] 497 | if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: 498 | tempimg[:, :, :, k] = imresample(tmp, (24, 24)) 499 | else: 500 | return np.empty() 501 | 502 | tempimg = (tempimg - 127.5) * 0.0078125 503 | image_obj['rnet_input'] = np.transpose(tempimg, (3, 1, 0, 2)) 504 | 505 | # # # # # # # # # # # # # 506 | # second stage - refinement of face candidates with rnet 507 | # # # # # # # # # # # # # 508 | 509 | bulk_rnet_input = np.empty((0, 24, 24, 3)) 510 | for index, image_obj in enumerate(images_with_boxes): 511 | if 'rnet_input' in image_obj: 512 | bulk_rnet_input = np.append(bulk_rnet_input, image_obj['rnet_input'], axis=0) 513 | 514 | out = rnet(bulk_rnet_input) 515 | out0 = np.transpose(out[0]) 516 | out1 = np.transpose(out[1]) 517 | score = out1[1, :] 518 | 519 | i = 0 520 | for index, image_obj in enumerate(images_with_boxes): 521 | if 'rnet_input' not in image_obj: 522 | continue 523 | 524 | rnet_input_count = image_obj['rnet_input'].shape[0] 525 | score_per_image = score[i:i + rnet_input_count] 526 | out0_per_image = out0[:, i:i + rnet_input_count] 527 | 528 | ipass = np.where(score_per_image > threshold[1]) 529 | image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(), 530 | np.expand_dims(score_per_image[ipass].copy(), 1)]) 531 | 532 | mv = out0_per_image[:, ipass[0]] 533 | 534 | if image_obj['total_boxes'].shape[0] > 0: 535 | h = images[index].shape[0] 536 | w = images[index].shape[1] 537 | pick = nms(image_obj['total_boxes'], 0.7, 'Union') 538 | image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] 539 | image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv[:, pick])) 540 | image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy()) 541 | 542 | numbox = image_obj['total_boxes'].shape[0] 543 | 544 | if numbox > 0: 545 | tempimg = np.zeros((48, 48, 3, numbox)) 546 | image_obj['total_boxes'] = np.fix(image_obj['total_boxes']).astype(np.int32) 547 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h) 548 | 549 | for k in range(0, numbox): 550 | tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) 551 | tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :] 552 | if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: 553 | tempimg[:, :, :, k] = imresample(tmp, (48, 48)) 554 | else: 555 | return np.empty() 556 | tempimg = (tempimg - 127.5) * 0.0078125 557 | image_obj['onet_input'] = np.transpose(tempimg, (3, 1, 0, 2)) 558 | 559 | i += rnet_input_count 560 | 561 | # # # # # # # # # # # # # 562 | # third stage - further refinement and facial landmarks positions with onet 563 | # # # # # # # # # # # # # 564 | 565 | bulk_onet_input = np.empty((0, 48, 48, 3)) 566 | for index, image_obj in enumerate(images_with_boxes): 567 | if 'onet_input' in image_obj: 568 | bulk_onet_input = np.append(bulk_onet_input, image_obj['onet_input'], axis=0) 569 | 570 | out 
= onet(bulk_onet_input) 571 | 572 | out0 = np.transpose(out[0]) 573 | out1 = np.transpose(out[1]) 574 | out2 = np.transpose(out[2]) 575 | score = out2[1, :] 576 | points = out1 577 | 578 | i = 0 579 | ret = [] 580 | for index, image_obj in enumerate(images_with_boxes): 581 | if 'onet_input' not in image_obj: 582 | ret.append(None) 583 | continue 584 | 585 | onet_input_count = image_obj['onet_input'].shape[0] 586 | 587 | out0_per_image = out0[:, i:i + onet_input_count] 588 | score_per_image = score[i:i + onet_input_count] 589 | points_per_image = points[:, i:i + onet_input_count] 590 | 591 | ipass = np.where(score_per_image > threshold[2]) 592 | points_per_image = points_per_image[:, ipass[0]] 593 | 594 | image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(), 595 | np.expand_dims(score_per_image[ipass].copy(), 1)]) 596 | mv = out0_per_image[:, ipass[0]] 597 | 598 | w = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] + 1 599 | h = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] + 1 600 | points_per_image[0:5, :] = np.tile(w, (5, 1)) * points_per_image[0:5, :] + np.tile( 601 | image_obj['total_boxes'][:, 0], (5, 1)) - 1 602 | points_per_image[5:10, :] = np.tile(h, (5, 1)) * points_per_image[5:10, :] + np.tile( 603 | image_obj['total_boxes'][:, 1], (5, 1)) - 1 604 | 605 | if image_obj['total_boxes'].shape[0] > 0: 606 | image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv)) 607 | pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Min') 608 | image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] 609 | points_per_image = points_per_image[:, pick] 610 | 611 | ret.append((image_obj['total_boxes'], points_per_image)) 612 | else: 613 | ret.append(None) 614 | 615 | i += onet_input_count 616 | 617 | return ret 618 | 619 | 620 | # function [boundingbox] = bbreg(boundingbox,reg) 621 | def bbreg(boundingbox,reg): 622 | """Calibrate bounding boxes""" 623 | if reg.shape[1]==1: 624 | reg = np.reshape(reg, (reg.shape[2], reg.shape[3])) 625 | 626 | w = boundingbox[:,2]-boundingbox[:,0]+1 627 | h = boundingbox[:,3]-boundingbox[:,1]+1 628 | b1 = boundingbox[:,0]+reg[:,0]*w 629 | b2 = boundingbox[:,1]+reg[:,1]*h 630 | b3 = boundingbox[:,2]+reg[:,2]*w 631 | b4 = boundingbox[:,3]+reg[:,3]*h 632 | boundingbox[:,0:4] = np.transpose(np.vstack([b1, b2, b3, b4 ])) 633 | return boundingbox 634 | 635 | def generateBoundingBox(imap, reg, scale, t): 636 | """Use heatmap to generate bounding boxes""" 637 | stride=2 638 | cellsize=12 639 | 640 | imap = np.transpose(imap) 641 | dx1 = np.transpose(reg[:,:,0]) 642 | dy1 = np.transpose(reg[:,:,1]) 643 | dx2 = np.transpose(reg[:,:,2]) 644 | dy2 = np.transpose(reg[:,:,3]) 645 | y, x = np.where(imap >= t) 646 | if y.shape[0]==1: 647 | dx1 = np.flipud(dx1) 648 | dy1 = np.flipud(dy1) 649 | dx2 = np.flipud(dx2) 650 | dy2 = np.flipud(dy2) 651 | score = imap[(y,x)] 652 | reg = np.transpose(np.vstack([ dx1[(y,x)], dy1[(y,x)], dx2[(y,x)], dy2[(y,x)] ])) 653 | if reg.size==0: 654 | reg = np.empty((0,3)) 655 | bb = np.transpose(np.vstack([y,x])) 656 | q1 = np.fix((stride*bb+1)/scale) 657 | q2 = np.fix((stride*bb+cellsize-1+1)/scale) 658 | boundingbox = np.hstack([q1, q2, np.expand_dims(score,1), reg]) 659 | return boundingbox, reg 660 | 661 | # function pick = nms(boxes,threshold,type) 662 | def nms(boxes, threshold, method): 663 | if boxes.size==0: 664 | return np.empty((0,3)) 665 | x1 = boxes[:,0] 666 | y1 = boxes[:,1] 667 | x2 = boxes[:,2] 668 | y2 = boxes[:,3] 669 | s = boxes[:,4] 670 | area 
= (x2-x1+1) * (y2-y1+1) 671 | I = np.argsort(s) 672 | pick = np.zeros_like(s, dtype=np.int16) 673 | counter = 0 674 | while I.size>0: 675 | i = I[-1] 676 | pick[counter] = i 677 | counter += 1 678 | idx = I[0:-1] 679 | xx1 = np.maximum(x1[i], x1[idx]) 680 | yy1 = np.maximum(y1[i], y1[idx]) 681 | xx2 = np.minimum(x2[i], x2[idx]) 682 | yy2 = np.minimum(y2[i], y2[idx]) 683 | w = np.maximum(0.0, xx2-xx1+1) 684 | h = np.maximum(0.0, yy2-yy1+1) 685 | inter = w * h 686 | if method is 'Min': 687 | o = inter / np.minimum(area[i], area[idx]) 688 | else: 689 | o = inter / (area[i] + area[idx] - inter) 690 | I = I[np.where(o<=threshold)] 691 | pick = pick[0:counter] 692 | return pick 693 | 694 | # function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h) 695 | def pad(total_boxes, w, h): 696 | """Compute the padding coordinates (pad the bounding boxes to square)""" 697 | tmpw = (total_boxes[:,2]-total_boxes[:,0]+1).astype(np.int32) 698 | tmph = (total_boxes[:,3]-total_boxes[:,1]+1).astype(np.int32) 699 | numbox = total_boxes.shape[0] 700 | 701 | dx = np.ones((numbox), dtype=np.int32) 702 | dy = np.ones((numbox), dtype=np.int32) 703 | edx = tmpw.copy().astype(np.int32) 704 | edy = tmph.copy().astype(np.int32) 705 | 706 | x = total_boxes[:,0].copy().astype(np.int32) 707 | y = total_boxes[:,1].copy().astype(np.int32) 708 | ex = total_boxes[:,2].copy().astype(np.int32) 709 | ey = total_boxes[:,3].copy().astype(np.int32) 710 | 711 | tmp = np.where(ex>w) 712 | edx.flat[tmp] = np.expand_dims(-ex[tmp]+w+tmpw[tmp],1) 713 | ex[tmp] = w 714 | 715 | tmp = np.where(ey>h) 716 | edy.flat[tmp] = np.expand_dims(-ey[tmp]+h+tmph[tmp],1) 717 | ey[tmp] = h 718 | 719 | tmp = np.where(x<1) 720 | dx.flat[tmp] = np.expand_dims(2-x[tmp],1) 721 | x[tmp] = 1 722 | 723 | tmp = np.where(y<1) 724 | dy.flat[tmp] = np.expand_dims(2-y[tmp],1) 725 | y[tmp] = 1 726 | 727 | return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph 728 | 729 | # function [bboxA] = rerec(bboxA) 730 | def rerec(bboxA): 731 | """Convert bboxA to square.""" 732 | h = bboxA[:,3]-bboxA[:,1] 733 | w = bboxA[:,2]-bboxA[:,0] 734 | l = np.maximum(w, h) 735 | bboxA[:,0] = bboxA[:,0]+w*0.5-l*0.5 736 | bboxA[:,1] = bboxA[:,1]+h*0.5-l*0.5 737 | bboxA[:,2:4] = bboxA[:,0:2] + np.transpose(np.tile(l,(2,1))) 738 | return bboxA 739 | 740 | def imresample(img, sz): 741 | im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_AREA) #@UndefinedVariable 742 | return im_data 743 | 744 | # This method is kept for debugging purpose 745 | # h=img.shape[0] 746 | # w=img.shape[1] 747 | # hs, ws = sz 748 | # dx = float(w) / ws 749 | # dy = float(h) / hs 750 | # im_data = np.zeros((hs,ws,3)) 751 | # for a1 in range(0,hs): 752 | # for a2 in range(0,ws): 753 | # for a3 in range(0,3): 754 | # im_data[a1,a2,a3] = img[int(floor(a1*dy)),int(floor(a2*dx)),a3] 755 | # return im_data 756 | 757 | -------------------------------------------------------------------------------- /digivision.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import p_part 5 | import f_part 6 | from caption_tune import modcap, face_found_cap, face_not_found_cap 7 | from gensoundgtts import generate_sound 8 | import tkinter as tk 9 | 10 | def saveface(): 11 | x1 = entry1.get() 12 | print(x1 + ' face saved') 13 | root.destroy() 14 | cv.imwrite(r"images//" + 15 | str(x1) + ".jpg", save) 16 | data = {x1: f_part.img_to_encoding( 17 | "images//" + str(x1) + ".jpg").tolist()} 18 | 
f_part.digi_db.insert_one(data) 19 | 20 | 21 | def ignoreface(): 22 | print("Not saved") 23 | root.destroy() 24 | 25 | 26 | cap = cv.VideoCapture('Sample Videos/test.mp4') 27 | 28 | while True: 29 | ret, frame = cap.read() 30 | facedetect = cv.CascadeClassifier(r'haarcascade_frontalface_default.xml') 31 | if ret: 32 | # font = cv.FONT_HERSHEY_SIMPLEX 33 | cv.imshow("Video", frame) 34 | 35 | if cv.waitKey(5) == ord('p'): 36 | 37 | cv.imwrite('./test.jpg', frame) 38 | final_caption = p_part.generate_caption( 39 | './test.jpg') # create caption 40 | final_caption = modcap(final_caption) # remove tags 41 | print(final_caption) 42 | generate_sound(final_caption) # convert to audio 43 | 44 | if cv.waitKey(5) == ord('f'): 45 | gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY) 46 | faces = facedetect.detectMultiScale(gray, 1.3, 5) 47 | cv.imwrite('./test.jpg', frame) 48 | known_detected = 0 49 | unknown_detected = 0 50 | known_face_list = [] 51 | known_face_dist = [] 52 | try: 53 | for x, y, w, h in faces: 54 | #cv2.imwrite("dset//User."+str(user)+"."+str(sample)+".jpg",gray[y:y+h,x:x+w]) 55 | save = frame[y:y+h, x:x+w] 56 | cv.imwrite('./test.jpg', save) 57 | dis, name = f_part.who_is_it('./test.jpg') 58 | print(str(dis)+","+name) 59 | if name != 'unknown': 60 | known_face_list.append(name) 61 | known_face_dist.append(dis) 62 | known_detected += 1 63 | 64 | else: 65 | unknown_detected += 1 66 | 67 | if known_detected > 0: 68 | print("known: " + str(known_detected)) 69 | for i in range(known_detected): 70 | print('i=' + str(i)) 71 | print( 72 | known_face_list[i] + " at dist of: " + str(known_face_dist[i])) 73 | temp = face_found_cap(str(known_face_list[i])) 74 | generate_sound(temp) 75 | elif unknown_detected == 1: 76 | temp = face_not_found_cap() 77 | generate_sound(temp) 78 | 79 | root = tk.Tk() 80 | 81 | large_font = ('Times New Roman', 14) 82 | 83 | canvas1 = tk.Canvas(root, width=300, height=200) 84 | canvas1.pack() 85 | label = tk.Label(root, text='Enter the Name') 86 | canvas1.create_window(140, 50, window=label) 87 | entry1Var = tk.StringVar(value='') 88 | entry1 = tk.Entry( 89 | root, textvariable=entry1Var, font=large_font) 90 | canvas1.create_window(150, 90, window=entry1) 91 | button1 = tk.Button(text='SAVE', command=saveface) 92 | button2 = tk.Button(text='IGNORE', command=ignoreface) 93 | canvas1.create_window(100, 150, window=button1) 94 | canvas1.create_window(180, 150, window=button2) 95 | 96 | root.mainloop() 97 | 98 | elif known_detected == 0 and unknown_detected == 0: 99 | print("No person found") 100 | generate_sound("No person found!") 101 | 102 | else: 103 | print("Too many unknown people") 104 | generate_sound("Too many unknown people.") 105 | except: 106 | generate_sound("No recognisable face found!") 107 | 108 | if cv.waitKey(1) & 0xFF == 27: # ASCII for Esc Key 109 | break 110 | else: 111 | break 112 | cap.release() 113 | cv.destroyAllWindows() 114 | -------------------------------------------------------------------------------- /digivision2.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import p_part 5 | import f_part 6 | from caption_tune import modcap, face_found_cap, face_not_found_cap 7 | from gensound import generate_sound 8 | import tkinter as tk 9 | from faceadd import addn,speech 10 | 11 | 12 | def saveface(): 13 | # generate_sound("Tell me the name") 14 | x1 = speech("What is this human called?") 15 | print(x1 + ' face saved') 16 | 
cv.imwrite(r"images//" + 17 | str(x1) + ".jpg", save) 18 | data = {x1: f_part.img_to_encoding( 19 | "images//" + str(x1) + ".jpg").tolist()} 20 | f_part.digi_db.insert_one(data) 21 | 22 | 23 | def ignoreface(): 24 | generate_sound("Not saved") 25 | 26 | 27 | cap = cv.VideoCapture(0) 28 | 29 | while True: 30 | ret, frame = cap.read() 31 | facedetect = cv.CascadeClassifier(r'haarcascade_frontalface_default.xml') 32 | if ret: 33 | # font = cv.FONT_HERSHEY_SIMPLEX 34 | cv.imshow("Video", frame) 35 | 36 | if cv.waitKey(1) == ord('p'): 37 | 38 | cv.imwrite('./test.jpg', frame) 39 | final_caption = p_part.generate_caption( 40 | './test.jpg') # create caption 41 | final_caption = modcap(final_caption) # remove tags 42 | print(final_caption) 43 | generate_sound(final_caption) # convert to audio 44 | 45 | if cv.waitKey(1) == ord('f'): 46 | gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY) 47 | faces = facedetect.detectMultiScale(gray, 1.3, 5) 48 | cv.imwrite('./test.jpg', frame) 49 | known_detected = 0 50 | unknown_detected = 0 51 | known_face_list = [] 52 | known_face_dist = [] 53 | try: 54 | for x, y, w, h in faces: 55 | #cv2.imwrite("dset//User."+str(user)+"."+str(sample)+".jpg",gray[y:y+h,x:x+w]) 56 | save = frame[y:y+h, x:x+w] 57 | cv.imwrite('./test.jpg', save) 58 | dis, name = f_part.who_is_it('./test.jpg') 59 | print(str(dis)+","+name) 60 | if name != 'unknown': 61 | known_face_list.append(name) 62 | known_face_dist.append(dis) 63 | known_detected += 1 64 | 65 | else: 66 | unknown_detected += 1 67 | 68 | if known_detected > 0: 69 | print("known: " + str(known_detected)) 70 | for i in range(known_detected): 71 | print('i=' + str(i)) 72 | print( 73 | known_face_list[i] + " at dist of: " + str(known_face_dist[i])) 74 | temp = face_found_cap(str(known_face_list[i])) 75 | generate_sound(temp) 76 | elif unknown_detected == 1: 77 | temp = face_not_found_cap() 78 | generate_sound(temp) 79 | generate_sound("Do you want to add this face in your database") 80 | addn(save) 81 | 82 | elif known_detected == 0 and unknown_detected == 0: 83 | print("No person found") 84 | generate_sound("No person found!") 85 | 86 | else: 87 | print("Too many people") 88 | generate_sound("Too many people.") 89 | except Exception as e: 90 | generate_sound("No recognisable face found!") 91 | print(e) 92 | 93 | if cv.waitKey(1) & 0xFF == 27: # ASCII for Esc Key 94 | break 95 | else: 96 | break 97 | cap.release() 98 | cv.destroyAllWindows() -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | # 3 | # Functions for downloading and extracting data-files from the internet. 4 | # 5 | # Implemented in Python 3.5 6 | # 7 | ######################################################################## 8 | # 9 | # This file is part of the TensorFlow Tutorials available at: 10 | # 11 | # https://github.com/Hvass-Labs/TensorFlow-Tutorials 12 | # 13 | # Published under the MIT License. See the file LICENSE for details. 
14 | # 15 | # Copyright 2016 by Magnus Erik Hvass Pedersen 16 | # 17 | ######################################################################## 18 | 19 | import sys 20 | import os 21 | import urllib.request 22 | import tarfile 23 | import zipfile 24 | 25 | ######################################################################## 26 | 27 | 28 | def _print_download_progress(count, block_size, total_size): 29 | """ 30 | Function used for printing the download progress. 31 | Used as a call-back function in maybe_download_and_extract(). 32 | """ 33 | 34 | # Percentage completion. 35 | pct_complete = float(count * block_size) / total_size 36 | 37 | # Limit it because rounding errors may cause it to exceed 100%. 38 | pct_complete = min(1.0, pct_complete) 39 | 40 | # Status-message. Note the \r which means the line should overwrite itself. 41 | msg = "\r- Download progress: {0:.1%}".format(pct_complete) 42 | 43 | # Print it. 44 | sys.stdout.write(msg) 45 | sys.stdout.flush() 46 | 47 | 48 | ######################################################################## 49 | 50 | def download(base_url, filename, download_dir): 51 | """ 52 | Download the given file if it does not already exist in the download_dir. 53 | :param base_url: The internet URL without the filename. 54 | :param filename: The filename that will be added to the base_url. 55 | :param download_dir: Local directory for storing the file. 56 | :return: Nothing. 57 | """ 58 | 59 | # Path for local file. 60 | save_path = os.path.join(download_dir, filename) 61 | 62 | # Check if the file already exists, otherwise we need to download it now. 63 | if not os.path.exists(save_path): 64 | # Check if the download directory exists, otherwise create it. 65 | if not os.path.exists(download_dir): 66 | os.makedirs(download_dir) 67 | 68 | print("Downloading", filename, "...") 69 | 70 | # Download the file from the internet. 71 | url = base_url + filename 72 | file_path, _ = urllib.request.urlretrieve(url=url, 73 | filename=save_path, 74 | reporthook=_print_download_progress) 75 | 76 | print(" Done!") 77 | 78 | 79 | def maybe_download_and_extract(url, download_dir): 80 | """ 81 | Download and extract the data if it doesn't already exist. 82 | Assumes the url is a tar-ball file. 83 | :param url: 84 | Internet URL for the tar-file to download. 85 | Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" 86 | :param download_dir: 87 | Directory where the downloaded file is saved. 88 | Example: "data/CIFAR-10/" 89 | :return: 90 | Nothing. 91 | """ 92 | 93 | # Filename for saving the file downloaded from the internet. 94 | # Use the filename from the URL and add it to the download_dir. 95 | filename = url.split('/')[-1] 96 | file_path = os.path.join(download_dir, filename) 97 | 98 | # Check if the file already exists. 99 | # If it exists then we assume it has also been extracted, 100 | # otherwise we need to download and extract it now. 101 | if not os.path.exists(file_path): 102 | # Check if the download directory exists, otherwise create it. 103 | if not os.path.exists(download_dir): 104 | os.makedirs(download_dir) 105 | 106 | # Download the file from the internet. 107 | file_path, _ = urllib.request.urlretrieve(url=url, 108 | filename=file_path, 109 | reporthook=_print_download_progress) 110 | 111 | print() 112 | print("Download finished. Extracting files.") 113 | 114 | if file_path.endswith(".zip"): 115 | # Unpack the zip-file. 
116 | zipfile.ZipFile(file=file_path, mode="r").extractall(download_dir) 117 | elif file_path.endswith((".tar.gz", ".tgz")): 118 | # Unpack the tar-ball. 119 | tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) 120 | 121 | print("Done.") 122 | else: 123 | print("Data has apparently already been downloaded and unpacked.") 124 | 125 | 126 | ######################################################################## 127 | -------------------------------------------------------------------------------- /f_part.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from PIL import Image 3 | import coco 4 | from cache import cache 5 | import cv2 as cv 6 | from gtts import gTTS 7 | import matplotlib.pyplot as plt 8 | import tensorflow as tf 9 | import numpy as np 10 | import FaceToolKit as ftk 11 | import DetectionToolKit as dtk 12 | warnings.filterwarnings("ignore") 13 | from pymongo import MongoClient 14 | 15 | 16 | MONGODB_URI = "mongodb+srv://digivision:digivision@cluster0-3yht7.mongodb.net/test?retryWrites=true" 17 | client = MongoClient(MONGODB_URI) 18 | db = client.get_database("people") 19 | digi_db = db.trusted_people 20 | 21 | 22 | 23 | verification_threshhold = 0.600 24 | image_size = 160 25 | v = ftk.Verification() 26 | # Pre-load model for Verification 27 | v.load_model("./models/20180204-160909/") 28 | v.initial_input_output_tensors() 29 | 30 | d = dtk.Detection() 31 | 32 | def img_to_encoding(img): 33 | image = plt.imread(img) 34 | aligned = d.align(image, False)[0] 35 | return v.img_to_encoding(aligned, image_size) 36 | 37 | def distance(emb1, emb2): 38 | diff = np.subtract(emb1, emb2) 39 | return np.sum(np.square(diff)) 40 | 41 | def who_is_it(image_path): 42 | 43 | # Compute the target "encoding" for the image. Use img_to_encoding() 44 | encoding = img_to_encoding(image_path) 45 | 46 | # Find the closest encoding ## 47 | 48 | # Initialize "min_dist" to a large value, say 1000 49 | min_dist = 1000 50 | # Loop over the database dictionary's names and encodings. 51 | data = digi_db.find() 52 | for i in data: 53 | if list(i.keys())[0] != '_id': 54 | name = list(i.keys())[0] 55 | else: 56 | name = list(i.keys())[1] 57 | db_enc = np.array(i[name]) 58 | 59 | # Compute L2 distance between the target "encoding" and the current "emb" from the database. (≈ 1 line) 60 | dist = distance(encoding, db_enc) 61 | 62 | # If this distance is less than the min_dist, then set min_dist to dist, and identity to name. 
(≈ 3 lines) 63 | if min_dist > dist: 64 | min_dist = dist 65 | identity = name 66 | 67 | if min_dist > verification_threshhold: 68 | return min_dist, 'unknown' 69 | # else: 70 | # print ("it's " + str(identity) + ", the distance is " + str(min_dist)) 71 | 72 | return min_dist, identity 73 | -------------------------------------------------------------------------------- /faceadd.py: -------------------------------------------------------------------------------- 1 | import speech_recognition as sr 2 | from gensound import generate_sound 3 | import f_part 4 | import cv2 as cv 5 | def speech(abc): 6 | 7 | # obtain audio from the microphone 8 | r = sr.Recognizer() 9 | with sr.Microphone() as source: 10 | print("Pls say something....") 11 | generate_sound(abc) 12 | audio = r.listen(source) 13 | 14 | # recognize speech 15 | try: 16 | print("Google Audio:" + r.recognize_google(audio)) 17 | return(r.recognize_google(audio)) 18 | # print("Sphinx:" + r.recognize_sphinx(audio)) 19 | except sr.UnknownValueError: 20 | generate_sound("Could not understand your response Speak again") 21 | speech(abc) 22 | except sr.RequestError as e: 23 | print("error: {0}".format(e)) 24 | generate_sound("Connection Error") 25 | 26 | 27 | def addn(save): 28 | ans = str(speech("Yes or No")) 29 | if(ans == "yes" or ans == "Yes"): 30 | saveface(save) 31 | elif(ans == "no" or ans == "No"): 32 | ignoreface() 33 | else: 34 | generate_sound("Could not understand your response Answer again") 35 | addn(save) 36 | 37 | def saveface(save): 38 | # generate_sound("Tell me the name") 39 | x1 = speech("Tell me the name") 40 | print(x1 + ' face saved') 41 | cv.imwrite(r"images//" + 42 | str(x1) + ".jpg", save) 43 | data = {x1: f_part.img_to_encoding( 44 | "images//" + str(x1) + ".jpg").tolist()} 45 | f_part.digi_db.insert_one(data) 46 | 47 | 48 | def ignoreface(): 49 | generate_sound("Not saved") -------------------------------------------------------------------------------- /facenet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/facenet/__init__.py -------------------------------------------------------------------------------- /facenet/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/facenet/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /facenet/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/facenet/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /facenet/__pycache__/face.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/facenet/__pycache__/face.cpython-35.pyc -------------------------------------------------------------------------------- /facenet/__pycache__/face.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/facenet/__pycache__/face.cpython-36.pyc 
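
Before the lower-level facenet/face.py wrapper below, a short note on how the recognition helpers above are used together: f_part.img_to_encoding() aligns the face via DetectionToolKit and produces an embedding with the pre-loaded verification model, the embedding is stored in the trusted_people MongoDB collection under the person's name, and f_part.who_is_it() later compares a query embedding against every stored document, returning 'unknown' when the closest distance exceeds the verification threshold. A minimal sketch of that round trip (the name and image paths are illustrative; enrolment in the application itself goes through faceadd.saveface()):

import f_part

# Enrol a face: compute its embedding and store it under a name,
# much as saveface() does after asking for the name by voice.
name = "Andrew"                                   # illustrative name
encoding = f_part.img_to_encoding("images/Andrew.jpg")
f_part.digi_db.insert_one({name: encoding.tolist()})

# Recognise a face: who_is_it() returns the distance to the closest
# stored embedding and the matching name, or 'unknown' if that
# distance is above verification_threshhold (0.600).
dist, identity = f_part.who_is_it("./test.jpg")
print(identity, "at dist of:", dist)
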
-------------------------------------------------------------------------------- /facenet/face.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from tensorflow.python.platform import gfile 4 | import tensorflow as tf 5 | import re 6 | 7 | def prewhiten(x): 8 | mean = np.mean(x) 9 | std = np.std(x) 10 | std_adj = np.maximum(std, 1.0 / np.sqrt(x.size)) 11 | y = np.multiply(np.subtract(x, mean), 1 / std_adj) 12 | return y 13 | 14 | 15 | def to_rgb(img): 16 | w, h = img.shape 17 | ret = np.empty((w, h, 3), dtype=np.uint8) 18 | ret[:, :, 0] = ret[:, :, 1] = ret[:, :, 2] = img 19 | return ret 20 | 21 | 22 | def get_model_filenames(model_dir): 23 | files = os.listdir(model_dir) 24 | meta_files = [s for s in files if s.endswith('.meta')] 25 | if len(meta_files)==0: 26 | raise ValueError('No meta file found in the model directory (%s)' % model_dir) 27 | elif len(meta_files)>1: 28 | raise ValueError('There should not be more than one meta file in the model directory (%s)' % model_dir) 29 | meta_file = meta_files[0] 30 | meta_files = [s for s in files if '.ckpt' in s] 31 | max_step = -1 32 | for f in files: 33 | step_str = re.match(r'(^model-[\w\- ]+.ckpt-(\d+))', f) 34 | if step_str is not None and len(step_str.groups())>=2: 35 | step = int(step_str.groups()[1]) 36 | if step > max_step: 37 | max_step = step 38 | ckpt_file = step_str.groups()[0] 39 | return meta_file, ckpt_file 40 | 41 | def make_image_tensor(img, image_size, do_prewhiten=True): 42 | image = np.zeros((1, image_size, image_size, 3)) 43 | if img.ndim == 2: 44 | img = to_rgb(img) 45 | if do_prewhiten: 46 | img = prewhiten(img) 47 | image[0, :, :, :] = img 48 | return image 49 | 50 | def make_images_tensor(img1,img2,image_size, do_prewhiten=True): 51 | images = np.zeros((2, image_size, image_size, 3)) 52 | for i,img in enumerate([img1,img2]): 53 | if img.ndim == 2: 54 | img = to_rgb(img) 55 | if do_prewhiten: 56 | img = prewhiten(img) 57 | images[i, :, :, :] = img 58 | return images 59 | 60 | def load_model(model,session): 61 | # Check if the model is a model directory (containing a metagraph and a checkpoint file) 62 | # or if it is a protobuf file with a frozen graph 63 | model_exp = os.path.expanduser(model) 64 | if os.path.isfile(model_exp): 65 | print('Model filename: %s' % model_exp) 66 | with gfile.FastGFile(model_exp,'rb') as f: 67 | graph_def = tf.GraphDef() 68 | graph_def.ParseFromString(f.read()) 69 | tf.import_graph_def(graph_def, name='') 70 | else: 71 | print('Model directory: %s' % model_exp) 72 | meta_file, ckpt_file = get_model_filenames(model_exp) 73 | 74 | print('Metagraph file: %s' % meta_file) 75 | print('Checkpoint file: %s' % ckpt_file) 76 | 77 | saver = tf.train.import_meta_graph(os.path.join(model_exp, meta_file)) 78 | saver.restore(session, os.path.join(model_exp, ckpt_file)) -------------------------------------------------------------------------------- /gensound.py: -------------------------------------------------------------------------------- 1 | import pyttsx3 2 | 3 | def generate_sound(text): 4 | ''' 5 | Converts text to audio and plays it. 
6 | ''' 7 | engine = pyttsx3.init() 8 | engine.say(text) 9 | engine.runAndWait() -------------------------------------------------------------------------------- /gensoundgtts.py: -------------------------------------------------------------------------------- 1 | from pygame import mixer 2 | from tempfile import TemporaryFile 3 | from gtts import gTTS 4 | 5 | 6 | def generate_sound(text): 7 | ''' 8 | Converts text to audio and plays it. 9 | ''' 10 | language = 'en' 11 | myobj = gTTS(text=text, lang=language, slow=False) 12 | # slow = False for high speed 13 | sf = TemporaryFile() 14 | myobj.write_to_fp(sf) 15 | sf.seek(0) 16 | mixer.init() 17 | mixer.music.load(sf) 18 | mixer.music.play() 19 | -------------------------------------------------------------------------------- /images/Andrew.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/images/Andrew.jpg -------------------------------------------------------------------------------- /images/Capture.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/images/Capture.JPG -------------------------------------------------------------------------------- /images/Capture1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/images/Capture1.JPG -------------------------------------------------------------------------------- /images/Capture2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/images/Capture2.JPG -------------------------------------------------------------------------------- /images/andrew.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/images/andrew.jpg -------------------------------------------------------------------------------- /models/20180204-160909/20180204-16090.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/models/20180204-160909/20180204-16090.pb -------------------------------------------------------------------------------- /models/20180204-160909/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "/Users/yipsangleung/facenet/models/20180204-160909/model-20180204-160909.ckpt-266000" 2 | all_model_checkpoint_paths: "/Users/yipsangleung/facenet/models/20180204-160909/model-20180204-160909.ckpt-264000" 3 | all_model_checkpoint_paths: "/Users/yipsangleung/facenet/models/20180204-160909/model-20180204-160909.ckpt-265000" 4 | all_model_checkpoint_paths: "/Users/yipsangleung/facenet/models/20180204-160909/model-20180204-160909.ckpt-266000" 5 | -------------------------------------------------------------------------------- /models/20180204-160909/model-20180204-160909.ckpt-264000.data-00000-of-00001: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/models/20180204-160909/model-20180204-160909.ckpt-264000.data-00000-of-00001 -------------------------------------------------------------------------------- /models/20180204-160909/model-20180204-160909.ckpt-264000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/models/20180204-160909/model-20180204-160909.ckpt-264000.index -------------------------------------------------------------------------------- /models/20180204-160909/model-20180204-160909.ckpt-265000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/models/20180204-160909/model-20180204-160909.ckpt-265000.data-00000-of-00001 -------------------------------------------------------------------------------- /models/20180204-160909/model-20180204-160909.ckpt-265000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/models/20180204-160909/model-20180204-160909.ckpt-265000.index -------------------------------------------------------------------------------- /models/20180204-160909/model-20180204-160909.ckpt-266000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/models/20180204-160909/model-20180204-160909.ckpt-266000.data-00000-of-00001 -------------------------------------------------------------------------------- /models/20180204-160909/model-20180204-160909.ckpt-266000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/models/20180204-160909/model-20180204-160909.ckpt-266000.index -------------------------------------------------------------------------------- /models/20180204-160909/model-20180204-160909.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fazeVaib/DigiVision/ca8d4d08552bf9b36755f11be7393428ee83a3ea/models/20180204-160909/model-20180204-160909.meta -------------------------------------------------------------------------------- /p_part.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import tensorflow as tf 3 | import numpy as np 4 | import sys 5 | import os 6 | from PIL import Image 7 | import coco 8 | from cache import cache 9 | import cv2 as cv 10 | from gtts import gTTS 11 | from datetime import datetime, timedelta 12 | from tensorflow.python.keras import backend as K 13 | from tensorflow.python.keras.models import Model 14 | from tensorflow.python.keras.layers import GRU, Embedding, Dense, Input 15 | from tensorflow.python.keras.applications import VGG16 16 | from tensorflow.python.keras.optimizers import RMSprop 17 | from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard 18 | from tensorflow.python.keras.preprocessing.text import Tokenizer 19 | from tensorflow.python.keras.preprocessing.sequence import pad_sequences 20 | 21 | 22 | def load_image(path, size=None): 23 | 
""" 24 | Load the image from the given file-path and resize it 25 | to the given size if not None. 26 | """ 27 | 28 | img = Image.open(path) # loading image using PIL 29 | 30 | if not size is None: 31 | img = img.resize(size=size, resample=Image.LANCZOS) 32 | 33 | img = np.array(img) # img to numpy array 34 | 35 | img = img/255.0 # scaling them so they fall between 0 and 1 36 | 37 | # Convert 2-dim gray-scale array to 3-dim RGB array. 38 | if len(img.shape) == 2: 39 | img = np.repeat(img[:, :, np.newaxis], 3, axis=2) 40 | 41 | return img 42 | 43 | 44 | def show_image(idx, train): 45 | """ 46 | Load and plot an image from the training- or validation-set 47 | with the given index. 48 | """ 49 | 50 | if train: # uses image from training set 51 | dir = coco.train_dir 52 | filename = filenames_train[idx] 53 | captions = captions_train[idx] 54 | 55 | else: # uses image from validation set 56 | dir = coco.val_dir 57 | filename = filenames_val[idx] 58 | captions = captions_val[idx] 59 | 60 | # path for the image file 61 | path = os.path.join(dir, filename) 62 | 63 | # printing the captions for this image 64 | for caption in captions: 65 | print(caption) 66 | 67 | # load the image & plot it 68 | image = load_image(path) 69 | plt.imshow(image) 70 | plt.show() 71 | 72 | 73 | def generate_caption(image_path, max_tokens=30): 74 | """ 75 | Generate a caption for the image in the given path. 76 | The caption is limited to the given number of tokens (words). 77 | """ 78 | 79 | # Load and resize the image. 80 | image = load_image(image_path, size=img_size) 81 | 82 | # Expand the 3-dim numpy array to 4-dim 83 | # because the image-model expects a whole batch as input, 84 | # so we give it a batch with just one image. 85 | image_batch = np.expand_dims(image, axis=0) 86 | 87 | transfer_values = image_model_transfer.predict(image_batch) 88 | 89 | shape = (1, max_tokens) 90 | decoder_input_data = np.zeros(shape=shape, dtype=np.int) 91 | 92 | token_int = token_start 93 | 94 | output_text = '' 95 | 96 | count_tokens = 0 97 | 98 | while token_int != token_end and count_tokens < max_tokens: 99 | 100 | decoder_input_data[0, count_tokens] = token_int 101 | 102 | x_data = { 103 | 'transfer_values_input': transfer_values, 104 | 'decoder_input': decoder_input_data 105 | } 106 | 107 | # Input this data to the decoder and get the predicted output. 108 | decoder_output = decoder_model.predict(x_data) 109 | 110 | token_onehot = decoder_output[0, count_tokens, :] 111 | 112 | token_int = np.argmax(token_onehot) 113 | 114 | sampled_word = tokenizer.token_to_word(token_int) 115 | 116 | output_text += " " + sampled_word 117 | 118 | # Increment the token-counter. 119 | count_tokens += 1 120 | 121 | output_tokens = decoder_input_data[0] 122 | return output_text 123 | 124 | 125 | def print_progress(count, max_count): 126 | # Percentage Completion 127 | pct_complete = count/max_count 128 | 129 | # Status-message. Note the \r which means the line should overwrite itself 130 | msg = '\r- Progress: {0:.1%}'.format(pct_complete) 131 | 132 | sys.stdout.write(msg) 133 | sys.stdout.flush() 134 | 135 | 136 | def process_images(data_dir, filenames, batch_size=32): 137 | """ 138 | Process all the given files in the given data_dir using the 139 | pre-trained image-model and return their transfer-values. 140 | 141 | Note that we process the images in batches to save 142 | memory and improve efficiency on the GPU. 143 | """ 144 | 145 | # Number of images to process. 
146 | num_images = len(filenames) 147 | 148 | # Pre-allocate input-batch-array for images. 149 | shape = (batch_size,) + img_size + (3,) 150 | image_batch = np.zeros(shape=shape, dtype=np.float16) 151 | 152 | # Pre-allocate output-array for transfer-values. 153 | # Note that we use 16-bit floating-points to save memory. 154 | shape = (num_images, transfer_values_size) 155 | transfer_values = np.zeros(shape=shape, dtype=np.float16) 156 | 157 | # Initialize index into the filenames. 158 | start_index = 0 159 | 160 | # Process batches of image-files. 161 | while start_index < num_images: 162 | # Print the percentage-progress. 163 | print_progress(count=start_index, max_count=num_images) 164 | 165 | # End-index for this batch. 166 | end_index = start_index + batch_size 167 | 168 | # Ensure end-index is within bounds. 169 | if end_index > num_images: 170 | end_index = num_images 171 | 172 | # The last batch may have a different batch-size. 173 | current_batch_size = end_index - start_index 174 | 175 | # Load all the images in the batch. 176 | for i, filename in enumerate(filenames[start_index:end_index]): 177 | # Path for the image-file. 178 | path = os.path.join(data_dir, filename) 179 | 180 | # Load and resize the image. 181 | # This returns the image as a numpy-array. 182 | img = load_image(path, size=img_size) 183 | 184 | # Save the image for later use. 185 | image_batch[i] = img 186 | 187 | # Use the pre-trained image-model to process the image. 188 | # Note that the last batch may have a different size, 189 | # so we only use the relevant images. 190 | transfer_values_batch = image_model_transfer.predict( 191 | image_batch[0:current_batch_size]) 192 | 193 | # Save the transfer-values in the pre-allocated array. 194 | transfer_values[start_index:end_index] = transfer_values_batch[0:current_batch_size] 195 | 196 | # Increase the index for the next loop-iteration. 197 | start_index = end_index 198 | 199 | # Print newline. 200 | print() 201 | 202 | return transfer_values 203 | 204 | 205 | def process_images_train(): 206 | print( 207 | "Processing {0} images in training-set. ".format(len(filenames_train))) 208 | 209 | # path for cache file 210 | cache_path = os.path.join(coco.data_dir, "transfer_values_train.pkl") 211 | 212 | # If the cache-file already exists then reload it, 213 | # otherwise process all images and save their transfer-values 214 | # to the cache-file so it can be reloaded quickly. 215 | transfer_values = cache(cache_path=cache_path, fn=process_images, 216 | data_dir=coco.train_dir, filenames=filenames_train) 217 | return transfer_values 218 | 219 | 220 | def process_images_val(): 221 | print( 222 | "Processing {0} images in validation-set. ".format(len(filenames_val))) 223 | 224 | # path for cache file 225 | cache_path = os.path.join(coco.data_dir, "transfer_values_val.pkl") 226 | 227 | # If the cache-file already exists then reload it, 228 | # otherwise process all images and save their transfer-values 229 | # to the cache-file so it can be reloaded quickly. 
230 | transfer_values = cache(cache_path=cache_path, fn=process_images, 231 | data_dir=coco.val_dir, filenames=filenames_val) 232 | return transfer_values 233 | 234 | 235 | def mark_captions(multi_cap_list): 236 | captions_marked = [ 237 | [mark_start + caption + mark_end for caption in cap_list] 238 | for cap_list in multi_cap_list] 239 | return captions_marked 240 | 241 | 242 | def flatten(multi_cap_list): 243 | captions_list = [caption 244 | for cap_list in multi_cap_list 245 | for caption in cap_list] 246 | return captions_list 247 | 248 | 249 | class TokenizerWrap(Tokenizer): 250 | """Wrap the Tokenizer-class from Keras with more functionality.""" 251 | 252 | def __init__(self, texts, num_words=None): 253 | """ 254 | :param texts: List of strings with the data-set. 255 | :param num_words: Max number of words to use. 256 | """ 257 | 258 | Tokenizer.__init__(self, num_words=num_words) 259 | 260 | # Create the vocabulary from the texts. 261 | self.fit_on_texts(texts) 262 | 263 | # Create inverse lookup from integer-tokens to words. 264 | self.index_to_word = dict(zip(self.word_index.values(), 265 | self.word_index.keys())) 266 | 267 | def token_to_word(self, token): 268 | """Look up a single word from an integer-token.""" 269 | 270 | word = " " if token == 0 else self.index_to_word[token] 271 | return word 272 | 273 | def tokens_to_string(self, tokens): 274 | """Convert a list of integer-tokens to a string.""" 275 | 276 | # Create a list of the individual words. 277 | words = [self.index_to_word[token] 278 | for token in tokens 279 | if token != 0] 280 | 281 | # Concatenate the words to a single string 282 | # with space between all the words. 283 | text = " ".join(words) 284 | 285 | return text 286 | 287 | def captions_to_tokens(self, captions_listlist): 288 | """ 289 | Convert a list-of-list with text-captions to 290 | a list-of-list of integer-tokens. 291 | """ 292 | 293 | # Note that texts_to_sequences() takes a list of texts. 294 | tokens = [self.texts_to_sequences(captions_list) 295 | for captions_list in captions_listlist] 296 | 297 | return tokens 298 | 299 | 300 | def get_random_cap_tokens(idx): 301 | """ 302 | Given a list of indices for images in the training-set, 303 | select a token-sequence for a random caption, 304 | and return a list of all these token-sequences. 305 | """ 306 | 307 | result = [] # empty list for result 308 | 309 | # for each of the indices 310 | for i in idx: 311 | j = np.random.choice(len(tokens_train[i])) 312 | 313 | # get jth token-seq for image i 314 | tokens = tokens_train[i][j] 315 | 316 | result.append(tokens) 317 | 318 | return result 319 | 320 | 321 | def batch_generator(batch_size): 322 | """ 323 | Generator function for creating random batches of training-data. 324 | """ 325 | 326 | # Infinite loop 327 | while True: 328 | # returns list of randomly selected indices 329 | idx = np.random.randint(num_train_img, size=batch_size) 330 | 331 | # Get the pre-computed transfer-values for those images. 332 | # These are the outputs of the pre-trained image-model. 
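    # For example, with the VGG16 'fc2' layer used as the transfer-layer later in this file,
    # each row of this array is a 4096-dimensional float16 vector, one per sampled image.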
333 | transfer_values = transfer_values_train[idx] 334 | 335 | # get a random caption's token-sequence for each of the randomly chosen images 336 | tokens = get_random_cap_tokens(idx) 337 | 338 | # number of tokens in each token-sequence 339 | num_tokens = [len(t) for t in tokens] 340 | 341 | # max number of tokens 342 | max_tokens = np.max(num_tokens) 343 | 344 | # pad all token-sequences to the same length so they can be fed to the neural network 345 | tokens_padded = pad_sequences( 346 | tokens, maxlen=max_tokens, padding='post', truncating='post') 347 | 348 | # the decoder will try to map each token-sequence to itself shifted one time-step 349 | decoder_input_data = tokens_padded[:, 0:-1] 350 | decoder_output_data = tokens_padded[:, 1:] 351 | 352 | # dict for the input data; since there are several inputs, a named dict ensures the data is assigned correctly 353 | x_data = { 354 | 'decoder_input': decoder_input_data, 355 | 'transfer_values_input': transfer_values 356 | } 357 | 358 | # dict for the output data 359 | y_data = { 360 | 'decoder_output': decoder_output_data 361 | } 362 | 363 | yield (x_data, y_data) 364 | 365 | 366 | def connect_decoder(transfer_values): 367 | # Map the transfer-values so the dimensionality matches the internal state of the GRU layers. This means 368 | # we can use the mapped transfer-values as the initial state of the GRU layers. 369 | 370 | initial_state = decoder_transfer_map(transfer_values) 371 | 372 | # start the decoder network with the input layer 373 | net = decoder_input 374 | 375 | # connect the embedding layer 376 | net = decoder_embedding(net) 377 | 378 | # connect all GRU layers 379 | net = decoder_gru1(net, initial_state=initial_state) 380 | net = decoder_gru2(net, initial_state=initial_state) 381 | net = decoder_gru3(net, initial_state=initial_state) 382 | 383 | # connect the final dense layer that outputs logits over the vocabulary 384 | decoder_output = decoder_dense(net) 385 | 386 | return decoder_output 387 | 388 | 389 | def sparse_cross_entropy(y_true, y_pred): 390 | """ 391 | Calculate the cross-entropy loss between y_true and y_pred. 392 | 393 | y_true is a 2-rank tensor with the desired output. 394 | The shape is [batch_size, sequence_length] and it 395 | contains sequences of integer-tokens. 396 | 397 | y_pred is the decoder's output which is a 3-rank tensor 398 | with shape [batch_size, sequence_length, num_words] 399 | so that for each sequence in the batch there is a one-hot 400 | encoded array of length num_words. 401 | """ 402 | 403 | # Calculate the loss. This outputs a 2-rank tensor of shape [batch_size, seq_length] 404 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 405 | labels=y_true, logits=y_pred) 406 | 407 | # Keras may reduce this loss across the first axis (the batch), but the semantics are unclear, 408 | # so to be sure we reduce the entire 2-rank tensor to a single scalar with the mean function. 
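    # For example (illustrative shapes, assuming the batch_size of 256 set below and a
    # padded sequence length of 20): `loss` is then a [256, 19] tensor, since the input and
    # output sequences are shifted by one step, and taking the mean collapses it to one scalar.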
409 | loss_mean = tf.reduce_mean(loss) 410 | 411 | return loss_mean 412 | 413 | # ENTER YOUR CUSTOM PATH WHERE COCO DATASET IS STORED 414 | coco.set_data_dir("/mnt/MyDrive/Datasets/image-cap/data/coco") 415 | 416 | _, filenames_train, captions_train = coco.load_records(train=True) 417 | 418 | num_train_img = len(filenames_train) 419 | 420 | _, filenames_val, captions_val = coco.load_records(train=False) 421 | 422 | num_val_img = len(filenames_val) 423 | 424 | image_model = VGG16(include_top=True, weights='imagenet') 425 | 426 | transfer_layer = image_model.get_layer('fc2') 427 | 428 | image_model_transfer = Model( 429 | inputs=image_model.input, outputs=transfer_layer.output) 430 | 431 | img_size = K.int_shape(image_model.input)[1:3] 432 | # print(img_size) 433 | 434 | transfer_values_size = K.int_shape(transfer_layer.output)[1] 435 | 436 | transfer_values_train = process_images_train() 437 | 438 | transfer_values_val = process_images_val() 439 | 440 | mark_start = 'ssss ' 441 | mark_end = ' eeee' 442 | 443 | captions_train_marked = mark_captions(captions_train) 444 | 445 | captions_train_flat = flatten(captions_train_marked) 446 | 447 | num_words = 10000 448 | 449 | tokenizer = TokenizerWrap(texts=captions_train_flat, num_words=num_words) 450 | 451 | token_start = tokenizer.word_index[mark_start.strip()] 452 | 453 | token_end = tokenizer.word_index[mark_end.strip()] 454 | 455 | tokens_train = tokenizer.captions_to_tokens(captions_train_marked) 456 | 457 | batch_size = 256 458 | 459 | generator = batch_generator(batch_size=batch_size) 460 | 461 | batch = next(generator) 462 | batch_x = batch[0] 463 | batch_y = batch[1] 464 | 465 | num_cap_train = [len(cap) for cap in captions_train] 466 | 467 | total_num_cap_train = np.sum(num_cap_train) 468 | 469 | steps_per_epoch = int(total_num_cap_train / batch_size) 470 | 471 | state_size = 512 472 | 473 | embedding_size = 128 474 | 475 | transfer_values_input = Input( 476 | shape=(transfer_values_size,), name='transfer_values_input') 477 | 478 | decoder_transfer_map = Dense( 479 | state_size, activation='tanh', name='decoder_transfer_map') 480 | 481 | decoder_input = Input(shape=(None,), name='decoder_input') 482 | 483 | decoder_embedding = Embedding( 484 | input_dim=num_words, output_dim=embedding_size, name='decoder_embedding') 485 | 486 | decoder_gru1 = GRU(state_size, name='decoder_gru1', return_sequences=True) 487 | decoder_gru2 = GRU(state_size, name='decoder_gru2', return_sequences=True) 488 | decoder_gru3 = GRU(state_size, name='decoder_gru3', return_sequences=True) 489 | 490 | decoder_dense = Dense(num_words, activation='linear', name='decoder_output') 491 | 492 | decoder_output = connect_decoder(transfer_values=transfer_values_input) 493 | decoder_model = Model( 494 | inputs=[transfer_values_input, decoder_input], outputs=[decoder_output]) 495 | 496 | optimizer = RMSprop(lr=1e-3) 497 | 498 | decoder_target = tf.placeholder(dtype='int32', shape=(None, None)) 499 | 500 | decoder_model.compile( 501 | optimizer=optimizer, loss=sparse_cross_entropy, target_tensors=[decoder_target]) 502 | 503 | path_checkpoint = './IC_checkpoints.keras' 504 | callback_checkpoints = ModelCheckpoint( 505 | filepath=path_checkpoint, verbose=1, save_weights_only=True) 506 | 507 | callback_tensorboard = TensorBoard( 508 | log_dir='./IC_logs/', histogram_freq=0, write_graph=False) 509 | 510 | callbacks = [callback_checkpoints, callback_tensorboard] 511 | 512 | try: 513 | decoder_model.load_weights(path_checkpoint) 514 | except Exception as error: 515 | print('Error 
trying to load checkpoint') 516 | print(error) 517 | --------------------------------------------------------------------------------
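A minimal end-to-end usage sketch, assuming the COCO data directory and the IC_checkpoints.keras weights referenced above are in place (the image path below is a placeholder):

import p_part                            # module-level code loads VGG16, the COCO records and the decoder weights
from gensoundgtts import generate_sound  # gTTS + pygame text-to-speech helper

caption = p_part.generate_caption('./sample.jpg', max_tokens=30)  # placeholder image path
print(caption)                           # note: the raw caption may still end with the ' eeee' end-marker
generate_sound(caption)                  # speak the caption aloud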