├── LICENSE
├── README.md
├── classify_real_time.py
└── example.png

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Lucas Gago

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Classify real-time desktop and speech

[image1]: ./example.png "example"

Overview
---
Team DeepThings (Mez Gebre and I) won the Best Product Category at the Deep Learning Hackathon in San Francisco. In three days we built a real-time system that identifies objects through the camera and speaks what it sees, intended as a useful tool for the visually impaired, since it could make navigation easier. The proof of concept ran on a laptop; the final model ran on Android.

This repository contains the first prototype, which targets Windows.

The goals / steps of this project are the following:
---

* Capture the webcam feed without bottlenecks.
* Recognize images using Inception v3.
* Convert text to speech with the Google TTS API.
* Build a functional model.
* Tune the parameters.
* Display the results visually.

Dependencies
---
This module requires:

* [Python 3.6.1](https://www.python.org/)
* [TensorFlow-GPU 1.0](https://www.tensorflow.org/install/install_windows#requirements_to_run_tensorflow_with_gpu_support)
* [OpenCV 3.2](http://opencv.org/)
* [NumPy 1.12](http://www.numpy.org/)
* [gTTS 1.1](https://pypi.python.org/pypi/gTTS)
* [pygame 1.9](http://www.pygame.org/news)

Usage
---
Just run:

``python classify_real_time.py``

The output should look like this:

![alt text][image1]

More details
---
For more information, see my Medium post [here](https://medium.com/@lucasgago/real-time-image-recognition-and-speech-5545f267f7b3).

License
---
This project is Copyright © 2016-2017 Lucas Gago. It is free software, and may be redistributed under the terms specified in the [MIT License](https://opensource.org/licenses/MIT).
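
Trying the speech module on its own
---
To check that gTTS and pygame work before running the full pipeline, the minimal sketch below reproduces the script's speech path in isolation. The phrase and the ``laptop.mp3`` cache filename are placeholders for illustration, not part of the project.

```python
import os

import pygame
from gtts import gTTS

pygame.mixer.init()

phrase = "I see a laptop"  # the real script builds this from the top prediction
name = "laptop.mp3"        # placeholder cache filename

# Only ask Google for the audio if it is not already cached on disk
if not os.path.isfile(name):
    gTTS(text=phrase, lang='en').save(name)

pygame.mixer.music.load(name)
pygame.mixer.music.play()

# Block until playback finishes so the script doesn't exit early
while pygame.mixer.music.get_busy():
    pygame.time.Clock().tick(10)
```

As in ``classify_real_time.py``, each generated mp3 is cached, so repeated sightings of the same object don't trigger a new network request.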

--------------------------------------------------------------------------------
/classify_real_time.py:
--------------------------------------------------------------------------------
import os
import re
import sys
import tarfile
import time
from threading import Thread

import cv2
import numpy as np
import pygame
import tensorflow as tf
from gtts import gTTS
from six.moves import urllib

model_dir = '/tmp/imagenet'
DATA_URL = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'


# Threaded frame grabber: reading the camera on a background thread
# keeps the main classification loop from blocking on I/O
class VideoStream:
    def __init__(self, src=0):
        self.stream = cv2.VideoCapture(src)
        (self.grabbed, self.frame) = self.stream.read()
        self.stopped = False

    def start(self):
        Thread(target=self.update, args=()).start()
        return self

    def update(self):
        while True:
            if self.stopped:
                return

            (self.grabbed, self.frame) = self.stream.read()

    def read(self):
        # Return the latest frame
        return self.frame

    def stop(self):
        self.stopped = True


class NodeLookup(object):
    """Converts integer node IDs to human-readable labels."""

    def __init__(self,
                 label_lookup_path=None,
                 uid_lookup_path=None):
        if not label_lookup_path:
            label_lookup_path = os.path.join(
                model_dir, 'imagenet_2012_challenge_label_map_proto.pbtxt')
        if not uid_lookup_path:
            uid_lookup_path = os.path.join(
                model_dir, 'imagenet_synset_to_human_label_map.txt')
        self.node_lookup = self.load(label_lookup_path, uid_lookup_path)

    def load(self, label_lookup_path, uid_lookup_path):
        if not tf.gfile.Exists(uid_lookup_path):
            tf.logging.fatal('File does not exist %s', uid_lookup_path)
        if not tf.gfile.Exists(label_lookup_path):
            tf.logging.fatal('File does not exist %s', label_lookup_path)

        # Load the mapping from string UID to human-readable string
        proto_as_ascii_lines = tf.gfile.GFile(uid_lookup_path).readlines()
        uid_to_human = {}
        p = re.compile(r'[n\d]*[ \S,]*')
        for line in proto_as_ascii_lines:
            parsed_items = p.findall(line)
            uid = parsed_items[0]
            human_string = parsed_items[2]
            uid_to_human[uid] = human_string

        # Load the mapping from string UID to integer node ID
        node_id_to_uid = {}
        proto_as_ascii = tf.gfile.GFile(label_lookup_path).readlines()
        for line in proto_as_ascii:
            if line.startswith('  target_class:'):
                target_class = int(line.split(': ')[1])
            if line.startswith('  target_class_string:'):
                target_class_string = line.split(': ')[1]
                node_id_to_uid[target_class] = target_class_string[1:-2]

        # Build the final mapping of integer node ID to human-readable string
        node_id_to_name = {}
        for key, val in node_id_to_uid.items():
            if val not in uid_to_human:
                tf.logging.fatal('Failed to locate: %s', val)
            name = uid_to_human[val]
            node_id_to_name[key] = name

        return node_id_to_name

    def id_to_string(self, node_id):
        if node_id not in self.node_lookup:
            return ''
        return self.node_lookup[node_id]


def create_graph():
    # Creates the graph from the saved graph_def.pb
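    # NOTE: the pre-trained Inception v3 model ships as a serialized GraphDef
    # protobuf; parsing it and importing it into the default graph exposes its
    # tensors by name, which is how the session later reaches 'softmax:0' and
    # 'DecodeJpeg/contents:0'.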
    with tf.gfile.FastGFile(os.path.join(
            model_dir, 'classify_image_graph_def.pb'), 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')


def maybe_download_and_extract():
    # Download and extract the model tar file
    dest_directory = model_dir
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):
        def _progress(count, block_size, total_size):
            percent = float(count * block_size) / float(total_size) * 100.0
            sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, percent))
            sys.stdout.flush()
        filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
        print()
        statinfo = os.stat(filepath)
        print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)


# Download the model and create the graph
maybe_download_and_extract()
create_graph()

# Variable declarations
frame_count = 0
score = 0
start = time.time()
pygame.mixer.init()
pred = 0
last = 0
human_string = None

# Initialize the threaded video stream
vs = VideoStream(src=0).start()

# Start the TensorFlow session
with tf.Session() as sess:
    softmax_tensor = sess.graph.get_tensor_by_name('softmax:0')

    # Build the label lookup once, outside the loop, instead of
    # re-reading the mapping files on every classified frame
    node_lookup = NodeLookup()

    while True:
        frame = vs.read()
        frame_count += 1

        # Only run the classifier every 5 frames
        if frame_count % 5 == 0:

            # Save the frame to disk: the first layer of Inception is a
            # DecodeJpeg op, so the network is fed encoded JPEG bytes
            cv2.imwrite("current_frame.jpg", frame)

            image_data = tf.gfile.FastGFile("./current_frame.jpg", 'rb').read()
            predictions = sess.run(softmax_tensor,
                                   {'DecodeJpeg/contents:0': image_data})
            predictions = np.squeeze(predictions)

            # Change n_pred for more predictions
            n_pred = 1
            top_k = predictions.argsort()[-n_pred:][::-1]
            for node_id in top_k:
                human_string_n = node_lookup.id_to_string(node_id)
                score = predictions[node_id]
                if score > .5:
                    # Some manual corrections for frequent
                    # misclassifications (kind of cheating)
                    if human_string_n == "stethoscope":
                        human_string_n = "Headphones"
                    if human_string_n == "spatula":
                        human_string_n = "fork"
                    if human_string_n == "iPod":
                        human_string_n = "iPhone"
                    human_string = human_string_n

                    # Keep at most two words for display and the first
                    # word for the mp3 cache filename
                    lst = human_string.split()
                    human_string = " ".join(lst[0:2])
                    human_string_filename = str(lst[0])

            current = time.time()
            fps = frame_count / (current - start)

            # Speech module
            if (last > 40 and not pygame.mixer.music.get_busy()
                    and human_string == human_string_n):
                pred += 1
                name = human_string_filename + ".mp3"

                # Only fetch the audio from Google if we don't have it cached
                if not os.path.isfile(name):
                    tts = gTTS(text="I see a " + human_string, lang='en')
                    tts.save(name)

                last = 0
                pygame.mixer.music.load(name)
                pygame.mixer.music.play()

        # Show the last prediction for some time after it was made
        if last < 40 and frame_count > 10 and human_string is not None:
            cv2.putText(frame, human_string, (20, 400),
                        cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255))
            cv2.putText(frame, str(np.round(score * 100, 2)) + "%",
                        (20, 440), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255))
        if frame_count > 20:
            cv2.putText(frame, "fps: " + str(np.round(fps, 2)),
                        (460, 460), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255))

        cv2.imshow("Frame", frame)
        last += 1

        # If the 'q' key is pressed, stop the loop
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    # Clean up: stop the capture thread and close the display window.
    # The with-block already closes the TensorFlow session on exit.
    vs.stop()
    cv2.destroyAllWindows()
    print("Done")

--------------------------------------------------------------------------------
/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gagolucasm/Classify-Real-Time-Desktop/fd3d3925a72af7cc19d9045e2b797c92f2e6436f/example.png
--------------------------------------------------------------------------------