├── Readme.md
├── Resources
│   ├── coco.names.txt
│   ├── yolov4-tiny.cfg
│   └── yolov4-tiny.weights
├── Result
│   ├── input
│   │   └── example.gif
│   └── output
│       └── example.gif
├── libraries.bat
├── main.py
└── requirements.txt

/Readme.md:
--------------------------------------------------------------------------------
# Finding objects on the video

In this project, I'll show you how to find objects in a video, highlight them, and count them. The same approach even works on reCAPTCHA-style "find all images with X" challenges.

## Features
* Works with images and video
* A camera can be connected as the video source
* Distinguishes 80 object classes
* The user can specify which objects to look for in the image

## How to install
1. Clone this repository to your computer
`git clone https://github.com/paveldat/object_detection_on_video.git`
2. Install all the requirements
`run libraries.bat` or
`pip install -r requirements.txt`
3. Run the program
`python main.py`

## Help
When you start the program, you will be prompted for the path to the video and the names of the objects you want to find and count.
To search for several object classes at once, list them separated by commas.
Possible object names:
```
'person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa',
'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard',
'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush'
```

## Result
```
Path to video (or URL): Result/input/example.mp4
What we are looking for: person, car, bus
```
![Input](https://github.com/paveldat/object_detection_on_video/blob/main/Result/input/example.gif)
![Output](https://github.com/paveldat/object_detection_on_video/blob/main/Result/output/example.gif)
--------------------------------------------------------------------------------
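Note on the camera feature: OpenCV's `cv2.VideoCapture` accepts a device index in place of a file path or URL, so the first connected webcam can serve as the video source. A minimal sketch (the index `0` and the window handling are assumptions, not code from this repo):

```python
import cv2

# 0 = first connected camera; a file path or stream URL works the same way.
capture = cv2.VideoCapture(0)
while capture.isOpened():
    ret, frame = capture.read()
    if not ret:
        break
    cv2.imshow("Camera", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):  # press 'q' to quit
        break
capture.release()
cv2.destroyAllWindows()
```

To use a camera with `main.py` itself, the index would have to be passed to `cv2.VideoCapture` as an integer rather than the string returned by `input()`.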
/Resources/coco.names.txt:
--------------------------------------------------------------------------------
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
--------------------------------------------------------------------------------
/Resources/yolov4-tiny.cfg:
--------------------------------------------------------------------------------
[net]
# Testing
#batch=1
#subdivisions=1
# Training
batch=64
subdivisions=1
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.00261
burn_in=1000

max_batches = 2000200
policy=steps
steps=1600000,1800000
scales=.1,.1


#weights_reject_freq=1001
#ema_alpha=0.9998
#equidistant_point=1000
#num_sigmas_reject_badlabels=3
#badlabels_rejection_percentage=0.2


[convolutional]
batch_normalize=1
filters=32
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[route]
layers=-1
groups=2
group_id=1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[route]
layers = -1,-2

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[route]
layers = -6,-1

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[route]
layers=-1
groups=2
group_id=1

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[route]
layers = -1,-2

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[route]
layers = -6,-1

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[route]
layers=-1
groups=2
group_id=1

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[route]
layers = -1,-2

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[route]
layers = -6,-1

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

##################################

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear



[yolo]
mask = 3,4,5
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
scale_x_y = 1.05
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
ignore_thresh = .7
truth_thresh = 1
random=0
resize=1.5
nms_kind=greedynms
beta_nms=0.6
#new_coords=1
#scale_x_y = 2.0

[route]
layers = -4

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 23

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear

[yolo]
mask = 1,2,3
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
scale_x_y = 1.05
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
ignore_thresh = .7
truth_thresh = 1
random=0
resize=1.5
nms_kind=greedynms
beta_nms=0.6
#new_coords=1
#scale_x_y = 2.0
--------------------------------------------------------------------------------
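A consistency check worth knowing before editing this cfg: each `filters=255` convolution sits directly in front of a `[yolo]` head, and 255 is derived, not arbitrary. Each head predicts 3 of the 6 anchors (`mask = 3,4,5` and `mask = 1,2,3`), and every anchor carries 4 box coordinates, an objectness score, and the 80 class scores. A quick sketch of the arithmetic:

```python
classes = 80   # classes=80 in both [yolo] blocks
masks = 3      # anchors used per head (length of each mask list)
box_terms = 5  # x, y, w, h, objectness
print(masks * (classes + box_terms))  # -> 255, the filters= value before each head
```

Changing `classes` therefore also means updating both `filters=255` lines to match.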
/Resources/yolov4-tiny.weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paveldat/objects_detection_on_video/a9dc705ad00f1b2173a865a214357b48ad4119d9/Resources/yolov4-tiny.weights
--------------------------------------------------------------------------------
/Result/input/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paveldat/objects_detection_on_video/a9dc705ad00f1b2173a865a214357b48ad4119d9/Result/input/example.gif
--------------------------------------------------------------------------------
/Result/output/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paveldat/objects_detection_on_video/a9dc705ad00f1b2173a865a214357b48ad4119d9/Result/output/example.gif
--------------------------------------------------------------------------------
/libraries.bat:
--------------------------------------------------------------------------------
pip install opencv-python
pip install numpy
pip install art
--------------------------------------------------------------------------------
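`libraries.bat` only installs the Python packages. The weights file ships with the repository, but if it is ever missing it can be re-downloaded from the raw GitHub URL listed above; a minimal standard-library sketch:

```python
import urllib.request

# Raw URL copied from the repo listing above (pinned to a specific commit).
WEIGHTS_URL = ("https://raw.githubusercontent.com/paveldat/"
               "objects_detection_on_video/"
               "a9dc705ad00f1b2173a865a214357b48ad4119d9/"
               "Resources/yolov4-tiny.weights")
urllib.request.urlretrieve(WEIGHTS_URL, "Resources/yolov4-tiny.weights")
```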
/main.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np
from art import tprint


def apply_yolo_object_detection(image_to_process):
    """
    Detects objects in an image and determines their coordinates
    :param image_to_process: original image
    :return: image with marked objects and captions for them
    """

    height, width, _ = image_to_process.shape
    blob = cv2.dnn.blobFromImage(image_to_process, 1 / 255, (608, 608),
                                 (0, 0, 0), swapRB=True, crop=False)
    net.setInput(blob)
    outs = net.forward(out_layers)
    class_indexes, class_scores, boxes = ([] for i in range(3))
    objects_count = 0

    # Search for objects in the image
    for out in outs:
        for obj in out:
            scores = obj[5:]
            class_index = np.argmax(scores)
            class_score = scores[class_index]
            if class_score > 0:
                center_x = int(obj[0] * width)
                center_y = int(obj[1] * height)
                obj_width = int(obj[2] * width)
                obj_height = int(obj[3] * height)
                box = [center_x - obj_width // 2, center_y - obj_height // 2,
                       obj_width, obj_height]
                boxes.append(box)
                class_indexes.append(class_index)
                class_scores.append(float(class_score))

    # Non-maximum suppression: keep only the best box among overlapping ones
    chosen_boxes = cv2.dnn.NMSBoxes(boxes, class_scores, 0.0, 0.4)
    for box_index in chosen_boxes:
        box = boxes[box_index]
        class_index = class_indexes[box_index]

        # Draw only the objects that belong to the searched classes
        if classes[class_index] in classes_to_look_for:
            objects_count += 1
            image_to_process = draw_object_bounding_box(image_to_process,
                                                        class_index, box)

    final_image = draw_object_count(image_to_process, objects_count)
    return final_image


def draw_object_bounding_box(image_to_process, index, box):
    """
    Draws an object's bounding box with a class-name caption
    :param image_to_process: original image
    :param index: index of the object class detected by YOLO
    :param box: coordinates of the area around the object
    :return: image with marked objects
    """

    x, y, w, h = box
    start = (x, y)
    end = (x + w, y + h)
    color = (0, 255, 0)
    width = 2
    final_image = cv2.rectangle(image_to_process, start, end, color, width)

    start = (x, y - 10)
    font_size = 1
    font = cv2.FONT_HERSHEY_SIMPLEX
    width = 2
    text = classes[index]
    final_image = cv2.putText(final_image, text, start, font,
                              font_size, color, width, cv2.LINE_AA)

    return final_image


def draw_object_count(image_to_process, objects_count):
    """
    Draws the number of found objects on the image
    :param image_to_process: original image
    :param objects_count: the number of objects of the desired class
    :return: image labeled with the number of found objects
    """

    start = (10, 120)
    font_size = 1.5
    font = cv2.FONT_HERSHEY_SIMPLEX
    width = 3
    text = "Objects found: " + str(objects_count)

    # Draw the text with an outline
    # (so that it is visible whatever the lighting in the picture)
    white_color = (255, 255, 255)
    black_outline_color = (0, 0, 0)
    final_image = cv2.putText(image_to_process, text, start, font, font_size,
                              black_outline_color, width * 3, cv2.LINE_AA)
    final_image = cv2.putText(final_image, text, start, font, font_size,
                              white_color, width, cv2.LINE_AA)

    return final_image


def start_video_object_detection(video: str):
    """
    Real-time video capture and analysis
    :param video: path or URL of the video source
    """

    while True:
        try:
            # Capture the video stream
            video_camera_capture = cv2.VideoCapture(video)

            while video_camera_capture.isOpened():
                ret, frame = video_camera_capture.read()
                if not ret:
                    break

                # Run YOLO object detection on the video frame
                frame = apply_yolo_object_detection(frame)

                # Display the processed frame in a reduced-size window
                frame = cv2.resize(frame, (1920 // 2, 1080 // 2))
                cv2.imshow("Video Capture", frame)
                cv2.waitKey(1)

            video_camera_capture.release()
            cv2.destroyAllWindows()

        except KeyboardInterrupt:
            # Stop instead of restarting the stream on Ctrl+C
            break


if __name__ == '__main__':

    # Logo
    tprint("Object detection")
    tprint("by")
    tprint("paveldat")

    # Load the YOLO weights and config files and set up the network
    net = cv2.dnn.readNetFromDarknet("Resources/yolov4-tiny.cfg",
                                     "Resources/yolov4-tiny.weights")
    layer_names = net.getLayerNames()
    out_layers_indexes = net.getUnconnectedOutLayers()
    out_layers = [layer_names[index - 1] for index in out_layers_indexes]

    # Load the object classes that YOLO can detect from a file
    with open("Resources/coco.names.txt") as file:
        classes = file.read().split("\n")

    # Ask the user which classes to look for in the video
    # The possible names are listed in coco.names.txt
    video = input("Path to video (or URL): ")
    look_for = input("What we are looking for: ").split(',')

    # Strip the whitespace around each name
    classes_to_look_for = [look.strip() for look in look_for]

    start_video_object_detection(video)
--------------------------------------------------------------------------------
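`main.py` only wires the detector into a video loop, but the README also advertises image support. A minimal single-image sketch, assuming `net`, `out_layers`, `classes` and `classes_to_look_for` have been initialized exactly as in the `__main__` block above (`photo.jpg` is a hypothetical input file):

```python
import cv2

image = cv2.imread("photo.jpg")            # hypothetical input file
result = apply_yolo_object_detection(image)
cv2.imwrite("photo_detected.jpg", result)  # same image with boxes and count
```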
/requirements.txt:
--------------------------------------------------------------------------------
opencv-python
numpy
art
--------------------------------------------------------------------------------
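`requirements.txt` mirrors `libraries.bat`. A quick way to confirm the three dependencies installed correctly (a sketch; any recent versions should do):

```python
import cv2, numpy, art
print(cv2.__version__, numpy.__version__)
art.tprint("ok")
```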