├── Readme.md
├── Resources
│   ├── coco.names.txt
│   ├── yolov4-tiny.cfg
│   └── yolov4-tiny.weights
├── Result
│   ├── input
│   │   └── example.gif
│   └── output
│       └── example.gif
├── libraries.bat
├── main.py
└── requirements.txt

/Readme.md:
--------------------------------------------------------------------------------
# Finding objects on the video

In this project, I'll show you how to find objects in a video, highlight them, and count them. The same approach even works on reCAPTCHA-style "find all images with X" challenges.

## Features
* Works with images and video
* A camera can be connected as the video source
* Distinguishes 80 object classes
* The user can specify which objects to look for in the image

## How to install
1. Clone this repository to your computer
`git clone https://github.com/paveldat/object_detection_on_video.git`
2. Install all the requirements
`run libraries.bat` or
`pip install -r requirements.txt`
3. Run the program
`python main.py`

## Help
When you start the program, you will be prompted for the path to the video and the names of the objects you want to find and count.
To search for several object classes at once, list them separated by commas.
Possible object names:
```
'person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa',
'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard',
'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush'
```

## Result
```
Path to video (or URL): Result/input/example.mp4
What we are looking for: person, car, bus
```
![Input](https://github.com/paveldat/object_detection_on_video/blob/main/Result/input/example.gif)
![Output](https://github.com/paveldat/object_detection_on_video/blob/main/Result/output/example.gif)
--------------------------------------------------------------------------------
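Note on the camera feature: OpenCV's `cv2.VideoCapture` accepts a device index in place of a file path or URL, so the first connected webcam can serve as the video source. A minimal sketch (the index `0` and the window handling are assumptions, not code from this repo):

```python
import cv2

# 0 = first connected camera; a file path or stream URL works the same way.
capture = cv2.VideoCapture(0)
while capture.isOpened():
    ret, frame = capture.read()
    if not ret:
        break
    cv2.imshow("Camera", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):  # press 'q' to quit
        break
capture.release()
cv2.destroyAllWindows()
```

To use a camera with `main.py` itself, the index would have to be passed to `cv2.VideoCapture` as an integer rather than the string returned by `input()`.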
/Resources/coco.names.txt:
--------------------------------------------------------------------------------
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
--------------------------------------------------------------------------------
/Resources/yolov4-tiny.cfg:
--------------------------------------------------------------------------------
[net]
# Testing
#batch=1
#subdivisions=1
# Training
batch=64
subdivisions=1
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.00261
burn_in=1000

max_batches = 2000200
policy=steps
steps=1600000,1800000
scales=.1,.1


#weights_reject_freq=1001
#ema_alpha=0.9998
#equidistant_point=1000
#num_sigmas_reject_badlabels=3
#badlabels_rejection_percentage=0.2


[convolutional]
batch_normalize=1
filters=32
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[route]
layers=-1
groups=2
group_id=1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[route]
layers = -1,-2

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[route]
layers = -6,-1

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[route]
layers=-1
groups=2
group_id=1

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[route]
layers = -1,-2

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[route]
layers = -6,-1

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[route]
layers=-1
groups=2
group_id=1

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[route]
layers = -1,-2

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[route]
layers = -6,-1

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

##################################

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear



[yolo]
mask = 3,4,5
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
scale_x_y = 1.05
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
ignore_thresh = .7
truth_thresh = 1
random=0
resize=1.5
nms_kind=greedynms
beta_nms=0.6
#new_coords=1
#scale_x_y = 2.0

[route]
layers = -4

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 23

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear

[yolo]
mask = 1,2,3
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
scale_x_y = 1.05
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
ignore_thresh = .7
truth_thresh = 1
random=0
resize=1.5
nms_kind=greedynms
beta_nms=0.6
#new_coords=1
#scale_x_y = 2.0
--------------------------------------------------------------------------------
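A consistency check worth knowing before editing this cfg: each `filters=255` convolution sits directly in front of a `[yolo]` head, and 255 is derived, not arbitrary. Each head predicts 3 of the 6 anchors (`mask = 3,4,5` and `mask = 1,2,3`), and every anchor carries 4 box coordinates, an objectness score, and the 80 class scores. A quick sketch of the arithmetic:

```python
classes = 80   # classes=80 in both [yolo] blocks
masks = 3      # anchors used per head (length of each mask list)
box_terms = 5  # x, y, w, h, objectness
print(masks * (classes + box_terms))  # -> 255, the filters= value before each head
```

Changing `classes` therefore also means updating both `filters=255` lines to match.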
/Resources/yolov4-tiny.weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paveldat/objects_detection_on_video/a9dc705ad00f1b2173a865a214357b48ad4119d9/Resources/yolov4-tiny.weights
--------------------------------------------------------------------------------
/Result/input/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paveldat/objects_detection_on_video/a9dc705ad00f1b2173a865a214357b48ad4119d9/Result/input/example.gif
--------------------------------------------------------------------------------
/Result/output/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paveldat/objects_detection_on_video/a9dc705ad00f1b2173a865a214357b48ad4119d9/Result/output/example.gif
--------------------------------------------------------------------------------
/libraries.bat:
--------------------------------------------------------------------------------
pip install opencv-python
pip install numpy
pip install art
--------------------------------------------------------------------------------
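`libraries.bat` only installs the Python packages. The weights file ships with the repository, but if it is ever missing it can be re-downloaded from the raw GitHub URL listed above; a minimal standard-library sketch:

```python
import urllib.request

# Raw URL copied from the repo listing above (pinned to a specific commit).
WEIGHTS_URL = ("https://raw.githubusercontent.com/paveldat/"
               "objects_detection_on_video/"
               "a9dc705ad00f1b2173a865a214357b48ad4119d9/"
               "Resources/yolov4-tiny.weights")
urllib.request.urlretrieve(WEIGHTS_URL, "Resources/yolov4-tiny.weights")
```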
/main.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np
from art import tprint


def apply_yolo_object_detection(image_to_process):
    """
    Detects objects in an image and determines their coordinates
    :param image_to_process: original image
    :return: image with marked objects and captions for them
    """

    height, width, _ = image_to_process.shape
    blob = cv2.dnn.blobFromImage(image_to_process, 1 / 255, (608, 608),
                                 (0, 0, 0), swapRB=True, crop=False)
    net.setInput(blob)
    outs = net.forward(out_layers)
    class_indexes, class_scores, boxes = ([] for i in range(3))
    objects_count = 0

    # Search for objects in the image
    for out in outs:
        for obj in out:
            scores = obj[5:]
            class_index = np.argmax(scores)
            class_score = scores[class_index]
            if class_score > 0:
                center_x = int(obj[0] * width)
                center_y = int(obj[1] * height)
                obj_width = int(obj[2] * width)
                obj_height = int(obj[3] * height)
                box = [center_x - obj_width // 2, center_y - obj_height // 2,
                       obj_width, obj_height]
                boxes.append(box)
                class_indexes.append(class_index)
                class_scores.append(float(class_score))

    # Non-maximum suppression: keep only the best box among overlapping ones
    chosen_boxes = cv2.dnn.NMSBoxes(boxes, class_scores, 0.0, 0.4)
    for box_index in chosen_boxes:
        box = boxes[box_index]
        class_index = class_indexes[box_index]

        # Draw only the objects that belong to the searched classes
        if classes[class_index] in classes_to_look_for:
            objects_count += 1
            image_to_process = draw_object_bounding_box(image_to_process,
                                                        class_index, box)

    final_image = draw_object_count(image_to_process, objects_count)
    return final_image


def draw_object_bounding_box(image_to_process, index, box):
    """
    Draws an object's bounding box with a class-name caption
    :param image_to_process: original image
    :param index: index of the object class detected by YOLO
    :param box: coordinates of the area around the object
    :return: image with marked objects
    """

    x, y, w, h = box
    start = (x, y)
    end = (x + w, y + h)
    color = (0, 255, 0)
    width = 2
    final_image = cv2.rectangle(image_to_process, start, end, color, width)

    start = (x, y - 10)
    font_size = 1
    font = cv2.FONT_HERSHEY_SIMPLEX
    width = 2
    text = classes[index]
    final_image = cv2.putText(final_image, text, start, font,
                              font_size, color, width, cv2.LINE_AA)

    return final_image


def draw_object_count(image_to_process, objects_count):
    """
    Draws the number of found objects on the image
    :param image_to_process: original image
    :param objects_count: the number of objects of the desired class
    :return: image labeled with the number of found objects
    """

    start = (10, 120)
    font_size = 1.5
    font = cv2.FONT_HERSHEY_SIMPLEX
    width = 3
    text = "Objects found: " + str(objects_count)

    # Draw the text with an outline
    # (so that it is visible whatever the lighting in the picture)
    white_color = (255, 255, 255)
    black_outline_color = (0, 0, 0)
    final_image = cv2.putText(image_to_process, text, start, font, font_size,
                              black_outline_color, width * 3, cv2.LINE_AA)
    final_image = cv2.putText(final_image, text, start, font, font_size,
                              white_color, width, cv2.LINE_AA)

    return final_image


def start_video_object_detection(video: str):
    """
    Real-time video capture and analysis
    :param video: path or URL of the video source
    """

    while True:
        try:
            # Capture the video stream
            video_camera_capture = cv2.VideoCapture(video)

            while video_camera_capture.isOpened():
                ret, frame = video_camera_capture.read()
                if not ret:
                    break

                # Run YOLO object detection on the video frame
                frame = apply_yolo_object_detection(frame)

                # Display the processed frame in a reduced-size window
                frame = cv2.resize(frame, (1920 // 2, 1080 // 2))
                cv2.imshow("Video Capture", frame)
                cv2.waitKey(1)

            video_camera_capture.release()
            cv2.destroyAllWindows()

        except KeyboardInterrupt:
            # Stop instead of restarting the stream on Ctrl+C
            break


if __name__ == '__main__':

    # Logo
    tprint("Object detection")
    tprint("by")
    tprint("paveldat")

    # Load the YOLO weights and config files and set up the network
    net = cv2.dnn.readNetFromDarknet("Resources/yolov4-tiny.cfg",
                                     "Resources/yolov4-tiny.weights")
    layer_names = net.getLayerNames()
    out_layers_indexes = net.getUnconnectedOutLayers()
    out_layers = [layer_names[index - 1] for index in out_layers_indexes]

    # Load the object classes that YOLO can detect from a file
    with open("Resources/coco.names.txt") as file:
        classes = file.read().split("\n")

    # Ask the user which classes to look for in the video
    # The possible names are listed in coco.names.txt
    video = input("Path to video (or URL): ")
    look_for = input("What we are looking for: ").split(',')

    # Strip the whitespace around each name
    classes_to_look_for = [look.strip() for look in look_for]

    start_video_object_detection(video)
--------------------------------------------------------------------------------
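`main.py` only wires the detector into a video loop, but the README also advertises image support. A minimal single-image sketch, assuming `net`, `out_layers`, `classes` and `classes_to_look_for` have been initialized exactly as in the `__main__` block above (`photo.jpg` is a hypothetical input file):

```python
import cv2

image = cv2.imread("photo.jpg")            # hypothetical input file
result = apply_yolo_object_detection(image)
cv2.imwrite("photo_detected.jpg", result)  # same image with boxes and count
```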
/requirements.txt:
--------------------------------------------------------------------------------
opencv-python
numpy
art
--------------------------------------------------------------------------------
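`requirements.txt` mirrors `libraries.bat`. A quick way to confirm the three dependencies installed correctly (a sketch; any recent versions should do):

```python
import cv2, numpy, art
print(cv2.__version__, numpy.__version__)
art.tprint("ok")
```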