├── .gitattributes ├── .gitignore ├── README.md ├── YOLOv4 ├── app.py ├── camera.py ├── cfg │ ├── tiny-yolo-voc.cfg │ ├── yolo-voc.cfg │ ├── yolo.cfg │ ├── yolov3-spp.cfg │ ├── yolov3-tiny.cfg │ ├── yolov3.cfg │ └── yolov4.cfg ├── darknet.py ├── data │ ├── coco.names │ └── voc.names ├── object_detection.py ├── pallete ├── requirements.txt ├── templates │ ├── 12.jpg │ ├── base.html │ └── index.html └── tool │ ├── config.py │ ├── region_loss.py │ ├── torch_utils.py │ ├── utils.py │ └── yolo_layer.py ├── app.py ├── bbox.py ├── camera.py ├── cfg ├── tiny-yolo-voc.cfg ├── yolo-voc.cfg ├── yolo.cfg ├── yolov3-spp.cfg ├── yolov3-tiny.cfg └── yolov3.cfg ├── darknet.py ├── data ├── coco.names └── voc.names ├── object_detection.py ├── pallete ├── preprocess.py ├── requirements.txt ├── templates ├── 12.jpg ├── base.html └── index.html ├── util.py └── utils ├── app_utils.py └── objDet_utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.weights filter=lfs diff=lfs merge=lfs -text 2 | *.mp4 filter=lfs diff=lfs merge=lfs -text 3 | *.avi filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Object Detection and Distance Measurement
2 | 
3 | [![N|Solid](http://muizzer07.pythonanywhere.com/media/files/YOLO-m-ram-copy_RQByeS4.jpg)](https://pjreddie.com/darknet/yolo/?style=centerme)
4 | 
5 | 
6 | ## Introduction
7 | This repo contains object_detection.py, which can perform the following tasks -
8 | - Object detection from a live video frame, in any video file, or in an image
9 | - Counting the number of objects in a frame
10 | - Measuring the distance of an object using depth information
11 | - Inference on multiple camera feeds at a time
12 | 
13 | For object detection, YOLO-V3 has been used, which can detect 80 different objects. Some of those are -
14 | - person
15 | - car
16 | - bus
17 | - stop sign
18 | - bench
19 | - dog
20 | - bear
21 | - backpack, and so on.
22 | 
23 | ### User Instruction
24 | 
25 | ## Update
26 | 
27 | **There is a new update with the [yolov4 new release](https://github.com/Tianxiaomo/pytorch-YOLOv4). All you have to do is one simple step: after downloading the project, run the following command and follow the rest of the process as it is.**
28 | 
29 | ```
30 | cd YOLOv4
31 | ```
32 | 
33 | You can also use Yolact++ as an object detector using [this repo](https://github.com/paul-pias/Social-Distance-Monitoring).
34 | 
35 | 
36 | To execute object_detection.py, you need Python version > 3.5 (depending on whether you are using a GPU or not) and have to install the following libraries.
37 | 
38 | ### Installation
39 | ``` python
40 | $ pip install -r requirements.txt
41 | or
42 | $ pip install opencv-python
43 | $ pip install numpy
44 | $ pip install pandas
45 | $ pip install matplotlib
46 | $ pip install Pillow
47 | $ pip install imutils
48 | ```
49 | 
50 | 
51 | #### For the installation of torch using "pip"
52 | ``` python
53 | $ pip3 install torch===1.2.0 torchvision===0.4.0 -f https://download.pytorch.org/whl/torch_stable.html
54 | ```
55 | or follow the instructions from [PyTorch](https://pytorch.org/)
56 | #### For installing "win32com.client", which is the Text-to-Speech module for Windows, follow these steps
57 | First, open cmd as an administrator, then run
58 | ``` python
59 | $ python -m pip install pywin32
60 | # After installing, open your Python shell and run
61 | import win32com.client
62 | speaker = win32com.client.Dispatch("SAPI.SpVoice")
63 | speaker.Speak("Good Morning")
64 | ```
65 | 
66 | You need to clone the repository using git bash (if git bash is already installed), or you can download the zip file.
67 | ``` python
68 | $ git clone https://github.com/paul-pias/Object-Detection-and-Distance-Measurement.git
69 | ```
70 | 
71 | After unzipping the project, there are two ways to run it. If you want to see the output in your browser, execute the "app.py" script; otherwise run "object_detection.py" to execute it locally.
72 | 
73 | 
74 | If you want to run object detection and distance measurement on a video file, assign the name of the video file to the variable id in either "app.py" or "object_detection.py"; if you want to run it on your webcam, just put 0 in id.
75 | 
76 | However, if you want to run the inference on the feed of an IP camera, use the following convention while assigning it to the variable "id":
77 | ``` python
78 | "rtsp://assigned_name_of_the_camera:assigned_password@camera_ip/"
79 | ```
80 | 
81 | You can check the performance with the different YOLO weights that are available at [YOLO](https://pjreddie.com/darknet/yolo/?style=centerme)
82 | 
83 | For multiple camera support, you need to add a few lines of code as follows in app.py -
84 | 
85 | ``` python
86 | def simulate(camera):
87 |     while True:
88 |         frame = camera.main()
89 |         if frame != "":
90 |             yield (b'--frame\r\n'
91 |                    b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')
92 | 
93 | @app.route('/video_simulate')
94 | def video_simulate():
95 |     id = 0
96 |     return Response(simulate(ObjectDetection(id)), mimetype='multipart/x-mixed-replace; boundary=frame')
97 | ```
98 | 
99 | Depending on how many feeds you need, add these two methods to "app.py" with different names and add a corresponding section to index.html.
100 | 
101 | ``` html
102 | 
103 | <!-- Illustrative markup: class names, sizes and the route below are placeholders; adapt them to your own index.html -->
104 | <div class="column">
105 |     <div class="box">
106 |         <p class="title is-5">Camera - 01</p>
107 |         <hr>
108 |         <img src="{{ url_for('video_simulate') }}" width="100%">
109 |     </div>
110 | </div>
111 | <hr>
112 | ```
113 | #### Note:
114 | You have to use git-lfs to download the yolov3.weights file. However, you can also download it from here: [YOLOv3 @ Google-Drive](https://drive.google.com/drive/folders/1nN49gRqt5HvuMptfc0wRVcuLwiNmMD6u?usp=sharing) || [YOLOv4 @ Google-Drive](https://drive.google.com/file/d/1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT/view)
115 | 
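Before moving on to the theory, here is a minimal sketch of how the video source described above is selected and handed to the detector (illustrative only; the variable name id and the ObjectDetection class follow app.py and camera.py, and the file name is hypothetical):

``` python
# Illustrative sketch: choose the source that ObjectDetection will read from.
id = 0                                                                    # default webcam
# id = "my_clip.mp4"                                                      # a video file (hypothetical name)
# id = "rtsp://assigned_name_of_the_camera:assigned_password@camera_ip/"  # IP camera feed

from camera import ObjectDetection   # class defined in this repository

detector = ObjectDetection(id)
frame_bytes = detector.main()        # returns one JPEG-encoded frame as bytes
```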
116 | 
117 | #### Theory
118 | In a traditional image-classification approach to object detection, there are two well-known sets of strategies.
119 | 
120 | For a single object in an image, there are two scenarios.
121 | - Classification
122 | - Localization
123 | 
124 | For multiple objects in an image, there are two scenarios.
125 | - Object detection and localization
126 | - Object segmentation
127 | 

128 | For Single Objects 129 | 130 |

131 | 132 |

133 | For Multiple Objects 134 | 135 |

136 | 137 | ## Distance Measurement 138 |

139 | 140 |

141 |
142 | Traditionally, we measure the distance of an object using ultrasonic sensors such as the HC-SR04 or other high-frequency devices that generate sound waves and time how far they travel.
143 | However, when you are working with an embedded device and want a compact design that offers both
144 | 
145 | - Object detection (with a camera) and
146 | - Distance measurement
147 | 
148 | you don't always want to make the device heavier by adding extra hardware modules. To avoid that, you can follow a more convenient and feasible approach: since a camera is already integrated for object detection, you can reuse the information the camera provides for drawing the bounding boxes around localized objects to estimate the distance of each object from the camera.
149 | 
150 | ### How does object detection work?
151 | From the earlier part, we understood that to measure the distance of an object in an image, we first have to localize it to get the depth information.
152 | So, how does localization work?
153 | 
154 | #### Localizing objects with regression
155 | Regression is about returning a number instead of a class. The number can be represented as (x0, y0, width, height), which describes a bounding box. In the single-object images illustrated above, if you only want to classify the object type, you don't need to draw a bounding box around it; that part is simply Classification.
156 | However, if we want to know where the object is located in the image, we need the 4 numbers that a regression layer returns. As you can see, there is a black rectangular box in the image of the white dog, which was drawn using the regression layer. After the final convolutional layer and the fully connected layers, instead of only asking for class scores, a regression layer is introduced that outputs the rectangular box for each object. For every frame/image, detection then proceeds as follows.
157 | - Using a pre-trained ImageNet model for inference, the last fully connected layer is re-trained for the desired objects.
158 | - All of the region proposals (roughly 2000 proposals per image) are resized to match the input size of the CNN.
159 | - An SVM is trained to classify between object and background (one binary SVM (Support Vector Machine) for each class).
160 | - To place the bounding box precisely over the object, a linear regression model is trained, which outputs a correction factor.
161 | The problem with this approach is that one part of the network is dedicated to region proposals. After the fully connected layers, the model proposes regions of the image that may contain objects, so it also requires a high-quality classifier to filter out the valid proposals that actually contain objects. Although this method is very accurate, it comes with a large computational cost (low frame rate), and that is why it is not suitable for embedded devices such as an Arduino or a Raspberry Pi, which have little processing power.
162 | 
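For comparison only (this repository uses YOLO rather than a region-proposal network), the region-proposal style of detector described above can be tried with an off-the-shelf two-stage model from torchvision. The snippet below is a minimal sketch under that assumption, not part of this project:

``` python
import torch
import torchvision

# Faster R-CNN: a two-stage detector that internally proposes regions and then
# classifies and refines them, i.e. the approach described in this section.
# (pretrained=True works on older torchvision releases; newer ones use weights=.)
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# The model expects a list of 3xHxW float tensors scaled to [0, 1].
image = torch.rand(3, 480, 640)          # stand-in for a real frame
with torch.no_grad():
    prediction = model([image])[0]

# 'boxes' are (x1, y1, x2, y2) corners, with matching 'labels' and 'scores'.
print(prediction["boxes"].shape, prediction["labels"][:5], prediction["scores"][:5])
```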
163 | 
164 | #### Localizing with convolutional neural networks
165 | 
166 | Another way of doing object detection, and of reducing this tedious work, is to combine the previous two tasks into one network. Here, instead of proposing regions for every image, the model is fed a set of pre-defined boxes to look for objects. So, prior to the training phase of the neural network, some pre-defined rectangular boxes that represent typical objects are given to the network to train with. When an image goes through the network, after the fully connected layer the trained model tries to match the predefined boxes to the objects in that image, and a non-maxima suppression algorithm removes the duplicate matches. If a match crosses some threshold, the model draws the bounding box over the object. For example, in the case of the picture of the white dog, the model knows the coordinates of the box of the dog object, and once classification is done it uses the L2 distance to calculate the loss between the predefined box coordinates and the coordinates the model predicted, so that it can draw the bounding box precisely over the object in that image.
167 | 
168 | The main idea is to take the convolutional feature maps from the later layers of a network and run small CONV filters over these feature maps to predict class scores and bounding box offsets.
169 | Here, we reuse the computation already made during classification to localize objects by grabbing the activations from the final conv layers. At this point we still have the spatial information of the image the model was trained on, but represented in a much smaller scope: in the final layers each "pixel" represents a larger area of the input image, so we can use those cells to infer object position. The tensor containing the original image's information is quite deep, as the image has been squeezed into a lower spatial dimension. A 1x1 CONV layer can then be used to classify each cell as a class, and from the same layer we can add another CONV or FC (fully connected) layer to predict four numbers (the bounding box). In this way we get both class scores and locations from one network. This approach is known as Single Shot Detection. The overall strategy can be summarised as follows:
170 | - Train a CNN with a regression (bounding box) and a classification objective.
171 | - Gather activations from a particular layer or layers to infer classification and location with an FC layer or another CONV layer that works like an FC layer.
172 | - During prediction, use algorithms like non-maxima suppression to filter multiple boxes around the same object.
173 | - During training, use measures like IoU to relate the predictions to the ground truth.
174 | 
175 | [Yolo](https://pjreddie.com/media/files/papers/YOLOv3.pdf) follows the Single Shot Detection strategy. It uses a single activation map for the prediction of classes and bounding boxes at the same time, which is why it is called "You Only Look Once".
176 | 
177 | Here a pre-trained YOLOv3 has been used, which can detect 80 different objects. Although this model is faster, it does not always reliably predict the actual object in a given frame/image; it is a trade-off between speed and accuracy.
178 | 
179 | ### How does distance measurement work?
180 | The following formula is used to determine the distance:
181 | 
182 | ``` python
183 | distancei = (2 x 3.14 x 180) ÷ (w + h x 360) x 1000 + 3
184 | ```
185 | To measure distance, we first have to understand how a camera sees an object.
186 | 
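Before getting into the optics, the formula itself can be written as a small Python helper. This is a sketch only: w and h are assumed to be the bounding-box width and height in pixels, and the grouping follows the formula exactly as written above.

``` python
def distance_in_inches(w, h):
    """Approximate distance of a detected object, following the formula above.

    w, h: width and height (in pixels) of the object's bounding box.
    If a detector returns corners (x1, y1, x2, y2): w, h = x2 - x1, y2 - y1.
    """
    return (2 * 3.14 * 180) / (w + h * 360) * 1000 + 3
```

For example, a 100 x 200 px box gives distance_in_inches(100, 200) ≈ 18.7.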

187 | 188 |

189 | 
190 | You can relate this image to the white dog picture where the dog was localized. Again we get four numbers for the bounding box, (x0, y0, width, height). Here x0, y0 are used to place or adjust the bounding box, while width and height are the two variables used in the distance formula and for describing the detected object(s). Width and height will vary depending on the distance of the object from the camera.
191 | 
192 | As we know, an image is refracted when it passes through a lens, because the rays of light enter the lens, whereas in the case of a mirror the light is reflected, which is why a mirror gives an exact reflection of the image. With a lens, the image gets slightly stretched. The following image illustrates how the image and the corresponding angles look when light enters through a lens.
193 | 

194 | 195 |

196 | In the figure there are three variables:
197 | 
198 | - do (distance of the object from the lens)
199 | - di (distance of the refracted image from the convex lens)
200 | - f (focal length or focal distance)
201 | 
202 | The green line "do" represents the actual distance of the object from the convex lens, and "di" gives a sense of where the refracted image forms. Now consider the triangle on the object side of the image with base "do", and draw an opposite triangle similar to it on the image side, whose base is "di", with the same perpendicular height. Comparing these two triangles, "do" and "di" are parallel and the angles they create on each side are vertically opposite, so the two triangles are similar. As they are similar, the ratios of their corresponding sides are equal, so do/di = A/B. Again, if we compare the two triangles on the right side of the image, their opposite angles are equal and each has a right angle (90°) (dark blue area), so A and B are corresponding sides of similar right triangles. The resulting relation can therefore be written as:
203 | 
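A plausible reading of the relation referred to here, consistent with the similar-triangle argument above and the lens-formula proof referenced at the end, combines the two ratios:

``` latex
\frac{d_o}{d_i} = \frac{A}{B}
\qquad\text{and}\qquad
\frac{A}{B} = \frac{f}{d_i - f}
```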

204 | 205 |

206 | Now, if we work through that equation, we find the following.
207 | 
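Under that reading, equating the two expressions for A/B gives (a reconstruction following the standard lens-formula proof referenced at the end, with do, di and f as defined above):

``` latex
\frac{d_o}{d_i} = \frac{f}{d_i - f}
\quad\Longrightarrow\quad
d_o \, d_i = f \,(d_o + d_i)
```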

208 | 209 |

210 | And eventually we arrive at the following relation.
211 | 
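This is the standard thin-lens equation, the formula proved in the Khan Academy reference listed at the end, with do, di and f as defined above:

``` latex
\frac{1}{f} = \frac{1}{d_o} + \frac{1}{d_i}
```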

212 | 213 |

214 | where f is the focal length, which is treated here as an arc length and computed with the following formula.
215 | 
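The arc-length relation referred to here (see the angular-displacement reference at the end) expresses an arc s in terms of the radius r and the angle θ in degrees, which appears to be the origin of the 2 x 3.14 and 360 constants in the distance formula above:

``` latex
s = 2\pi r \cdot \frac{\theta}{360}
```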

216 | 217 |

218 | we will get our final result in "inches" from this formula of distance. 219 | 220 | ``` python 221 | distancei = (2 x 3.14 x 180) ÷ (w + h x 360) x 1000 + 3 222 | ``` 223 | 224 | * Notes - As mentioned earlier YOLO prefers performance over accuracy that's why the model predicts wrong objects frequently. 225 | 226 | ## If anyone using this code for any kind of publication, kindly cite this work. 227 | 228 | M. A. Khan, P. Paul, M. Rashid, M. Hossain and M. A. R. Ahad, "An AI-Based Visual Aid With Integrated Reading Assistant for the Completely Blind," in IEEE Transactions on Human-Machine Systems. 229 | doi: 10.1109/THMS.2020.3027534 230 | 231 | 232 | 233 | 234 | ### Reference 235 | 236 | - [Real-Time Distance Measurement Using a Modified 237 | Camera ](https://sci-hub.tw/10.1109/SAS.2010.5439423) 238 | - [Real-time Distance Measurement Using Single Image](http://emaraic.com/blog/distance-measurement) 239 | - [Object image and focal distance relationship (proof of formula)](https://www.khanacademy.org/science/physics/geometric-optics/lenses/v/object-image-and-focal-distance-relationship-proof-of-formula) 240 | - [Distance or arc length from angular displacement](https://www.khanacademy.org/science/ap-physics-1/ap-centripetal-force-and-gravitation/introduction-to-uniform-circular-motion-ap/v/distance-or-arc-length-from-angular-displacement) 241 | -------------------------------------------------------------------------------- /YOLOv4/app.py: -------------------------------------------------------------------------------- 1 | # Flask utils 2 | from flask import Flask, redirect, url_for, request, render_template, Response 3 | from werkzeug.utils import secure_filename 4 | from gevent.pywsgi import WSGIServer 5 | from camera import ObjectDetection 6 | 7 | app = Flask(__name__) 8 | @app.route("/") 9 | def main(): 10 | return render_template("index.html") 11 | 12 | def gen(camera): 13 | while True: 14 | frame = camera.main() 15 | if frame != "": 16 | yield (b'--frame\r\n' 17 | b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n') 18 | 19 | @app.route('/video_feed') 20 | def video_feed(): 21 | id = 0 22 | return Response(gen(ObjectDetection(id)), mimetype='multipart/x-mixed-replace; boundary=frame') 23 | 24 | 25 | if __name__ == '__main__': 26 | # Serve the app with gevent 27 | app.run(host='127.0.0.1', threaded=True, debug = True) 28 | -------------------------------------------------------------------------------- /YOLOv4/camera.py: -------------------------------------------------------------------------------- 1 | import torch,cv2,random,os,time 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | import pickle as pkl 6 | import argparse 7 | import threading, queue 8 | from torch.multiprocessing import Pool, Process, set_start_method 9 | from darknet import Darknet 10 | from imutils.video import WebcamVideoStream,FPS 11 | # from camera import write 12 | import win32com.client as wincl #### Python's Text-to-speech (tts) engine for windows, multiprocessing 13 | speak = wincl.Dispatch("SAPI.SpVoice") #### This initiates the tts engine 14 | 15 | torch.multiprocessing.set_start_method('spawn', force=True) 16 | 17 | ## Setting up torch for gpu utilization 18 | if torch.cuda.is_available(): 19 | torch.backends.cudnn.enabled = True 20 | torch.backends.cudnn.benchmark = True 21 | torch.backends.cudnn.deterministic = True 22 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 23 | 24 | def letterbox_image(img, inp_dim): 25 | '''resize image with 
unchanged aspect ratio using padding''' 26 | img_w, img_h = img.shape[1], img.shape[0] 27 | w, h = inp_dim 28 | new_w = int(img_w * min(w/img_w, h/img_h)) 29 | new_h = int(img_h * min(w/img_w, h/img_h)) 30 | resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC) 31 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 32 | canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image 33 | 34 | return canvas 35 | 36 | def load_classes(namesfile): 37 | fp = open(namesfile, "r") 38 | names = fp.read().split("\n")[:-1] 39 | return names 40 | 41 | def prep_image(img, inp_dim): 42 | """ 43 | Prepare image for inputting to the neural network. 44 | Returns a Variable 45 | """ 46 | orig_im = img 47 | dim = orig_im.shape[1], orig_im.shape[0] 48 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 49 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 50 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 51 | return img_, orig_im, dim 52 | 53 | 54 | 55 | class ObjectDetection: 56 | def __init__(self, id): 57 | # self.cap = cv2.VideoCapture(id) 58 | self.cap = WebcamVideoStream(src = id).start() 59 | self.cfgfile = "cfg/yolov4.cfg" 60 | self.weightsfile = "yolov4.weights" 61 | self.confidence = float(0.6) 62 | self.nms_thesh = float(0.8) 63 | self.num_classes = 80 64 | self.classes = load_classes('data/coco.names') 65 | self.colors = pkl.load(open("pallete", "rb")) 66 | self.model = Darknet(self.cfgfile) 67 | self.CUDA = torch.cuda.is_available() 68 | self.model.load_weights(self.weightsfile) 69 | self.width = 1280 #640#1280 70 | self.height = 720 #360#720 71 | print("Loading network.....") 72 | if self.CUDA: 73 | self.model.cuda() 74 | print("Network successfully loaded") 75 | 76 | self.model.eval() 77 | 78 | def main(self): 79 | q = queue.Queue() 80 | while True: 81 | def frame_render(queue_from_cam): 82 | frame = self.cap.read() # If you capture stream using opencv (cv2.VideoCapture()) the use the following line 83 | # ret, frame = self.cap.read() 84 | frame = cv2.resize(frame,(self.width, self.height)) 85 | queue_from_cam.put(frame) 86 | cam = threading.Thread(target=frame_render, args=(q,)) 87 | cam.start() 88 | cam.join() 89 | frame = q.get() 90 | q.task_done() 91 | fps = FPS().start() 92 | 93 | try: 94 | img, orig_im, dim = prep_image(frame, 160) 95 | 96 | im_dim = torch.FloatTensor(dim).repeat(1,2) 97 | if self.CUDA: #### If you have a gpu properly installed then it will run on the gpu 98 | im_dim = im_dim.cuda() 99 | img = img.cuda() 100 | # with torch.no_grad(): #### Set the model in the evaluation mode 101 | 102 | output = self.model(img) 103 | from tool.utils import post_processing,plot_boxes_cv2 104 | bounding_boxes = post_processing(img,self.confidence, self.nms_thesh, output) 105 | frame = plot_boxes_cv2(frame, bounding_boxes[0], savename= None, class_names=self.classes, color = None, colors=self.colors) 106 | 107 | except: 108 | pass 109 | 110 | fps.update() 111 | fps.stop() 112 | 113 | ret, jpeg = cv2.imencode('.jpg', frame) 114 | print("[INFO] elasped time: {:.2f}".format(fps.elapsed())) 115 | print("[INFO] approx. 
FPS: {:.1f}".format(fps.fps())) 116 | return jpeg.tostring() 117 | 118 | 119 | 120 | 121 | if __name__ == "__main__": 122 | id = 0 123 | ObjectDetection(id).main() 124 | -------------------------------------------------------------------------------- /YOLOv4/cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /YOLOv4/cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 
| filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /YOLOv4/cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | 
channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 
237 | filters=425 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /YOLOv4/cfg/yolov3-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | 
batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | 
activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | 
size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | 823 | -------------------------------------------------------------------------------- /YOLOv4/cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # 
subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=255 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=80 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=255 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=80 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /YOLOv4/cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width= 320 9 | height = 320 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 
| filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | 
[shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | 
size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 
62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /YOLOv4/cfg/yolov4.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | # Training 5 | #width=512 6 | #height=512 7 | width=608 8 | height=608 9 | channels=3 10 | momentum=0.949 11 | decay=0.0005 12 | angle=0 13 | saturation = 1.5 14 | exposure = 1.5 15 | hue=.1 16 | 17 | learning_rate=0.0013 18 | burn_in=1000 19 | max_batches = 500500 20 | policy=steps 21 | steps=400000,450000 22 | scales=.1,.1 23 | 24 | #cutmix=1 25 | mosaic=1 26 | 27 | #:104x104 54:52x52 85:26x26 104:13x13 for 416 28 | 29 | [convolutional] 30 | batch_normalize=1 31 | filters=32 32 | size=3 33 | stride=1 34 | pad=1 35 | activation=mish 36 | 37 | # Downsample 38 | 39 | [convolutional] 40 | batch_normalize=1 41 | filters=64 42 | size=3 43 | stride=2 44 | pad=1 45 | activation=mish 46 | 47 | [convolutional] 48 | batch_normalize=1 49 | filters=64 50 | size=1 51 | stride=1 52 | pad=1 53 | activation=mish 54 | 55 | [route] 56 | layers = -2 57 | 58 | [convolutional] 59 | batch_normalize=1 60 | filters=64 61 | size=1 62 | stride=1 63 | pad=1 64 | activation=mish 65 | 66 | [convolutional] 67 | batch_normalize=1 68 | filters=32 69 | size=1 70 | stride=1 71 | pad=1 72 | activation=mish 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=64 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=mish 81 | 82 | [shortcut] 83 | from=-3 84 | activation=linear 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=64 89 | size=1 90 | stride=1 91 | pad=1 92 | activation=mish 93 | 94 | [route] 95 | layers = -1,-7 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=64 100 | size=1 101 | stride=1 102 | pad=1 103 | activation=mish 104 | 105 | # Downsample 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=128 110 | size=3 111 | stride=2 112 | pad=1 113 | activation=mish 114 | 115 | [convolutional] 
116 | batch_normalize=1 117 | filters=64 118 | size=1 119 | stride=1 120 | pad=1 121 | activation=mish 122 | 123 | [route] 124 | layers = -2 125 | 126 | [convolutional] 127 | batch_normalize=1 128 | filters=64 129 | size=1 130 | stride=1 131 | pad=1 132 | activation=mish 133 | 134 | [convolutional] 135 | batch_normalize=1 136 | filters=64 137 | size=1 138 | stride=1 139 | pad=1 140 | activation=mish 141 | 142 | [convolutional] 143 | batch_normalize=1 144 | filters=64 145 | size=3 146 | stride=1 147 | pad=1 148 | activation=mish 149 | 150 | [shortcut] 151 | from=-3 152 | activation=linear 153 | 154 | [convolutional] 155 | batch_normalize=1 156 | filters=64 157 | size=1 158 | stride=1 159 | pad=1 160 | activation=mish 161 | 162 | [convolutional] 163 | batch_normalize=1 164 | filters=64 165 | size=3 166 | stride=1 167 | pad=1 168 | activation=mish 169 | 170 | [shortcut] 171 | from=-3 172 | activation=linear 173 | 174 | [convolutional] 175 | batch_normalize=1 176 | filters=64 177 | size=1 178 | stride=1 179 | pad=1 180 | activation=mish 181 | 182 | [route] 183 | layers = -1,-10 184 | 185 | [convolutional] 186 | batch_normalize=1 187 | filters=128 188 | size=1 189 | stride=1 190 | pad=1 191 | activation=mish 192 | 193 | # Downsample 194 | 195 | [convolutional] 196 | batch_normalize=1 197 | filters=256 198 | size=3 199 | stride=2 200 | pad=1 201 | activation=mish 202 | 203 | [convolutional] 204 | batch_normalize=1 205 | filters=128 206 | size=1 207 | stride=1 208 | pad=1 209 | activation=mish 210 | 211 | [route] 212 | layers = -2 213 | 214 | [convolutional] 215 | batch_normalize=1 216 | filters=128 217 | size=1 218 | stride=1 219 | pad=1 220 | activation=mish 221 | 222 | [convolutional] 223 | batch_normalize=1 224 | filters=128 225 | size=1 226 | stride=1 227 | pad=1 228 | activation=mish 229 | 230 | [convolutional] 231 | batch_normalize=1 232 | filters=128 233 | size=3 234 | stride=1 235 | pad=1 236 | activation=mish 237 | 238 | [shortcut] 239 | from=-3 240 | activation=linear 241 | 242 | [convolutional] 243 | batch_normalize=1 244 | filters=128 245 | size=1 246 | stride=1 247 | pad=1 248 | activation=mish 249 | 250 | [convolutional] 251 | batch_normalize=1 252 | filters=128 253 | size=3 254 | stride=1 255 | pad=1 256 | activation=mish 257 | 258 | [shortcut] 259 | from=-3 260 | activation=linear 261 | 262 | [convolutional] 263 | batch_normalize=1 264 | filters=128 265 | size=1 266 | stride=1 267 | pad=1 268 | activation=mish 269 | 270 | [convolutional] 271 | batch_normalize=1 272 | filters=128 273 | size=3 274 | stride=1 275 | pad=1 276 | activation=mish 277 | 278 | [shortcut] 279 | from=-3 280 | activation=linear 281 | 282 | [convolutional] 283 | batch_normalize=1 284 | filters=128 285 | size=1 286 | stride=1 287 | pad=1 288 | activation=mish 289 | 290 | [convolutional] 291 | batch_normalize=1 292 | filters=128 293 | size=3 294 | stride=1 295 | pad=1 296 | activation=mish 297 | 298 | [shortcut] 299 | from=-3 300 | activation=linear 301 | 302 | 303 | [convolutional] 304 | batch_normalize=1 305 | filters=128 306 | size=1 307 | stride=1 308 | pad=1 309 | activation=mish 310 | 311 | [convolutional] 312 | batch_normalize=1 313 | filters=128 314 | size=3 315 | stride=1 316 | pad=1 317 | activation=mish 318 | 319 | [shortcut] 320 | from=-3 321 | activation=linear 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=128 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=mish 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=128 334 | size=3 335 | stride=1 336 | pad=1 
337 | activation=mish 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | [convolutional] 344 | batch_normalize=1 345 | filters=128 346 | size=1 347 | stride=1 348 | pad=1 349 | activation=mish 350 | 351 | [convolutional] 352 | batch_normalize=1 353 | filters=128 354 | size=3 355 | stride=1 356 | pad=1 357 | activation=mish 358 | 359 | [shortcut] 360 | from=-3 361 | activation=linear 362 | 363 | [convolutional] 364 | batch_normalize=1 365 | filters=128 366 | size=1 367 | stride=1 368 | pad=1 369 | activation=mish 370 | 371 | [convolutional] 372 | batch_normalize=1 373 | filters=128 374 | size=3 375 | stride=1 376 | pad=1 377 | activation=mish 378 | 379 | [shortcut] 380 | from=-3 381 | activation=linear 382 | 383 | [convolutional] 384 | batch_normalize=1 385 | filters=128 386 | size=1 387 | stride=1 388 | pad=1 389 | activation=mish 390 | 391 | [route] 392 | layers = -1,-28 393 | 394 | [convolutional] 395 | batch_normalize=1 396 | filters=256 397 | size=1 398 | stride=1 399 | pad=1 400 | activation=mish 401 | 402 | # Downsample 403 | 404 | [convolutional] 405 | batch_normalize=1 406 | filters=512 407 | size=3 408 | stride=2 409 | pad=1 410 | activation=mish 411 | 412 | [convolutional] 413 | batch_normalize=1 414 | filters=256 415 | size=1 416 | stride=1 417 | pad=1 418 | activation=mish 419 | 420 | [route] 421 | layers = -2 422 | 423 | [convolutional] 424 | batch_normalize=1 425 | filters=256 426 | size=1 427 | stride=1 428 | pad=1 429 | activation=mish 430 | 431 | [convolutional] 432 | batch_normalize=1 433 | filters=256 434 | size=1 435 | stride=1 436 | pad=1 437 | activation=mish 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=3 443 | stride=1 444 | pad=1 445 | activation=mish 446 | 447 | [shortcut] 448 | from=-3 449 | activation=linear 450 | 451 | 452 | [convolutional] 453 | batch_normalize=1 454 | filters=256 455 | size=1 456 | stride=1 457 | pad=1 458 | activation=mish 459 | 460 | [convolutional] 461 | batch_normalize=1 462 | filters=256 463 | size=3 464 | stride=1 465 | pad=1 466 | activation=mish 467 | 468 | [shortcut] 469 | from=-3 470 | activation=linear 471 | 472 | 473 | [convolutional] 474 | batch_normalize=1 475 | filters=256 476 | size=1 477 | stride=1 478 | pad=1 479 | activation=mish 480 | 481 | [convolutional] 482 | batch_normalize=1 483 | filters=256 484 | size=3 485 | stride=1 486 | pad=1 487 | activation=mish 488 | 489 | [shortcut] 490 | from=-3 491 | activation=linear 492 | 493 | 494 | [convolutional] 495 | batch_normalize=1 496 | filters=256 497 | size=1 498 | stride=1 499 | pad=1 500 | activation=mish 501 | 502 | [convolutional] 503 | batch_normalize=1 504 | filters=256 505 | size=3 506 | stride=1 507 | pad=1 508 | activation=mish 509 | 510 | [shortcut] 511 | from=-3 512 | activation=linear 513 | 514 | 515 | [convolutional] 516 | batch_normalize=1 517 | filters=256 518 | size=1 519 | stride=1 520 | pad=1 521 | activation=mish 522 | 523 | [convolutional] 524 | batch_normalize=1 525 | filters=256 526 | size=3 527 | stride=1 528 | pad=1 529 | activation=mish 530 | 531 | [shortcut] 532 | from=-3 533 | activation=linear 534 | 535 | 536 | [convolutional] 537 | batch_normalize=1 538 | filters=256 539 | size=1 540 | stride=1 541 | pad=1 542 | activation=mish 543 | 544 | [convolutional] 545 | batch_normalize=1 546 | filters=256 547 | size=3 548 | stride=1 549 | pad=1 550 | activation=mish 551 | 552 | [shortcut] 553 | from=-3 554 | activation=linear 555 | 556 | 557 | [convolutional] 558 | batch_normalize=1 559 | filters=256 
560 | size=1 561 | stride=1 562 | pad=1 563 | activation=mish 564 | 565 | [convolutional] 566 | batch_normalize=1 567 | filters=256 568 | size=3 569 | stride=1 570 | pad=1 571 | activation=mish 572 | 573 | [shortcut] 574 | from=-3 575 | activation=linear 576 | 577 | [convolutional] 578 | batch_normalize=1 579 | filters=256 580 | size=1 581 | stride=1 582 | pad=1 583 | activation=mish 584 | 585 | [convolutional] 586 | batch_normalize=1 587 | filters=256 588 | size=3 589 | stride=1 590 | pad=1 591 | activation=mish 592 | 593 | [shortcut] 594 | from=-3 595 | activation=linear 596 | 597 | [convolutional] 598 | batch_normalize=1 599 | filters=256 600 | size=1 601 | stride=1 602 | pad=1 603 | activation=mish 604 | 605 | [route] 606 | layers = -1,-28 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | filters=512 611 | size=1 612 | stride=1 613 | pad=1 614 | activation=mish 615 | 616 | # Downsample 617 | 618 | [convolutional] 619 | batch_normalize=1 620 | filters=1024 621 | size=3 622 | stride=2 623 | pad=1 624 | activation=mish 625 | 626 | [convolutional] 627 | batch_normalize=1 628 | filters=512 629 | size=1 630 | stride=1 631 | pad=1 632 | activation=mish 633 | 634 | [route] 635 | layers = -2 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=512 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=mish 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | filters=512 648 | size=1 649 | stride=1 650 | pad=1 651 | activation=mish 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=512 656 | size=3 657 | stride=1 658 | pad=1 659 | activation=mish 660 | 661 | [shortcut] 662 | from=-3 663 | activation=linear 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | filters=512 668 | size=1 669 | stride=1 670 | pad=1 671 | activation=mish 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=512 676 | size=3 677 | stride=1 678 | pad=1 679 | activation=mish 680 | 681 | [shortcut] 682 | from=-3 683 | activation=linear 684 | 685 | [convolutional] 686 | batch_normalize=1 687 | filters=512 688 | size=1 689 | stride=1 690 | pad=1 691 | activation=mish 692 | 693 | [convolutional] 694 | batch_normalize=1 695 | filters=512 696 | size=3 697 | stride=1 698 | pad=1 699 | activation=mish 700 | 701 | [shortcut] 702 | from=-3 703 | activation=linear 704 | 705 | [convolutional] 706 | batch_normalize=1 707 | filters=512 708 | size=1 709 | stride=1 710 | pad=1 711 | activation=mish 712 | 713 | [convolutional] 714 | batch_normalize=1 715 | filters=512 716 | size=3 717 | stride=1 718 | pad=1 719 | activation=mish 720 | 721 | [shortcut] 722 | from=-3 723 | activation=linear 724 | 725 | [convolutional] 726 | batch_normalize=1 727 | filters=512 728 | size=1 729 | stride=1 730 | pad=1 731 | activation=mish 732 | 733 | [route] 734 | layers = -1,-16 735 | 736 | [convolutional] 737 | batch_normalize=1 738 | filters=1024 739 | size=1 740 | stride=1 741 | pad=1 742 | activation=mish 743 | 744 | ########################## 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | filters=512 749 | size=1 750 | stride=1 751 | pad=1 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | size=3 757 | stride=1 758 | pad=1 759 | filters=1024 760 | activation=leaky 761 | 762 | [convolutional] 763 | batch_normalize=1 764 | filters=512 765 | size=1 766 | stride=1 767 | pad=1 768 | activation=leaky 769 | 770 | ### SPP ### 771 | [maxpool] 772 | stride=1 773 | size=5 774 | 775 | [route] 776 | layers=-2 777 | 778 | [maxpool] 779 | stride=1 780 | size=9 781 | 
782 | [route] 783 | layers=-4 784 | 785 | [maxpool] 786 | stride=1 787 | size=13 788 | 789 | [route] 790 | layers=-1,-3,-5,-6 791 | ### End SPP ### 792 | 793 | [convolutional] 794 | batch_normalize=1 795 | filters=512 796 | size=1 797 | stride=1 798 | pad=1 799 | activation=leaky 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | size=3 804 | stride=1 805 | pad=1 806 | filters=1024 807 | activation=leaky 808 | 809 | [convolutional] 810 | batch_normalize=1 811 | filters=512 812 | size=1 813 | stride=1 814 | pad=1 815 | activation=leaky 816 | 817 | [convolutional] 818 | batch_normalize=1 819 | filters=256 820 | size=1 821 | stride=1 822 | pad=1 823 | activation=leaky 824 | 825 | [upsample] 826 | stride=2 827 | 828 | [route] 829 | layers = 85 830 | 831 | [convolutional] 832 | batch_normalize=1 833 | filters=256 834 | size=1 835 | stride=1 836 | pad=1 837 | activation=leaky 838 | 839 | [route] 840 | layers = -1, -3 841 | 842 | [convolutional] 843 | batch_normalize=1 844 | filters=256 845 | size=1 846 | stride=1 847 | pad=1 848 | activation=leaky 849 | 850 | [convolutional] 851 | batch_normalize=1 852 | size=3 853 | stride=1 854 | pad=1 855 | filters=512 856 | activation=leaky 857 | 858 | [convolutional] 859 | batch_normalize=1 860 | filters=256 861 | size=1 862 | stride=1 863 | pad=1 864 | activation=leaky 865 | 866 | [convolutional] 867 | batch_normalize=1 868 | size=3 869 | stride=1 870 | pad=1 871 | filters=512 872 | activation=leaky 873 | 874 | [convolutional] 875 | batch_normalize=1 876 | filters=256 877 | size=1 878 | stride=1 879 | pad=1 880 | activation=leaky 881 | 882 | [convolutional] 883 | batch_normalize=1 884 | filters=128 885 | size=1 886 | stride=1 887 | pad=1 888 | activation=leaky 889 | 890 | [upsample] 891 | stride=2 892 | 893 | [route] 894 | layers = 54 895 | 896 | [convolutional] 897 | batch_normalize=1 898 | filters=128 899 | size=1 900 | stride=1 901 | pad=1 902 | activation=leaky 903 | 904 | [route] 905 | layers = -1, -3 906 | 907 | [convolutional] 908 | batch_normalize=1 909 | filters=128 910 | size=1 911 | stride=1 912 | pad=1 913 | activation=leaky 914 | 915 | [convolutional] 916 | batch_normalize=1 917 | size=3 918 | stride=1 919 | pad=1 920 | filters=256 921 | activation=leaky 922 | 923 | [convolutional] 924 | batch_normalize=1 925 | filters=128 926 | size=1 927 | stride=1 928 | pad=1 929 | activation=leaky 930 | 931 | [convolutional] 932 | batch_normalize=1 933 | size=3 934 | stride=1 935 | pad=1 936 | filters=256 937 | activation=leaky 938 | 939 | [convolutional] 940 | batch_normalize=1 941 | filters=128 942 | size=1 943 | stride=1 944 | pad=1 945 | activation=leaky 946 | 947 | ########################## 948 | 949 | [convolutional] 950 | batch_normalize=1 951 | size=3 952 | stride=1 953 | pad=1 954 | filters=256 955 | activation=leaky 956 | 957 | [convolutional] 958 | size=1 959 | stride=1 960 | pad=1 961 | filters=255 962 | activation=linear 963 | 964 | 965 | [yolo] 966 | mask = 0,1,2 967 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 968 | classes=80 969 | num=9 970 | jitter=.3 971 | ignore_thresh = .7 972 | truth_thresh = 1 973 | scale_x_y = 1.2 974 | iou_thresh=0.213 975 | cls_normalizer=1.0 976 | iou_normalizer=0.07 977 | iou_loss=ciou 978 | nms_kind=greedynms 979 | beta_nms=0.6 980 | max_delta=5 981 | 982 | 983 | [route] 984 | layers = -4 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | size=3 989 | stride=2 990 | pad=1 991 | filters=256 992 | activation=leaky 993 | 994 | [route] 995 | layers = -1, 
-16 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | filters=256 1000 | size=1 1001 | stride=1 1002 | pad=1 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | size=3 1008 | stride=1 1009 | pad=1 1010 | filters=512 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | filters=256 1016 | size=1 1017 | stride=1 1018 | pad=1 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | size=3 1024 | stride=1 1025 | pad=1 1026 | filters=512 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | filters=256 1032 | size=1 1033 | stride=1 1034 | pad=1 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | batch_normalize=1 1039 | size=3 1040 | stride=1 1041 | pad=1 1042 | filters=512 1043 | activation=leaky 1044 | 1045 | [convolutional] 1046 | size=1 1047 | stride=1 1048 | pad=1 1049 | filters=255 1050 | activation=linear 1051 | 1052 | 1053 | [yolo] 1054 | mask = 3,4,5 1055 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1056 | classes=80 1057 | num=9 1058 | jitter=.3 1059 | ignore_thresh = .7 1060 | truth_thresh = 1 1061 | scale_x_y = 1.1 1062 | iou_thresh=0.213 1063 | cls_normalizer=1.0 1064 | iou_normalizer=0.07 1065 | iou_loss=ciou 1066 | nms_kind=greedynms 1067 | beta_nms=0.6 1068 | max_delta=5 1069 | 1070 | 1071 | [route] 1072 | layers = -4 1073 | 1074 | [convolutional] 1075 | batch_normalize=1 1076 | size=3 1077 | stride=2 1078 | pad=1 1079 | filters=512 1080 | activation=leaky 1081 | 1082 | [route] 1083 | layers = -1, -37 1084 | 1085 | [convolutional] 1086 | batch_normalize=1 1087 | filters=512 1088 | size=1 1089 | stride=1 1090 | pad=1 1091 | activation=leaky 1092 | 1093 | [convolutional] 1094 | batch_normalize=1 1095 | size=3 1096 | stride=1 1097 | pad=1 1098 | filters=1024 1099 | activation=leaky 1100 | 1101 | [convolutional] 1102 | batch_normalize=1 1103 | filters=512 1104 | size=1 1105 | stride=1 1106 | pad=1 1107 | activation=leaky 1108 | 1109 | [convolutional] 1110 | batch_normalize=1 1111 | size=3 1112 | stride=1 1113 | pad=1 1114 | filters=1024 1115 | activation=leaky 1116 | 1117 | [convolutional] 1118 | batch_normalize=1 1119 | filters=512 1120 | size=1 1121 | stride=1 1122 | pad=1 1123 | activation=leaky 1124 | 1125 | [convolutional] 1126 | batch_normalize=1 1127 | size=3 1128 | stride=1 1129 | pad=1 1130 | filters=1024 1131 | activation=leaky 1132 | 1133 | [convolutional] 1134 | size=1 1135 | stride=1 1136 | pad=1 1137 | filters=255 1138 | activation=linear 1139 | 1140 | 1141 | [yolo] 1142 | mask = 6,7,8 1143 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1144 | classes=80 1145 | num=9 1146 | jitter=.3 1147 | ignore_thresh = .7 1148 | truth_thresh = 1 1149 | random=1 1150 | scale_x_y = 1.05 1151 | iou_thresh=0.213 1152 | cls_normalizer=1.0 1153 | iou_normalizer=0.07 1154 | iou_loss=ciou 1155 | nms_kind=greedynms 1156 | beta_nms=0.6 1157 | max_delta=5 -------------------------------------------------------------------------------- /YOLOv4/data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | 
tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /YOLOv4/data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /YOLOv4/object_detection.py: -------------------------------------------------------------------------------- 1 | import torch,cv2,random,os,time 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | import pickle as pkl 6 | import argparse 7 | import threading, queue 8 | from torch.multiprocessing import Pool, Process, set_start_method 9 | from darknet import Darknet 10 | from imutils.video import WebcamVideoStream,FPS 11 | # from camera import write 12 | import win32com.client as wincl #### Python's Text-to-speech (tts) engine for windows, multiprocessing 13 | speak = wincl.Dispatch("SAPI.SpVoice") #### This initiates the tts engine 14 | 15 | torch.multiprocessing.set_start_method('spawn', force=True) 16 | 17 | ## Setting up torch for gpu utilization 18 | if torch.cuda.is_available(): 19 | torch.backends.cudnn.enabled = True 20 | torch.backends.cudnn.benchmark = True 21 | torch.backends.cudnn.deterministic = True 22 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 23 | 24 | def letterbox_image(img, inp_dim): 25 | '''resize image with unchanged aspect ratio using padding''' 26 | img_w, img_h = img.shape[1], img.shape[0] 27 | w, h = inp_dim 28 | new_w = int(img_w * min(w/img_w, h/img_h)) 29 | new_h = int(img_h * min(w/img_w, h/img_h)) 30 | resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC) 31 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 32 | canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image 33 | 34 | return canvas 35 | 36 | def load_classes(namesfile): 37 | fp = open(namesfile, "r") 38 | names = fp.read().split("\n")[:-1] 39 | return names 40 | 41 | def prep_image(img, inp_dim): 42 | """ 43 | Prepare image for inputting to the neural network. 
44 | Returns a Variable 45 | """ 46 | orig_im = img 47 | dim = orig_im.shape[1], orig_im.shape[0] 48 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 49 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 50 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 51 | return img_, orig_im, dim 52 | 53 | 54 | 55 | class ObjectDetection: 56 | def __init__(self, id): 57 | # self.cap = cv2.VideoCapture(id) 58 | self.cap = WebcamVideoStream(src = id).start() 59 | self.cfgfile = "cfg/yolov4.cfg" 60 | self.weightsfile = "yolov4.weights" 61 | self.confidence = float(0.6) 62 | self.nms_thesh = float(0.8) 63 | self.num_classes = 80 64 | self.classes = load_classes('data/coco.names') 65 | self.colors = pkl.load(open("pallete", "rb")) 66 | self.model = Darknet(self.cfgfile) 67 | self.CUDA = torch.cuda.is_available() 68 | self.model.load_weights(self.weightsfile) 69 | self.width = 1280 #640#1280 70 | self.height = 720 #360#720 71 | print("Loading network.....") 72 | if self.CUDA: 73 | self.model.cuda() 74 | print("Network successfully loaded") 75 | 76 | self.model.eval() 77 | 78 | def main(self): 79 | q = queue.Queue() 80 | while True: 81 | def frame_render(queue_from_cam): 82 | frame = self.cap.read() # If you capture the stream with OpenCV (cv2.VideoCapture()), use the following line instead 83 | # ret, frame = self.cap.read() 84 | frame = cv2.resize(frame,(self.width, self.height)) 85 | queue_from_cam.put(frame) 86 | cam = threading.Thread(target=frame_render, args=(q,)) 87 | cam.start() 88 | cam.join() 89 | frame = q.get() 90 | q.task_done() 91 | fps = FPS().start() 92 | 93 | try: 94 | img, orig_im, dim = prep_image(frame, 160) #### letterbox the frame to a 160x160 network input (the cfg default is 608x608) 95 | 96 | im_dim = torch.FloatTensor(dim).repeat(1,2) 97 | if self.CUDA: #### If a GPU is properly installed, inference runs on the GPU 98 | im_dim = im_dim.cuda() 99 | img = img.cuda() 100 | # with torch.no_grad(): #### Optionally wrap inference in no_grad() to avoid tracking gradients 101 | 102 | output = self.model(img) 103 | from tool.utils import post_processing,plot_boxes_cv2 104 | bounding_boxes = post_processing(img,self.confidence, self.nms_thesh, output) 105 | frame = plot_boxes_cv2(frame, bounding_boxes[0], savename= None, class_names=self.classes, color = None, colors=self.colors) 106 | 107 | except: #### if detection or drawing fails, skip this frame and show the raw resized frame 108 | pass 109 | 110 | fps.update() 111 | fps.stop() 112 | print("[INFO] elapsed time: {:.2f}".format(fps.elapsed())) 113 | print("[INFO] approx.
FPS: {:.1f}".format(fps.fps())) 114 | 115 | cv2.imshow("Object Detection Window", frame) 116 | if cv2.waitKey(1) & 0xFF == ord('q'): 117 | break 118 | continue 119 | torch.cuda.empty_cache() 120 | 121 | 122 | 123 | if __name__ == "__main__": 124 | id = 0 125 | ObjectDetection(id).main() 126 | -------------------------------------------------------------------------------- /YOLOv4/pallete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paul-pias/Object-Detection-and-Distance-Measurement/d03baa0d99626190c87fccdd75fbc67ce8d176f8/YOLOv4/pallete -------------------------------------------------------------------------------- /YOLOv4/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.1.0.25 2 | numpy==1.17.0 3 | pandas==0.25.1 4 | torch_nightly==1.2.0.dev20190807+cpu 5 | matplotlib==3.1.1 6 | Pillow>=6.2.2 7 | torch==1.2.0 8 | imutils 9 | -------------------------------------------------------------------------------- /YOLOv4/templates/12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paul-pias/Object-Detection-and-Distance-Measurement/d03baa0d99626190c87fccdd75fbc67ce8d176f8/YOLOv4/templates/12.jpg -------------------------------------------------------------------------------- /YOLOv4/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 |
19 | 34 |
35 | {% block content %} 36 | 37 | {% endblock %} 38 |
39 |
40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /YOLOv4/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} {% block content %} 2 | 3 | 4 | 5 |
6 |
7 |
8 |

Camera - 01

9 |
10 | 11 |
12 | 13 |
14 |
15 |
16 | {% endblock %} -------------------------------------------------------------------------------- /YOLOv4/tool/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tool.torch_utils import convert2cpu 3 | 4 | 5 | def parse_cfg(cfgfile): 6 | blocks = [] 7 | fp = open(cfgfile, 'r') 8 | block = None 9 | line = fp.readline() 10 | while line != '': 11 | line = line.rstrip() 12 | if line == '' or line[0] == '#': 13 | line = fp.readline() 14 | continue 15 | elif line[0] == '[': 16 | if block: 17 | blocks.append(block) 18 | block = dict() 19 | block['type'] = line.lstrip('[').rstrip(']') 20 | # set default value 21 | if block['type'] == 'convolutional': 22 | block['batch_normalize'] = 0 23 | else: 24 | key, value = line.split('=') 25 | key = key.strip() 26 | if key == 'type': 27 | key = '_type' 28 | value = value.strip() 29 | block[key] = value 30 | line = fp.readline() 31 | 32 | if block: 33 | blocks.append(block) 34 | fp.close() 35 | return blocks 36 | 37 | 38 | def print_cfg(blocks): 39 | print('layer filters size input output'); 40 | prev_width = 416 41 | prev_height = 416 42 | prev_filters = 3 43 | out_filters = [] 44 | out_widths = [] 45 | out_heights = [] 46 | ind = -2 47 | for block in blocks: 48 | ind = ind + 1 49 | if block['type'] == 'net': 50 | prev_width = int(block['width']) 51 | prev_height = int(block['height']) 52 | continue 53 | elif block['type'] == 'convolutional': 54 | filters = int(block['filters']) 55 | kernel_size = int(block['size']) 56 | stride = int(block['stride']) 57 | is_pad = int(block['pad']) 58 | pad = (kernel_size - 1) // 2 if is_pad else 0 59 | width = (prev_width + 2 * pad - kernel_size) // stride + 1 60 | height = (prev_height + 2 * pad - kernel_size) // stride + 1 61 | print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 62 | ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width, 63 | height, filters)) 64 | prev_width = width 65 | prev_height = height 66 | prev_filters = filters 67 | out_widths.append(prev_width) 68 | out_heights.append(prev_height) 69 | out_filters.append(prev_filters) 70 | elif block['type'] == 'maxpool': 71 | pool_size = int(block['size']) 72 | stride = int(block['stride']) 73 | width = prev_width // stride 74 | height = prev_height // stride 75 | print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 76 | ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, 77 | filters)) 78 | prev_width = width 79 | prev_height = height 80 | prev_filters = filters 81 | out_widths.append(prev_width) 82 | out_heights.append(prev_height) 83 | out_filters.append(prev_filters) 84 | elif block['type'] == 'avgpool': 85 | width = 1 86 | height = 1 87 | print('%5d %-6s %3d x %3d x%4d -> %3d' % ( 88 | ind, 'avg', prev_width, prev_height, prev_filters, prev_filters)) 89 | prev_width = width 90 | prev_height = height 91 | prev_filters = filters 92 | out_widths.append(prev_width) 93 | out_heights.append(prev_height) 94 | out_filters.append(prev_filters) 95 | elif block['type'] == 'softmax': 96 | print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters)) 97 | out_widths.append(prev_width) 98 | out_heights.append(prev_height) 99 | out_filters.append(prev_filters) 100 | elif block['type'] == 'cost': 101 | print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters)) 102 | out_widths.append(prev_width) 103 | out_heights.append(prev_height) 104 | out_filters.append(prev_filters) 105 | elif 
block['type'] == 'reorg': 106 | stride = int(block['stride']) 107 | filters = stride * stride * prev_filters 108 | width = prev_width // stride 109 | height = prev_height // stride 110 | print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 111 | ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters)) 112 | prev_width = width 113 | prev_height = height 114 | prev_filters = filters 115 | out_widths.append(prev_width) 116 | out_heights.append(prev_height) 117 | out_filters.append(prev_filters) 118 | elif block['type'] == 'upsample': 119 | stride = int(block['stride']) 120 | filters = prev_filters 121 | width = prev_width * stride 122 | height = prev_height * stride 123 | print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 124 | ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters)) 125 | prev_width = width 126 | prev_height = height 127 | prev_filters = filters 128 | out_widths.append(prev_width) 129 | out_heights.append(prev_height) 130 | out_filters.append(prev_filters) 131 | elif block['type'] == 'route': 132 | layers = block['layers'].split(',') 133 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] 134 | if len(layers) == 1: 135 | print('%5d %-6s %d' % (ind, 'route', layers[0])) 136 | prev_width = out_widths[layers[0]] 137 | prev_height = out_heights[layers[0]] 138 | prev_filters = out_filters[layers[0]] 139 | elif len(layers) == 2: 140 | print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1])) 141 | prev_width = out_widths[layers[0]] 142 | prev_height = out_heights[layers[0]] 143 | assert (prev_width == out_widths[layers[1]]) 144 | assert (prev_height == out_heights[layers[1]]) 145 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] 146 | elif len(layers) == 4: 147 | print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3])) 148 | prev_width = out_widths[layers[0]] 149 | prev_height = out_heights[layers[0]] 150 | assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]]) 151 | assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]]) 152 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[ 153 | layers[3]] 154 | else: 155 | print("route error !!! 
{} {} {}".format(sys._getframe().f_code.co_filename, 156 | sys._getframe().f_code.co_name, sys._getframe().f_lineno)) 157 | 158 | out_widths.append(prev_width) 159 | out_heights.append(prev_height) 160 | out_filters.append(prev_filters) 161 | elif block['type'] in ['region', 'yolo']: 162 | print('%5d %-6s' % (ind, 'detection')) 163 | out_widths.append(prev_width) 164 | out_heights.append(prev_height) 165 | out_filters.append(prev_filters) 166 | elif block['type'] == 'shortcut': 167 | from_id = int(block['from']) 168 | from_id = from_id if from_id > 0 else from_id + ind 169 | print('%5d %-6s %d' % (ind, 'shortcut', from_id)) 170 | prev_width = out_widths[from_id] 171 | prev_height = out_heights[from_id] 172 | prev_filters = out_filters[from_id] 173 | out_widths.append(prev_width) 174 | out_heights.append(prev_height) 175 | out_filters.append(prev_filters) 176 | elif block['type'] == 'connected': 177 | filters = int(block['output']) 178 | print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters)) 179 | prev_filters = filters 180 | out_widths.append(1) 181 | out_heights.append(1) 182 | out_filters.append(prev_filters) 183 | else: 184 | print('unknown type %s' % (block['type'])) 185 | 186 | 187 | def load_conv(buf, start, conv_model): 188 | num_w = conv_model.weight.numel() 189 | num_b = conv_model.bias.numel() 190 | conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 191 | start = start + num_b 192 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); 193 | start = start + num_w 194 | return start 195 | 196 | 197 | def save_conv(fp, conv_model): 198 | if conv_model.bias.is_cuda: 199 | convert2cpu(conv_model.bias.data).numpy().tofile(fp) 200 | convert2cpu(conv_model.weight.data).numpy().tofile(fp) 201 | else: 202 | conv_model.bias.data.numpy().tofile(fp) 203 | conv_model.weight.data.numpy().tofile(fp) 204 | 205 | 206 | def load_conv_bn(buf, start, conv_model, bn_model): 207 | num_w = conv_model.weight.numel() 208 | num_b = bn_model.bias.numel() 209 | bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 210 | start = start + num_b 211 | bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b])); 212 | start = start + num_b 213 | bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b])); 214 | start = start + num_b 215 | bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b])); 216 | start = start + num_b 217 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); 218 | start = start + num_w 219 | return start 220 | 221 | 222 | def save_conv_bn(fp, conv_model, bn_model): 223 | if bn_model.bias.is_cuda: 224 | convert2cpu(bn_model.bias.data).numpy().tofile(fp) 225 | convert2cpu(bn_model.weight.data).numpy().tofile(fp) 226 | convert2cpu(bn_model.running_mean).numpy().tofile(fp) 227 | convert2cpu(bn_model.running_var).numpy().tofile(fp) 228 | convert2cpu(conv_model.weight.data).numpy().tofile(fp) 229 | else: 230 | bn_model.bias.data.numpy().tofile(fp) 231 | bn_model.weight.data.numpy().tofile(fp) 232 | bn_model.running_mean.numpy().tofile(fp) 233 | bn_model.running_var.numpy().tofile(fp) 234 | conv_model.weight.data.numpy().tofile(fp) 235 | 236 | 237 | def load_fc(buf, start, fc_model): 238 | num_w = fc_model.weight.numel() 239 | num_b = fc_model.bias.numel() 240 | fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 241 | start = start + num_b 242 | 
fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w])); 243 | start = start + num_w 244 | return start 245 | 246 | 247 | def save_fc(fp, fc_model): 248 | fc_model.bias.data.numpy().tofile(fp) 249 | fc_model.weight.data.numpy().tofile(fp) 250 | 251 | 252 | if __name__ == '__main__': 253 | import sys 254 | 255 | blocks = parse_cfg('cfg/yolo.cfg') 256 | if len(sys.argv) == 2: 257 | blocks = parse_cfg(sys.argv[1]) 258 | print_cfg(blocks) -------------------------------------------------------------------------------- /YOLOv4/tool/region_loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from tool.torch_utils import * 4 | 5 | 6 | def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, 7 | sil_thresh, seen): 8 | nB = target.size(0) 9 | nA = num_anchors 10 | nC = num_classes 11 | anchor_step = len(anchors) / num_anchors 12 | conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale 13 | coord_mask = torch.zeros(nB, nA, nH, nW) 14 | cls_mask = torch.zeros(nB, nA, nH, nW) 15 | tx = torch.zeros(nB, nA, nH, nW) 16 | ty = torch.zeros(nB, nA, nH, nW) 17 | tw = torch.zeros(nB, nA, nH, nW) 18 | th = torch.zeros(nB, nA, nH, nW) 19 | tconf = torch.zeros(nB, nA, nH, nW) 20 | tcls = torch.zeros(nB, nA, nH, nW) 21 | 22 | nAnchors = nA * nH * nW 23 | nPixels = nH * nW 24 | for b in range(nB): 25 | cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() 26 | cur_ious = torch.zeros(nAnchors) 27 | for t in range(50): 28 | if target[b][t * 5 + 1] == 0: 29 | break 30 | gx = target[b][t * 5 + 1] * nW 31 | gy = target[b][t * 5 + 2] * nH 32 | gw = target[b][t * 5 + 3] * nW 33 | gh = target[b][t * 5 + 4] * nH 34 | cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() 35 | cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) 36 | conf_mask[b][cur_ious > sil_thresh] = 0 37 | if seen < 12800: 38 | if anchor_step == 4: 39 | tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, 40 | 1).repeat( 41 | nB, 1, nH, nW) 42 | ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( 43 | 1, nA, 1, 1).repeat(nB, 1, nH, nW) 44 | else: 45 | tx.fill_(0.5) 46 | ty.fill_(0.5) 47 | tw.zero_() 48 | th.zero_() 49 | coord_mask.fill_(1) 50 | 51 | nGT = 0 52 | nCorrect = 0 53 | for b in range(nB): 54 | for t in range(50): 55 | if target[b][t * 5 + 1] == 0: 56 | break 57 | nGT = nGT + 1 58 | best_iou = 0.0 59 | best_n = -1 60 | min_dist = 10000 61 | gx = target[b][t * 5 + 1] * nW 62 | gy = target[b][t * 5 + 2] * nH 63 | gi = int(gx) 64 | gj = int(gy) 65 | gw = target[b][t * 5 + 3] * nW 66 | gh = target[b][t * 5 + 4] * nH 67 | gt_box = [0, 0, gw, gh] 68 | for n in range(nA): 69 | aw = anchors[anchor_step * n] 70 | ah = anchors[anchor_step * n + 1] 71 | anchor_box = [0, 0, aw, ah] 72 | iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) 73 | if anchor_step == 4: 74 | ax = anchors[anchor_step * n + 2] 75 | ay = anchors[anchor_step * n + 3] 76 | dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) 77 | if iou > best_iou: 78 | best_iou = iou 79 | best_n = n 80 | elif anchor_step == 4 and iou == best_iou and dist < min_dist: 81 | best_iou = iou 82 | best_n = n 83 | min_dist = dist 84 | 85 | gt_box = [gx, gy, gw, gh] 86 | pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] 87 | 88 | 
coord_mask[b][best_n][gj][gi] = 1 89 | cls_mask[b][best_n][gj][gi] = 1 90 | conf_mask[b][best_n][gj][gi] = object_scale 91 | tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi 92 | ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj 93 | tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) 94 | th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) 95 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou 96 | tconf[b][best_n][gj][gi] = iou 97 | tcls[b][best_n][gj][gi] = target[b][t * 5] 98 | if iou > 0.5: 99 | nCorrect = nCorrect + 1 100 | 101 | return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls 102 | 103 | 104 | class RegionLoss(nn.Module): 105 | def __init__(self, num_classes=0, anchors=[], num_anchors=1): 106 | super(RegionLoss, self).__init__() 107 | self.num_classes = num_classes 108 | self.anchors = anchors 109 | self.num_anchors = num_anchors 110 | self.anchor_step = len(anchors) / num_anchors 111 | self.coord_scale = 1 112 | self.noobject_scale = 1 113 | self.object_scale = 5 114 | self.class_scale = 1 115 | self.thresh = 0.6 116 | self.seen = 0 117 | 118 | def forward(self, output, target): 119 | # output : BxAs*(4+1+num_classes)*H*W 120 | t0 = time.time() 121 | nB = output.data.size(0) 122 | nA = self.num_anchors 123 | nC = self.num_classes 124 | nH = output.data.size(2) 125 | nW = output.data.size(3) 126 | 127 | output = output.view(nB, nA, (5 + nC), nH, nW) 128 | x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) 129 | y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) 130 | w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) 131 | h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) 132 | conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) 133 | cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) 134 | cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) 135 | t1 = time.time() 136 | 137 | pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) 138 | grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 139 | grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 140 | anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda() 141 | anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda() 142 | anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 143 | anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 144 | pred_boxes[0] = x.data + grid_x 145 | pred_boxes[1] = y.data + grid_y 146 | pred_boxes[2] = torch.exp(w.data) * anchor_w 147 | pred_boxes[3] = torch.exp(h.data) * anchor_h 148 | pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) 149 | t2 = time.time() 150 | 151 | nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, 152 | target.data, 153 | self.anchors, nA, 154 | nC, \ 155 | nH, nW, 156 | self.noobject_scale, 157 | self.object_scale, 158 | self.thresh, 159 | self.seen) 160 | cls_mask = (cls_mask == 1) 161 | nProposals = int((conf > 0.25).sum().data[0]) 162 | 163 | tx = 
Variable(tx.cuda()) 164 | ty = Variable(ty.cuda()) 165 | tw = Variable(tw.cuda()) 166 | th = Variable(th.cuda()) 167 | tconf = Variable(tconf.cuda()) 168 | tcls = Variable(tcls.view(-1)[cls_mask].long().cuda()) 169 | 170 | coord_mask = Variable(coord_mask.cuda()) 171 | conf_mask = Variable(conf_mask.cuda().sqrt()) 172 | cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) 173 | cls = cls[cls_mask].view(-1, nC) 174 | 175 | t3 = time.time() 176 | 177 | loss_x = self.coord_scale * nn.MSELoss(size_average=False)(x * coord_mask, tx * coord_mask) / 2.0 178 | loss_y = self.coord_scale * nn.MSELoss(size_average=False)(y * coord_mask, ty * coord_mask) / 2.0 179 | loss_w = self.coord_scale * nn.MSELoss(size_average=False)(w * coord_mask, tw * coord_mask) / 2.0 180 | loss_h = self.coord_scale * nn.MSELoss(size_average=False)(h * coord_mask, th * coord_mask) / 2.0 181 | loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) / 2.0 182 | loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls) 183 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls 184 | t4 = time.time() 185 | if False: 186 | print('-----------------------------------') 187 | print(' activation : %f' % (t1 - t0)) 188 | print(' create pred_boxes : %f' % (t2 - t1)) 189 | print(' build targets : %f' % (t3 - t2)) 190 | print(' create loss : %f' % (t4 - t3)) 191 | print(' total : %f' % (t4 - t0)) 192 | print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( 193 | self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0], 194 | loss_conf.data[0], loss_cls.data[0], loss.data[0])) 195 | return loss -------------------------------------------------------------------------------- /YOLOv4/tool/torch_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import math 5 | import torch 6 | import numpy as np 7 | from torch.autograd import Variable 8 | 9 | import itertools 10 | import struct # get_image_size 11 | import imghdr # get_image_size 12 | 13 | from tool import utils 14 | 15 | 16 | def bbox_ious(boxes1, boxes2, x1y1x2y2=True): 17 | if x1y1x2y2: 18 | mx = torch.min(boxes1[0], boxes2[0]) 19 | Mx = torch.max(boxes1[2], boxes2[2]) 20 | my = torch.min(boxes1[1], boxes2[1]) 21 | My = torch.max(boxes1[3], boxes2[3]) 22 | w1 = boxes1[2] - boxes1[0] 23 | h1 = boxes1[3] - boxes1[1] 24 | w2 = boxes2[2] - boxes2[0] 25 | h2 = boxes2[3] - boxes2[1] 26 | else: 27 | mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0) 28 | Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0) 29 | my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0) 30 | My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0) 31 | w1 = boxes1[2] 32 | h1 = boxes1[3] 33 | w2 = boxes2[2] 34 | h2 = boxes2[3] 35 | uw = Mx - mx 36 | uh = My - my 37 | cw = w1 + w2 - uw 38 | ch = h1 + h2 - uh 39 | mask = ((cw <= 0) + (ch <= 0) > 0) 40 | area1 = w1 * h1 41 | area2 = w2 * h2 42 | carea = cw * ch 43 | carea[mask] = 0 44 | uarea = area1 + area2 - carea 45 | return carea / uarea 46 | 47 | 48 | def get_region_boxes(boxes_and_confs): 49 | 50 | # print('Getting boxes from boxes and confs ...') 51 | 52 | boxes_list = [] 53 | confs_list = [] 54 | 55 | for item in boxes_and_confs: 56 | boxes_list.append(item[0]) 57 | confs_list.append(item[1]) 58 | 59 | # boxes: [batch, num1 + num2 + num3, 
4] 60 | # confs: [batch, num1 + num2 + num3, num_classes] 61 | boxes = torch.cat(boxes_list, dim=1) 62 | confs = torch.cat(confs_list, dim=1) 63 | 64 | output = torch.cat((boxes, confs), dim=2) 65 | 66 | return output 67 | 68 | 69 | def convert2cpu(gpu_matrix): 70 | return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix) 71 | 72 | 73 | def convert2cpu_long(gpu_matrix): 74 | return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix) 75 | 76 | 77 | 78 | def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1): 79 | model.eval() 80 | t0 = time.time() 81 | 82 | if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image 83 | img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) 84 | elif type(img) == np.ndarray and len(img.shape) == 4: 85 | img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) 86 | else: 87 | print("unknow image type") 88 | exit(-1) 89 | 90 | if use_cuda: 91 | img = img.cuda() 92 | img = torch.autograd.Variable(img) 93 | 94 | t1 = time.time() 95 | 96 | output = model(img) 97 | 98 | t2 = time.time() 99 | 100 | print('-----------------------------------') 101 | print(' Preprocess : %f' % (t1 - t0)) 102 | print(' Model Inference : %f' % (t2 - t1)) 103 | print('-----------------------------------') 104 | 105 | return utils.post_processing(img, conf_thresh, nms_thresh, output) -------------------------------------------------------------------------------- /YOLOv4/tool/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import math 5 | import numpy as np 6 | 7 | import itertools 8 | import struct # get_image_size 9 | import imghdr # get_image_size 10 | 11 | import win32com.client as wincl #### Python's Text-to-speech (tts) engine for windows, multiprocessing 12 | speak = wincl.Dispatch("SAPI.SpVoice") #### This initiates the tts engine 13 | 14 | 15 | def sigmoid(x): 16 | return 1.0 / (np.exp(-x) + 1.) 
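#### Worked example (illustrative values only, not part of the original code) for the two activation helpers in this file:
####     sigmoid(np.array([0.0, 2.0]))         ≈ [0.5, 0.8808]
####     softmax(np.array([[1.0, 2.0, 3.0]]))  ≈ [[0.0900, 0.2447, 0.6652]]
#### sigmoid() above squashes a raw score into (0, 1); softmax() below normalises each row of class
#### scores so they sum to 1 (it expects a 2-D array because it reduces over axis=1).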
17 | 18 | 19 | def softmax(x): 20 | x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1)) 21 | x = x / np.expand_dims(x.sum(axis=1), axis=1) 22 | return x 23 | 24 | 25 | def bbox_iou(box1, box2, x1y1x2y2=True): 26 | 27 | # print('iou box1:', box1) 28 | # print('iou box2:', box2) 29 | 30 | if x1y1x2y2: 31 | mx = min(box1[0], box2[0]) 32 | Mx = max(box1[2], box2[2]) 33 | my = min(box1[1], box2[1]) 34 | My = max(box1[3], box2[3]) 35 | w1 = box1[2] - box1[0] 36 | h1 = box1[3] - box1[1] 37 | w2 = box2[2] - box2[0] 38 | h2 = box2[3] - box2[1] 39 | else: 40 | w1 = box1[2] 41 | h1 = box1[3] 42 | w2 = box2[2] 43 | h2 = box2[3] 44 | 45 | mx = min(box1[0], box2[0]) 46 | Mx = max(box1[0] + w1, box2[0] + w2) 47 | my = min(box1[1], box2[1]) 48 | My = max(box1[1] + h1, box2[1] + h2) 49 | uw = Mx - mx 50 | uh = My - my 51 | cw = w1 + w2 - uw 52 | ch = h1 + h2 - uh 53 | carea = 0 54 | if cw <= 0 or ch <= 0: 55 | return 0.0 56 | 57 | area1 = w1 * h1 58 | area2 = w2 * h2 59 | carea = cw * ch 60 | uarea = area1 + area2 - carea 61 | return carea / uarea 62 | 63 | 64 | def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): 65 | # print(boxes.shape) 66 | x1 = boxes[:, 0] 67 | y1 = boxes[:, 1] 68 | x2 = boxes[:, 0] + boxes[:, 2] 69 | y2 = boxes[:, 1] + boxes[:, 3] 70 | 71 | areas = (x2 - x1) * (y2 - y1) 72 | order = confs.argsort()[::-1] 73 | 74 | keep = [] 75 | while order.size > 0: 76 | idx_self = order[0] 77 | idx_other = order[1:] 78 | 79 | keep.append(idx_self) 80 | 81 | xx1 = np.maximum(x1[idx_self], x1[idx_other]) 82 | yy1 = np.maximum(y1[idx_self], y1[idx_other]) 83 | xx2 = np.minimum(x2[idx_self], x2[idx_other]) 84 | yy2 = np.minimum(y2[idx_self], y2[idx_other]) 85 | 86 | w = np.maximum(0.0, xx2 - xx1) 87 | h = np.maximum(0.0, yy2 - yy1) 88 | inter = w * h 89 | 90 | if min_mode: 91 | over = inter / np.minimum(areas[order[0]], areas[order[1:]]) 92 | else: 93 | over = inter / (areas[order[0]] + areas[order[1:]] - inter) 94 | 95 | inds = np.where(over <= nms_thresh)[0] 96 | order = order[inds + 1] 97 | 98 | return np.array(keep) 99 | 100 | 101 | 102 | def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None, colors = None): 103 | import cv2 104 | img = np.copy(img) 105 | # colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) 106 | colors = np.array(colors) 107 | 108 | def get_color(c, x, max_val): 109 | ratio = float(x) / max_val * 5 110 | i = int(math.floor(ratio)) 111 | j = int(math.ceil(ratio)) 112 | ratio = ratio - i 113 | r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] 114 | return int(r * 255) 115 | 116 | width = img.shape[1] 117 | height = img.shape[0] 118 | # print("weight{} , height {}".format(width, height)) 119 | for i in range(len(boxes)): 120 | box = boxes[i] 121 | x1 = int((box[0] - box[2] / 2.0) * width) 122 | y1 = int((box[1] - box[3] / 2.0) * height) 123 | x2 = int((box[0] + box[2] / 2.0) * width) 124 | y2 = int((box[1] + box[3] / 2.0) * height) 125 | x,y,w,h = x1,y1,x2,y2 126 | 127 | font_face = cv2.FONT_HERSHEY_DUPLEX 128 | font_scale = 1.2 129 | font_thickness = 1 130 | 131 | text_pt = (box[0], box[1] - 3) 132 | text_color = [255, 255, 255] 133 | if color: 134 | rgb = color 135 | else: 136 | rgb = (255, 0, 0) 137 | if len(box) >= 7 and class_names: 138 | cls_conf = box[5] 139 | cls_id = box[6] 140 | print('%s: %f' % (class_names[cls_id], cls_conf)) 141 | 142 | distance = (2 * 3.14 * 180) / (w+ h * 360) * 1000 + 3 ### Distance measuring in Inch 143 | feedback = ("{}".format(class_names[cls_id])+ " " +"is"+" 
at {} ".format(round(distance))+"Inches") 144 | # speak.Speak(feedback) 145 | print(feedback) 146 | text_str = '%s' % (class_names[cls_id]) 147 | text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0] 148 | classes = len(class_names) 149 | offset = cls_id * 123457 % classes 150 | red = get_color(2, offset, classes) 151 | green = get_color(1, offset, classes) 152 | blue = get_color(0, offset, classes) 153 | if color is None: 154 | rgb = (red, green, blue) 155 | cv2.putText(img, str("{:.2f} Inches".format(distance)), (text_w+x,y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, rgb, font_thickness, cv2.LINE_AA) 156 | img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1) 157 | 158 | 159 | img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1) 160 | 161 | 162 | return img 163 | 164 | 165 | def read_truths(lab_path): 166 | if not os.path.exists(lab_path): 167 | return np.array([]) 168 | if os.path.getsize(lab_path): 169 | truths = np.loadtxt(lab_path) 170 | truths = truths.reshape(truths.size / 5, 5) # to avoid single truth problem 171 | return truths 172 | else: 173 | return np.array([]) 174 | 175 | 176 | 177 | 178 | 179 | def post_processing(img, conf_thresh, nms_thresh, output): 180 | 181 | # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] 182 | # num_anchors = 9 183 | # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] 184 | # strides = [8, 16, 32] 185 | # anchor_step = len(anchors) // num_anchors 186 | 187 | t1 = time.time() 188 | 189 | if type(output).__name__ != 'ndarray': 190 | output = output.cpu().detach().numpy() 191 | 192 | # [batch, num, 4] 193 | box_array = output[:, :, :4] 194 | 195 | # [batch, num, num_classes] 196 | confs = output[:, :, 4:] 197 | 198 | # [batch, num, num_classes] --> [batch, num] 199 | max_conf = np.max(confs, axis=2) 200 | max_id = np.argmax(confs, axis=2) 201 | 202 | t2 = time.time() 203 | 204 | bboxes_batch = [] 205 | for i in range(box_array.shape[0]): 206 | 207 | argwhere = max_conf[i] > conf_thresh 208 | l_box_array = box_array[i, argwhere, :] 209 | l_max_conf = max_conf[i, argwhere] 210 | l_max_id = max_id[i, argwhere] 211 | 212 | keep = nms_cpu(l_box_array, l_max_conf, nms_thresh) 213 | 214 | bboxes = [] 215 | if (keep.size > 0): 216 | l_box_array = l_box_array[keep, :] 217 | l_max_conf = l_max_conf[keep] 218 | l_max_id = l_max_id[keep] 219 | 220 | for j in range(l_box_array.shape[0]): 221 | bboxes.append([l_box_array[j, 0], l_box_array[j, 1], l_box_array[j, 2], l_box_array[j, 3], l_max_conf[j], l_max_conf[j], l_max_id[j]]) 222 | 223 | bboxes_batch.append(bboxes) 224 | 225 | t3 = time.time() 226 | 227 | # print('-----------------------------------') 228 | # print(' max and argmax : %f' % (t2 - t1)) 229 | # print(' nms : %f' % (t3 - t2)) 230 | # print('Post processing total : %f' % (t3 - t1)) 231 | # print('-----------------------------------') 232 | 233 | return bboxes_batch -------------------------------------------------------------------------------- /YOLOv4/tool/yolo_layer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from tool.torch_utils import * 4 | 5 | 6 | def yolo_forward_alternative(output, conf_thresh, num_classes, anchors, num_anchors, only_objectness=1, 7 | validation=False): 8 | # Output would be invalid if it does not satisfy this assert 9 | # assert (output.size(1) == (5 + num_classes) * num_anchors) 10 | 11 | # print(output.size()) 12 | 
13 | # Slice the second dimension (channel) of output into: 14 | # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ] 15 | # And then into 16 | # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ] 17 | batch = output.size(0) 18 | H = output.size(2) 19 | W = output.size(3) 20 | 21 | device = None 22 | cuda_check = output.is_cuda 23 | if cuda_check: 24 | device = output.get_device() 25 | 26 | 27 | # Prepare C-x, C-y, P-w, P-h (None of them are torch related) 28 | grid_x = np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0).reshape(1, 1, H * W).repeat(batch, 0).repeat(num_anchors, 1) 29 | grid_y = np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1).reshape(1, 1, H * W).repeat(batch, 0).repeat(num_anchors, 1) 30 | # Shape: [batch, num_anchors, H * W] 31 | grid_x_tensor = torch.tensor(grid_x, device=device, dtype=torch.float32) 32 | grid_y_tensor = torch.tensor(grid_y, device=device, dtype=torch.float32) 33 | 34 | anchor_array = np.array(anchors).reshape(1, num_anchors, 2) 35 | anchor_array = anchor_array.repeat(batch, 0) 36 | anchor_array = np.expand_dims(anchor_array, axis=3).repeat(H * W, 3) 37 | # Shape: [batch, num_anchors, 2, H * W] 38 | anchor_tensor = torch.tensor(anchor_array, device=device, dtype=torch.float32) 39 | 40 | # normalize coordinates to [0, 1] 41 | normal_array = np.array([1.0 / W, 1.0 / H, 1.0 / W, 1.0 / H], dtype=np.float32).reshape(1, 1, 4) 42 | normal_array = normal_array.repeat(batch, 0) 43 | normal_array = normal_array.repeat(num_anchors * H * W, 1) 44 | # Shape: [batch, num_anchors * H * W, 4] 45 | normal_tensor = torch.tensor(normal_array, device=device, dtype=torch.float32) 46 | 47 | bxy_list = [] 48 | bwh_list = [] 49 | det_confs_list = [] 50 | cls_confs_list = [] 51 | 52 | for i in range(num_anchors): 53 | begin = i * (5 + num_classes) 54 | end = (i + 1) * (5 + num_classes) 55 | 56 | bxy_list.append(output[:, begin : begin + 2]) 57 | bwh_list.append(output[:, begin + 2 : begin + 4]) 58 | det_confs_list.append(output[:, begin + 4 : begin + 5]) 59 | cls_confs_list.append(output[:, begin + 5 : end]) 60 | 61 | # Shape: [batch, num_anchors * 2, H, W] 62 | bxy = torch.cat(bxy_list, dim=1) 63 | # Shape: [batch, num_anchors * 2, H, W] 64 | bwh = torch.cat(bwh_list, dim=1) 65 | 66 | # Shape: [batch, num_anchors, H, W] 67 | det_confs = torch.cat(det_confs_list, dim=1) 68 | # Shape: [batch, num_anchors * H * W] 69 | det_confs = det_confs.view(batch, num_anchors * H * W) 70 | 71 | # Shape: [batch, num_anchors * num_classes, H, W] 72 | cls_confs = torch.cat(cls_confs_list, dim=1) 73 | # Shape: [batch, num_anchors, num_classes, H * W] 74 | cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W) 75 | # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] 76 | cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes) 77 | 78 | # Apply sigmoid(), exp() and softmax() to slices 79 | # 80 | bxy = torch.sigmoid(bxy) 81 | bwh = torch.exp(bwh) 82 | det_confs = torch.sigmoid(det_confs) 83 | cls_confs = torch.nn.Softmax(dim=2)(cls_confs) 84 | 85 | # Shape: [batch, num_anchors, 2, H * W] 86 | bxy = bxy.view(batch, num_anchors, 2, H * W) 87 | # Shape: [batch, num_anchors, 2, H * W] 88 | bwh = bwh.view(batch, num_anchors, 2, H * W) 89 | 90 | # Apply C-x, C-y, P-w, P-h 91 | bxy[:, :, 0] += grid_x_tensor 92 | bxy[:, :, 1] += grid_y_tensor 93 | 94 | print(anchor_tensor.size()) 95 | bwh *= anchor_tensor 96 | 97 | # Shape: [batch, num_anchors, 4, H * W] 
--> [batch, num_anchors * H * W, 4] 98 | boxes = torch.cat((bxy, bwh), dim=2).permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, 4) 99 | 100 | print(normal_tensor.size()) 101 | boxes *= normal_tensor 102 | 103 | det_confs = det_confs.view(batch, num_anchors * H * W, 1) 104 | confs = cls_confs * det_confs 105 | 106 | # boxes: [batch, num_anchors * H * W, 4] 107 | # confs: [batch, num_anchors * H * W, num_classes] 108 | 109 | return boxes, confs 110 | 111 | 112 | 113 | def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, 114 | validation=False): 115 | # Output would be invalid if it does not satisfy this assert 116 | # assert (output.size(1) == (5 + num_classes) * num_anchors) 117 | 118 | # print(output.size()) 119 | 120 | # Slice the second dimension (channel) of output into: 121 | # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ] 122 | # And then into 123 | # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ] 124 | batch = output.size(0) 125 | H = output.size(2) 126 | W = output.size(3) 127 | 128 | bxy_list = [] 129 | bwh_list = [] 130 | det_confs_list = [] 131 | cls_confs_list = [] 132 | 133 | for i in range(num_anchors): 134 | begin = i * (5 + num_classes) 135 | end = (i + 1) * (5 + num_classes) 136 | 137 | bxy_list.append(output[:, begin : begin + 2]) 138 | bwh_list.append(output[:, begin + 2 : begin + 4]) 139 | det_confs_list.append(output[:, begin + 4 : begin + 5]) 140 | cls_confs_list.append(output[:, begin + 5 : end]) 141 | 142 | # Shape: [batch, num_anchors * 2, H, W] 143 | bxy = torch.cat(bxy_list, dim=1) 144 | # Shape: [batch, num_anchors * 2, H, W] 145 | bwh = torch.cat(bwh_list, dim=1) 146 | 147 | # Shape: [batch, num_anchors, H, W] 148 | det_confs = torch.cat(det_confs_list, dim=1) 149 | # Shape: [batch, num_anchors * H * W] 150 | det_confs = det_confs.view(batch, num_anchors * H * W) 151 | 152 | # Shape: [batch, num_anchors * num_classes, H, W] 153 | cls_confs = torch.cat(cls_confs_list, dim=1) 154 | # Shape: [batch, num_anchors, num_classes, H * W] 155 | cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W) 156 | # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] 157 | cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes) 158 | 159 | # Apply sigmoid(), exp() and softmax() to slices 160 | # 161 | bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1) 162 | bwh = torch.exp(bwh) 163 | det_confs = torch.sigmoid(det_confs) 164 | cls_confs = torch.nn.Softmax(dim=2)(cls_confs) 165 | 166 | # Prepare C-x, C-y, P-w, P-h (None of them are torch related) 167 | grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0) 168 | grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0) 169 | # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1) 170 | # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W) 171 | 172 | anchor_w = [] 173 | anchor_h = [] 174 | for i in range(num_anchors): 175 | anchor_w.append(anchors[i * 2]) 176 | anchor_h.append(anchors[i * 2 + 1]) 177 | 178 | device = None 179 | cuda_check = output.is_cuda 180 | if cuda_check: 181 | device = output.get_device() 182 | 183 | bx_list = [] 184 | by_list = [] 185 | bw_list = [] 186 | bh_list = [] 187 | 188 | # Apply C-x, C-y, P-w, P-h 189 | for i in range(num_anchors): 190 
| ii = i * 2 191 | # Shape: [batch, 1, H, W] 192 | bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32) 193 | # Shape: [batch, 1, H, W] 194 | by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32) 195 | # Shape: [batch, 1, H, W] 196 | bw = bwh[:, ii : ii + 1] * anchor_w[i] 197 | # Shape: [batch, 1, H, W] 198 | bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i] 199 | 200 | bx_list.append(bx) 201 | by_list.append(by) 202 | bw_list.append(bw) 203 | bh_list.append(bh) 204 | 205 | 206 | ######################################## 207 | # Figure out bboxes from slices # 208 | ######################################## 209 | 210 | # Shape: [batch, num_anchors, H, W] 211 | bx = torch.cat(bx_list, dim=1) 212 | # Shape: [batch, num_anchors, H, W] 213 | by = torch.cat(by_list, dim=1) 214 | # Shape: [batch, num_anchors, H, W] 215 | bw = torch.cat(bw_list, dim=1) 216 | # Shape: [batch, num_anchors, H, W] 217 | bh = torch.cat(bh_list, dim=1) 218 | 219 | # Shape: [batch, 2 * num_anchors, H, W] 220 | bx_bw = torch.cat((bx, bw), dim=1) 221 | # Shape: [batch, 2 * num_anchors, H, W] 222 | by_bh = torch.cat((by, bh), dim=1) 223 | 224 | # normalize coordinates to [0, 1] 225 | bx_bw /= W 226 | by_bh /= H 227 | 228 | # Shape: [batch, num_anchors * H * W, 1] 229 | bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1) 230 | by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1) 231 | bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1) 232 | bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1) 233 | 234 | # Shape: [batch, num_anchors * h * w, 4] 235 | boxes = torch.cat((bx, by, bw, bh), dim=2).view(batch, num_anchors * H * W, 4) 236 | 237 | # boxes: [batch, num_anchors * H * W, num_classes, 4] 238 | # cls_confs: [batch, num_anchors * H * W, num_classes] 239 | # det_confs: [batch, num_anchors * H * W] 240 | 241 | det_confs = det_confs.view(batch, num_anchors * H * W, 1) 242 | confs = cls_confs * det_confs 243 | 244 | # boxes: [batch, num_anchors * H * W, 4] 245 | # confs: [batch, num_anchors * H * W, num_classes] 246 | 247 | return boxes, confs 248 | 249 | 250 | class YoloLayer(nn.Module): 251 | ''' Yolo layer 252 | model_out: while inference,is post-processing inside or outside the model 253 | true:outside 254 | ''' 255 | def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=False): 256 | super(YoloLayer, self).__init__() 257 | self.anchor_mask = anchor_mask 258 | self.num_classes = num_classes 259 | self.anchors = anchors 260 | self.num_anchors = num_anchors 261 | self.anchor_step = len(anchors) // num_anchors 262 | self.coord_scale = 1 263 | self.noobject_scale = 1 264 | self.object_scale = 5 265 | self.class_scale = 1 266 | self.thresh = 0.6 267 | self.stride = stride 268 | self.seen = 0 269 | self.scale_x_y = 1 270 | 271 | self.model_out = model_out 272 | 273 | def forward(self, output, target=None): 274 | if self.training: 275 | return output 276 | masked_anchors = [] 277 | for m in self.anchor_mask: 278 | masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step] 279 | masked_anchors = [anchor / self.stride for anchor in masked_anchors] 280 | 281 | return yolo_forward(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask),scale_x_y=self.scale_x_y) -------------------------------------------------------------------------------- /app.py: 
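The `yolo_forward()` / `YoloLayer` code above returns a raw `(boxes, confs)` pair rather than finished detections. A minimal sketch of feeding that pair into `post_processing()` from `YOLOv4/tool/utils.py` (listed earlier) is shown below; the `torch.cat` glue, the helper name `detections_from_head`, and the 0.4/0.6 thresholds are assumptions for illustration, not the repository's own pipeline code.

``` python
# Sketch only: connect the (boxes, confs) pair produced by YoloLayer to
# post_processing() from tool/utils.py. The concatenation step and the
# threshold values are assumptions for illustration.
import torch
from tool.utils import post_processing

def detections_from_head(img, boxes, confs, conf_thresh=0.4, nms_thresh=0.6):
    # boxes: [batch, N, 4] normalized box coordinates
    # confs: [batch, N, num_classes] per-class confidences (cls_conf * det_conf)
    output = torch.cat([boxes, confs], dim=2)   # [batch, N, 4 + num_classes]
    # post_processing() thresholds on the per-class maximum, runs nms_cpu()
    # per image, and returns one list per image of 7-element detections:
    # the four box values followed by conf, conf, cls_id.
    return post_processing(img, conf_thresh, nms_thresh, output)
```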
-------------------------------------------------------------------------------- 1 | # Flask utils 2 | from flask import Flask, redirect, url_for, request, render_template, Response 3 | from werkzeug.utils import secure_filename 4 | from gevent.pywsgi import WSGIServer 5 | from camera import ObjectDetection 6 | 7 | app = Flask(__name__) 8 | @app.route("/") 9 | def main(): 10 | return render_template("index.html") 11 | 12 | def gen(camera): 13 | while True: 14 | frame = camera.main() 15 | if frame != "": 16 | yield (b'--frame\r\n' 17 | b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n') 18 | 19 | @app.route('/video_feed') 20 | def video_feed(): 21 | id = 0 22 | return Response(gen(ObjectDetection(id)), mimetype='multipart/x-mixed-replace; boundary=frame') 23 | 24 | 25 | if __name__ == '__main__': 26 | # Serve the app with gevent 27 | app.run(host='127.0.0.1', threaded=True, debug = True) 28 | -------------------------------------------------------------------------------- /bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import random 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | def confidence_filter(result, confidence): 10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) 11 | result = result*conf_mask 12 | 13 | return result 14 | 15 | def confidence_filter_cls(result, confidence): 16 | max_scores = torch.max(result[:,:,5:25], 2)[0] 17 | res = torch.cat((result, max_scores),2) 18 | print(res.shape) 19 | 20 | 21 | cond_1 = (res[:,:,4] > confidence).float() 22 | cond_2 = (res[:,:,25] > 0.995).float() 23 | 24 | conf = cond_1 + cond_2 25 | conf = torch.clamp(conf, 0.0, 1.0) 26 | conf = conf.unsqueeze(2) 27 | result = result*conf 28 | return result 29 | 30 | 31 | 32 | def get_abs_coord(box): 33 | box[2], box[3] = abs(box[2]), abs(box[3]) 34 | x1 = (box[0] - box[2]/2) - 1 35 | y1 = (box[1] - box[3]/2) - 1 36 | x2 = (box[0] + box[2]/2) - 1 37 | y2 = (box[1] + box[3]/2) - 1 38 | return x1, y1, x2, y2 39 | 40 | 41 | 42 | def sanity_fix(box): 43 | if (box[0] > box[2]): 44 | box[0], box[2] = box[2], box[0] 45 | 46 | if (box[1] > box[3]): 47 | box[1], box[3] = box[3], box[1] 48 | 49 | return box 50 | 51 | def bbox_iou(box1, box2): 52 | """ 53 | Returns the IoU of two bounding boxes 54 | 55 | 56 | """ 57 | #Get the coordinates of bounding boxes 58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3] 59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3] 60 | 61 | #get the corrdinates of the intersection rectangle 62 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 63 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 64 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 65 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 66 | 67 | #Intersection area 68 | if torch.cuda.is_available(): 69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda()) 70 | else: 71 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape)) 72 | 73 | #Union Area 74 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) 75 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) 76 | 77 | iou = inter_area / (b1_area + b2_area - inter_area) 78 | 79 | return iou 80 | 81 | 82 | def pred_corner_coord(prediction): 83 | #Get indices of non-zero confidence bboxes 84 | 
ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 85 | 86 | box = prediction[ind_nz[0], ind_nz[1]] 87 | 88 | 89 | box_a = box.new(box.shape) 90 | box_a[:,0] = (box[:,0] - box[:,2]/2) 91 | box_a[:,1] = (box[:,1] - box[:,3]/2) 92 | box_a[:,2] = (box[:,0] + box[:,2]/2) 93 | box_a[:,3] = (box[:,1] + box[:,3]/2) 94 | box[:,:4] = box_a[:,:4] 95 | 96 | prediction[ind_nz[0], ind_nz[1]] = box 97 | 98 | return prediction 99 | 100 | 101 | 102 | 103 | def write(x, batches, results, colors, classes): 104 | c1 = tuple(x[1:3].int()) 105 | c2 = tuple(x[3:5].int()) 106 | img = results[int(x[0])] 107 | cls = int(x[-1]) 108 | label = "{0}".format(classes[cls]) 109 | color = random.choice(colors) 110 | cv2.rectangle(img, c1, c2,color, 1) 111 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 112 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 113 | cv2.rectangle(img, c1, c2,color, -1) 114 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 115 | return img 116 | -------------------------------------------------------------------------------- /camera.py: -------------------------------------------------------------------------------- 1 | import torch,cv2,random,os,time 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | import pickle as pkl 6 | import argparse 7 | import threading, queue 8 | from torch.multiprocessing import Pool, Process, set_start_method 9 | from util import write_results, load_classes 10 | from preprocess import letterbox_image 11 | from darknet import Darknet 12 | from imutils.video import WebcamVideoStream,FPS 13 | # from camera import write 14 | import win32com.client as wincl #### Python's Text-to-speech (tts) engine for windows, multiprocessing 15 | speak = wincl.Dispatch("SAPI.SpVoice") #### This initiates the tts engine 16 | 17 | torch.multiprocessing.set_start_method('spawn', force=True) 18 | 19 | ## Setting up torch for gpu utilization 20 | if torch.cuda.is_available(): 21 | torch.backends.cudnn.enabled = True 22 | torch.backends.cudnn.benchmark = True 23 | torch.backends.cudnn.deterministic = True 24 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 25 | 26 | def prep_image(img, inp_dim): 27 | """ 28 | Prepare image for inputting to the neural network. 
29 | Returns a Variable 30 | """ 31 | orig_im = img 32 | dim = orig_im.shape[1], orig_im.shape[0] 33 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 34 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 35 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 36 | return img_, orig_im, dim 37 | 38 | labels = {} 39 | b_boxes = {} 40 | def write(bboxes, img, classes, colors): 41 | """ 42 | Draws the bounding box in every frame over the objects that the model detects 43 | """ 44 | class_idx = bboxes 45 | bboxes = bboxes[1:5] 46 | bboxes = bboxes.cpu().data.numpy() 47 | bboxes = bboxes.astype(int) 48 | b_boxes.update({"bbox":bboxes.tolist()}) 49 | # bboxes = bboxes + [150,100,200,200] # personal choice you can modify this to get distance as accurate as possible 50 | bboxes = torch.from_numpy(bboxes) 51 | cls = int(class_idx[-1]) 52 | label = "{0}".format(classes[cls]) 53 | labels.update({"Current Object":label}) 54 | color = random.choice(colors) 55 | 56 | ## Put text configuration on frame 57 | text_str = '%s' % (label) 58 | font_face = cv2.FONT_HERSHEY_DUPLEX 59 | font_scale = 0.6 60 | font_thickness = 1 61 | text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0] 62 | text_pt = (bboxes[0], bboxes[1] - 3) 63 | text_color = [255, 255, 255] 64 | 65 | 66 | ## Distance Meaasurement for each bounding box 67 | x, y, w, h = bboxes[0], bboxes[1], bboxes[2], bboxes[3] 68 | ## item() is used to retrieve the value from the tensor 69 | distance = (2 * 3.14 * 180) / (w.item()+ h.item() * 360) * 1000 + 3 ### Distance measuring in Inch 70 | feedback = ("{}".format(labels["Current Object"])+ " " +"is"+" at {} ".format(round(distance))+"Inches") 71 | # # speak.Speak(feedback) # If you are running this on linux based OS kindly use espeak. 
Using this speaking library in winodws will add unnecessary latency 72 | print(feedback) 73 | 74 | cv2.putText(img, str("{:.2f} Inches".format(distance)), (text_w+x,y), cv2.FONT_HERSHEY_DUPLEX, font_scale, (0,255,0), font_thickness, cv2.LINE_AA) 75 | cv2.rectangle(img, (bboxes[0],bboxes[1]),(bboxes[2] + text_w -30,bboxes[3]), color, 2) 76 | cv2.putText(img, text_str, text_pt, font_face, font_scale, text_color, font_thickness, cv2.LINE_AA) 77 | 78 | return img 79 | 80 | class ObjectDetection: 81 | def __init__(self, id): 82 | # self.cap = cv2.VideoCapture(id) 83 | self.cap = WebcamVideoStream(src = id).start() 84 | self.cfgfile = "cfg/yolov3.cfg" 85 | # self.cfgfile = 'cfg/yolov3-tiny.cfg' 86 | self.weightsfile = "yolov3.weights" 87 | # self.weightsfile = 'yolov3-tiny.weights' 88 | self.confidence = float(0.6) 89 | self.nms_thesh = float(0.8) 90 | self.num_classes = 80 91 | self.classes = load_classes('data/coco.names') 92 | self.colors = pkl.load(open("pallete", "rb")) 93 | self.model = Darknet(self.cfgfile) 94 | self.CUDA = torch.cuda.is_available() 95 | self.model.load_weights(self.weightsfile) 96 | self.model.net_info["height"] = 160 97 | self.inp_dim = int(self.model.net_info["height"]) 98 | self.width = 1280 #640#1280 99 | self.height = 720 #360#720 100 | print("Loading network.....") 101 | if self.CUDA: 102 | self.model.cuda() 103 | print("Network successfully loaded") 104 | assert self.inp_dim % 32 == 0 105 | assert self.inp_dim > 32 106 | self.model.eval() 107 | 108 | def main(self): 109 | q = queue.Queue() 110 | while True: 111 | def frame_render(queue_from_cam): 112 | frame = self.cap.read() # If you capture stream using opencv (cv2.VideoCapture()) the use the following line 113 | # ret, frame = self.cap.read() 114 | frame = cv2.resize(frame,(self.width, self.height)) 115 | queue_from_cam.put(frame) 116 | cam = threading.Thread(target=frame_render, args=(q,)) 117 | cam.start() 118 | cam.join() 119 | frame = q.get() 120 | q.task_done() 121 | fps = FPS().start() 122 | try: 123 | img, orig_im, dim = prep_image(frame, self.inp_dim) 124 | im_dim = torch.FloatTensor(dim).repeat(1,2) 125 | if self.CUDA: #### If you have a gpu properly installed then it will run on the gpu 126 | im_dim = im_dim.cuda() 127 | img = img.cuda() 128 | # with torch.no_grad(): #### Set the model in the evaluation mode 129 | output = self.model(Variable(img), self.CUDA) 130 | output = write_results(output, self.confidence, self.num_classes, nms = True, nms_conf = self.nms_thesh) #### Localize the objects in a frame 131 | output = output.type(torch.half) 132 | 133 | if list(output.size()) == [1,86]: 134 | print(output.size()) 135 | pass 136 | else: 137 | output[:,1:5] = torch.clamp(output[:,1:5], 0.0, float(self.inp_dim))/self.inp_dim 138 | 139 | # im_dim = im_dim.repeat(output.size(0), 1) 140 | output[:,[1,3]] *= frame.shape[1] 141 | output[:,[2,4]] *= frame.shape[0] 142 | list(map(lambda boxes: write(boxes, frame, self.classes, self.colors),output)) 143 | 144 | except: 145 | pass 146 | 147 | fps.update() 148 | fps.stop() 149 | ret, jpeg = cv2.imencode('.jpg', frame) 150 | print("[INFO] elasped time: {:.2f}".format(fps.elapsed())) 151 | print("[INFO] approx. 
FPS: {:.1f}".format(fps.fps())) 152 | 153 | return jpeg.tostring() -------------------------------------------------------------------------------- /cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | 
batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 
17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 
2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolov3-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | 
batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | 
activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 
635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | 823 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure 
= 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=255 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=80 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=255 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=80 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width= 320 9 | height = 320 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 
| stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | 
pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | 
batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 
| 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /object_detection.py: -------------------------------------------------------------------------------- 1 | import torch,cv2,random,os,time 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | import pickle as pkl 6 | import argparse 7 | import threading, queue 8 | from torch.multiprocessing import Pool, Process, set_start_method 9 | from util import write_results, load_classes 10 | from preprocess 
import letterbox_image 11 | from darknet import Darknet 12 | from imutils.video import WebcamVideoStream,FPS 13 | # from camera import write 14 | import win32com.client as wincl #### Python's Text-to-speech (tts) engine for windows, multiprocessing 15 | speak = wincl.Dispatch("SAPI.SpVoice") #### This initiates the tts engine 16 | 17 | torch.multiprocessing.set_start_method('spawn', force=True) 18 | 19 | ## Setting up torch for gpu utilization 20 | if torch.cuda.is_available(): 21 | torch.backends.cudnn.enabled = True 22 | torch.backends.cudnn.benchmark = True 23 | torch.backends.cudnn.deterministic = True 24 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 25 | 26 | def prep_image(img, inp_dim): 27 | """ 28 | Prepare image for inputting to the neural network. 29 | Returns a Variable 30 | """ 31 | orig_im = img 32 | dim = orig_im.shape[1], orig_im.shape[0] 33 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 34 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 35 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 36 | return img_, orig_im, dim 37 | 38 | labels = {} 39 | b_boxes = {} 40 | def write(bboxes, img, classes, colors): 41 | """ 42 | Draws the bounding box in every frame over the objects that the model detects 43 | """ 44 | class_idx = bboxes 45 | bboxes = bboxes[1:5] 46 | bboxes = bboxes.cpu().data.numpy() 47 | bboxes = bboxes.astype(int) 48 | b_boxes.update({"bbox":bboxes.tolist()}) 49 | # bboxes = bboxes + [150,100,200,200] # personal choice you can modify this to get distance as accurate as possible 50 | bboxes = torch.from_numpy(bboxes) 51 | cls = int(class_idx[-1]) 52 | label = "{0}".format(classes[cls]) 53 | labels.update({"Current Object":label}) 54 | color = random.choice(colors) 55 | 56 | ## Put text configuration on frame 57 | text_str = '%s' % (label) 58 | font_face = cv2.FONT_HERSHEY_DUPLEX 59 | font_scale = 0.6 60 | font_thickness = 1 61 | text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0] 62 | text_pt = (bboxes[0], bboxes[1] - 3) 63 | text_color = [255, 255, 255] 64 | 65 | 66 | ## Distance Meaasurement for each bounding box 67 | x, y, w, h = bboxes[0], bboxes[1], bboxes[2], bboxes[3] 68 | ## item() is used to retrieve the value from the tensor 69 | distance = (2 * 3.14 * 180) / (w.item()+ h.item() * 360) * 1000 + 3 ### Distance measuring in Inch 70 | feedback = ("{}".format(labels["Current Object"])+ " " +"is"+" at {} ".format(round(distance))+"Inches") 71 | # # speak.Speak(feedback) # If you are running this on linux based OS kindly use espeak. 
Using this speaking library in winodws will add unnecessary latency 72 | print(feedback) 73 | 74 | cv2.putText(img, str("{:.2f} Inches".format(distance)), (text_w+x,y), cv2.FONT_HERSHEY_DUPLEX, font_scale, (0,255,0), font_thickness, cv2.LINE_AA) 75 | cv2.rectangle(img, (bboxes[0],bboxes[1]),(bboxes[2] + text_w -30,bboxes[3]), color, 2) 76 | cv2.putText(img, text_str, text_pt, font_face, font_scale, text_color, font_thickness, cv2.LINE_AA) 77 | 78 | return img 79 | 80 | class ObjectDetection: 81 | def __init__(self, id): 82 | # self.cap = cv2.VideoCapture(id) 83 | self.cap = WebcamVideoStream(src = id).start() 84 | self.cfgfile = "cfg/yolov3.cfg" 85 | # self.cfgfile = 'cfg/yolov3-tiny.cfg' 86 | self.weightsfile = "yolov3.weights" 87 | # self.weightsfile = 'yolov3-tiny.weights' 88 | self.confidence = float(0.6) 89 | self.nms_thesh = float(0.8) 90 | self.num_classes = 80 91 | self.classes = load_classes('data/coco.names') 92 | self.colors = pkl.load(open("pallete", "rb")) 93 | self.model = Darknet(self.cfgfile) 94 | self.CUDA = torch.cuda.is_available() 95 | self.model.load_weights(self.weightsfile) 96 | self.model.net_info["height"] = 160 97 | self.inp_dim = int(self.model.net_info["height"]) 98 | self.width = 1280 #640#1280 99 | self.height = 720 #360#720 100 | print("Loading network.....") 101 | if self.CUDA: 102 | self.model.cuda() 103 | print("Network successfully loaded") 104 | assert self.inp_dim % 32 == 0 105 | assert self.inp_dim > 32 106 | self.model.eval() 107 | 108 | def main(self): 109 | q = queue.Queue() 110 | while True: 111 | def frame_render(queue_from_cam): 112 | frame = self.cap.read() # If you capture stream using opencv (cv2.VideoCapture()) the use the following line 113 | # ret, frame = self.cap.read() 114 | frame = cv2.resize(frame,(self.width, self.height)) 115 | queue_from_cam.put(frame) 116 | cam = threading.Thread(target=frame_render, args=(q,)) 117 | cam.start() 118 | cam.join() 119 | frame = q.get() 120 | q.task_done() 121 | fps = FPS().start() 122 | try: 123 | img, orig_im, dim = prep_image(frame, self.inp_dim) 124 | im_dim = torch.FloatTensor(dim).repeat(1,2) 125 | if self.CUDA: #### If you have a gpu properly installed then it will run on the gpu 126 | im_dim = im_dim.cuda() 127 | img = img.cuda() 128 | # with torch.no_grad(): #### Set the model in the evaluation mode 129 | output = self.model(Variable(img), self.CUDA) 130 | output = write_results(output, self.confidence, self.num_classes, nms = True, nms_conf = self.nms_thesh) #### Localize the objects in a frame 131 | output = output.type(torch.half) 132 | 133 | if list(output.size()) == [1,86]: 134 | print(output.size()) 135 | pass 136 | else: 137 | output[:,1:5] = torch.clamp(output[:,1:5], 0.0, float(self.inp_dim))/self.inp_dim 138 | 139 | # im_dim = im_dim.repeat(output.size(0), 1) 140 | output[:,[1,3]] *= frame.shape[1] 141 | output[:,[2,4]] *= frame.shape[0] 142 | list(map(lambda boxes: write(boxes, frame, self.classes, self.colors),output)) 143 | 144 | except: 145 | pass 146 | 147 | fps.update() 148 | fps.stop() 149 | print("[INFO] elasped time: {:.2f}".format(fps.elapsed())) 150 | print("[INFO] approx. 
FPS: {:.1f}".format(fps.fps())) 151 | cv2.imshow("Object Detection Window", frame) 152 | 153 | if cv2.waitKey(1) & 0xFF == ord('q'): 154 | break 155 | continue 156 | torch.cuda.empty_cache() 157 | 158 | 159 | if __name__ == "__main__": 160 | id = 0 161 | ObjectDetection(id).main() 162 | -------------------------------------------------------------------------------- /pallete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paul-pias/Object-Detection-and-Distance-Measurement/d03baa0d99626190c87fccdd75fbc67ce8d176f8/pallete -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import numpy as np 8 | import cv2 9 | import matplotlib.pyplot as plt 10 | from util import count_parameters as count 11 | from util import convert2cpu as cpu 12 | from PIL import Image, ImageDraw 13 | 14 | 15 | def letterbox_image(img, inp_dim): 16 | '''resize image with unchanged aspect ratio using padding''' 17 | img_w, img_h = img.shape[1], img.shape[0] 18 | w, h = inp_dim 19 | new_w = int(img_w * min(w/img_w, h/img_h)) 20 | new_h = int(img_h * min(w/img_w, h/img_h)) 21 | resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC) 22 | 23 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 24 | 25 | canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image 26 | 27 | return canvas 28 | 29 | 30 | 31 | def prep_image(img, inp_dim): 32 | """ 33 | Prepare image for inputting to the neural network. 
34 | 35 | Returns a Variable 36 | """ 37 | 38 | orig_im = cv2.imread(img) 39 | dim = orig_im.shape[1], orig_im.shape[0] 40 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 41 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 42 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 43 | return img_, orig_im, dim 44 | 45 | def prep_image_pil(img, network_dim): 46 | orig_im = Image.open(img) 47 | img = orig_im.convert('RGB') 48 | dim = img.size 49 | img = img.resize(network_dim) 50 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) 51 | img = img.view(*network_dim, 3).transpose(0,1).transpose(0,2).contiguous() 52 | img = img.view(1, 3,*network_dim) 53 | img = img.float().div(255.0) 54 | return (img, orig_im, dim) 55 | 56 | def inp_to_image(inp): 57 | inp = inp.cpu().squeeze() 58 | inp = inp*255 59 | try: 60 | inp = inp.data.numpy() 61 | except RuntimeError: 62 | inp = inp.numpy() 63 | inp = inp.transpose(1,2,0) 64 | 65 | inp = inp[:,:,::-1] 66 | return inp 67 | 68 | 69 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.1.0.25 2 | numpy==1.17.0 3 | pandas==0.25.1 4 | torch_nightly==1.2.0.dev20190807+cpu 5 | matplotlib==3.1.1 6 | Pillow>=7.1.0 7 | torch==1.2.0 8 | imutils 9 | -------------------------------------------------------------------------------- /templates/12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paul-pias/Object-Detection-and-Distance-Measurement/d03baa0d99626190c87fccdd75fbc67ce8d176f8/templates/12.jpg -------------------------------------------------------------------------------- /templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
[raw HTML markup of the templates was not preserved in this dump; only the Jinja directives and visible text survive] 18 | 19 | 34 | 35 | {% block content %} 36 | 37 | {% endblock %} 38 | 39 | 40 | 41 | 42 | 43 |
-------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} {% block content %} 2 | 3 | 4 | 5 | 6 | 7 | 8 | Camera - 01 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | {% endblock %} -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | import cv2 10 | import matplotlib.pyplot as plt 11 | from bbox import bbox_iou 12 | 13 | def count_parameters(model): 14 | return sum(p.numel() for p in model.parameters()) 15 | 16 | def count_learnable_parameters(model): 17 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 18 | 19 | def convert2cpu(matrix): 20 | if matrix.is_cuda: 21 | return torch.FloatTensor(matrix.size()).copy_(matrix) 22 | else: 23 | return matrix 24 | 25 | def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True): 26 | batch_size = prediction.size(0) 27 | stride = inp_dim // prediction.size(2) 28 | grid_size = inp_dim // stride 29 | bbox_attrs = 5 + num_classes 30 | num_anchors = len(anchors) 31 | 32 | anchors = [(a[0]/stride, a[1]/stride) for a in anchors] 33 | 34 | 35 | 36 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 37 | prediction = prediction.transpose(1,2).contiguous() 38 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 39 | 40 | 41 | #Sigmoid the centre_X, centre_Y. and object confidencce 42 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 43 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 44 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 45 | 46 | 47 | 48 | #Add the center offsets 49 | grid_len = np.arange(grid_size) 50 | a,b = np.meshgrid(grid_len, grid_len) 51 | 52 | x_offset = torch.FloatTensor(a).view(-1,1) 53 | y_offset = torch.FloatTensor(b).view(-1,1) 54 | 55 | if CUDA: 56 | x_offset = x_offset.cuda() 57 | y_offset = y_offset.cuda() 58 | 59 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 60 | 61 | prediction[:,:,:2] += x_y_offset 62 | 63 | #log space transform height and the width 64 | anchors = torch.FloatTensor(anchors) 65 | 66 | if CUDA: 67 | anchors = anchors.cuda() 68 | 69 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 70 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 71 | 72 | #Softmax the class scores 73 | prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes])) 74 | 75 | prediction[:,:,:4] *= stride 76 | 77 | 78 | return prediction 79 | 80 | def load_classes(namesfile): 81 | fp = open(namesfile, "r") 82 | names = fp.read().split("\n")[:-1] 83 | return names 84 | 85 | def get_im_dim(im): 86 | im = cv2.imread(im) 87 | w,h = im.shape[1], im.shape[0] 88 | return w,h 89 | 90 | def unique(tensor): 91 | tensor_np = tensor.cpu().numpy() 92 | unique_np = np.unique(tensor_np) 93 | unique_tensor = torch.from_numpy(unique_np) 94 | 95 | tensor_res = tensor.new(unique_tensor.shape) 96 | tensor_res.copy_(unique_tensor) 97 | return tensor_res 98 | 99 | def write_results(prediction, confidence, num_classes, nms = True, nms_conf = 0.4): 100 | conf_mask = (prediction[:,:,4] > confidence).float().unsqueeze(2) 101 | prediction = prediction*conf_mask 102 | 103 | 104 | try: 105 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 106 | except: 107 | return 0 108 | 109 | 110 | box_a = prediction.new(prediction.shape) 111 | box_a[:,:,0] = (prediction[:,:,0] - 
prediction[:,:,2]/2) 112 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 113 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 114 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 115 | prediction[:,:,:4] = box_a[:,:,:4] 116 | 117 | 118 | 119 | batch_size = prediction.size(0) 120 | 121 | output = prediction.new(1, prediction.size(2) + 1) 122 | write = False 123 | 124 | 125 | for ind in range(batch_size): 126 | #select the image from the batch 127 | image_pred = prediction[ind] 128 | 129 | 130 | 131 | #Get the class having maximum score, and the index of that class 132 | #Get rid of num_classes softmax scores 133 | #Add the class index and the class score of class having maximum score 134 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 135 | max_conf = max_conf.float().unsqueeze(1) 136 | max_conf_score = max_conf_score.float().unsqueeze(1) 137 | seq = (image_pred[:,:5], max_conf, max_conf_score) 138 | image_pred = torch.cat(seq, 1) 139 | 140 | 141 | 142 | #Get rid of the zero entries 143 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 144 | 145 | 146 | image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7) 147 | 148 | #Get the various classes detected in the image 149 | try: 150 | img_classes = unique(image_pred_[:,-1]) 151 | except: 152 | continue 153 | #WE will do NMS classwise 154 | for cls in img_classes: 155 | #get the detections with one particular class 156 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1) 157 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 158 | 159 | 160 | image_pred_class = image_pred_[class_mask_ind].view(-1,7) 161 | 162 | 163 | 164 | #sort the detections such that the entry with the maximum objectness 165 | #confidence is at the top 166 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 167 | image_pred_class = image_pred_class[conf_sort_index] 168 | idx = image_pred_class.size(0) 169 | 170 | #if nms has to be done 171 | if nms: 172 | #For each detection 173 | for i in range(idx): 174 | #Get the IOUs of all boxes that come after the one we are looking at 175 | #in the loop 176 | try: 177 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 178 | except ValueError: 179 | break 180 | 181 | except IndexError: 182 | break 183 | 184 | #Zero out all the detections that have IoU > treshhold 185 | iou_mask = (ious < nms_conf).float().unsqueeze(1) 186 | image_pred_class[i+1:] *= iou_mask 187 | 188 | #Remove the non-zero entries 189 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 190 | image_pred_class = image_pred_class[non_zero_ind].view(-1,7) 191 | 192 | 193 | 194 | #Concatenate the batch_id of the image to the detection 195 | #this helps us identify which image does the detection correspond to 196 | #We use a linear straucture to hold ALL the detections from the batch 197 | #the batch_dim is flattened 198 | #batch is identified by extra batch column 199 | 200 | 201 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 202 | seq = batch_ind, image_pred_class 203 | if not write: 204 | output = torch.cat(seq,1) 205 | write = True 206 | else: 207 | out = torch.cat(seq,1) 208 | output = torch.cat((output,out)) 209 | 210 | return output 211 | 212 | #!/usr/bin/env python3 213 | # -*- coding: utf-8 -*- 214 | """ 215 | Created on Sat Mar 24 00:12:16 2018 216 | 217 | @author: ayooshmac 218 | """ 219 | 220 | def predict_transform_half(prediction, inp_dim, anchors, num_classes, CUDA = True): 221 | 
batch_size = prediction.size(0) 222 | stride = inp_dim // prediction.size(2) 223 | 224 | bbox_attrs = 5 + num_classes 225 | num_anchors = len(anchors) 226 | grid_size = inp_dim // stride 227 | 228 | 229 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 230 | prediction = prediction.transpose(1,2).contiguous() 231 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 232 | 233 | 234 | #Sigmoid the centre_X, centre_Y. and object confidencce 235 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 236 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 237 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 238 | 239 | 240 | #Add the center offsets 241 | grid_len = np.arange(grid_size) 242 | a,b = np.meshgrid(grid_len, grid_len) 243 | 244 | x_offset = torch.FloatTensor(a).view(-1,1) 245 | y_offset = torch.FloatTensor(b).view(-1,1) 246 | 247 | if CUDA: 248 | x_offset = x_offset.cuda().half() 249 | y_offset = y_offset.cuda().half() 250 | 251 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 252 | 253 | prediction[:,:,:2] += x_y_offset 254 | 255 | #log space transform height and the width 256 | anchors = torch.HalfTensor(anchors) 257 | 258 | if CUDA: 259 | anchors = anchors.cuda() 260 | 261 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 262 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 263 | 264 | #Softmax the class scores 265 | prediction[:,:,5: 5 + num_classes] = nn.Softmax(-1)(Variable(prediction[:,:, 5 : 5 + num_classes])).data 266 | 267 | prediction[:,:,:4] *= stride 268 | 269 | 270 | return prediction 271 | 272 | 273 | def write_results_half(prediction, confidence, num_classes, nms = True, nms_conf = 0.4): 274 | conf_mask = (prediction[:,:,4] > confidence).half().unsqueeze(2) 275 | prediction = prediction*conf_mask 276 | 277 | try: 278 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 279 | except: 280 | return 0 281 | 282 | 283 | 284 | box_a = prediction.new(prediction.shape) 285 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 286 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 287 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 288 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 289 | prediction[:,:,:4] = box_a[:,:,:4] 290 | 291 | 292 | 293 | batch_size = prediction.size(0) 294 | 295 | output = prediction.new(1, prediction.size(2) + 1) 296 | write = False 297 | 298 | for ind in range(batch_size): 299 | #select the image from the batch 300 | image_pred = prediction[ind] 301 | 302 | 303 | #Get the class having maximum score, and the index of that class 304 | #Get rid of num_classes softmax scores 305 | #Add the class index and the class score of class having maximum score 306 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 307 | max_conf = max_conf.half().unsqueeze(1) 308 | max_conf_score = max_conf_score.half().unsqueeze(1) 309 | seq = (image_pred[:,:5], max_conf, max_conf_score) 310 | image_pred = torch.cat(seq, 1) 311 | 312 | 313 | #Get rid of the zero entries 314 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 315 | try: 316 | image_pred_ = image_pred[non_zero_ind.squeeze(),:] 317 | except: 318 | continue 319 | 320 | #Get the various classes detected in the image 321 | img_classes = unique(image_pred_[:,-1].long()).half() 322 | 323 | 324 | 325 | 326 | #WE will do NMS classwise 327 | for cls in img_classes: 328 | #get the detections with 
one particular class 329 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).half().unsqueeze(1) 330 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 331 | 332 | 333 | image_pred_class = image_pred_[class_mask_ind] 334 | 335 | 336 | #sort the detections such that the entry with the maximum objectness 337 | #confidence is at the top 338 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 339 | image_pred_class = image_pred_class[conf_sort_index] 340 | idx = image_pred_class.size(0) 341 | 342 | #if nms has to be done 343 | if nms: 344 | #For each detection 345 | for i in range(idx): 346 | #Get the IOUs of all boxes that come after the one we are looking at 347 | #in the loop 348 | try: 349 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 350 | except ValueError: 351 | break 352 | 353 | except IndexError: 354 | break 355 | 356 | #Zero out all the detections that have IoU > treshhold 357 | iou_mask = (ious < nms_conf).half().unsqueeze(1) 358 | image_pred_class[i+1:] *= iou_mask 359 | 360 | #Remove the non-zero entries 361 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 362 | image_pred_class = image_pred_class[non_zero_ind] 363 | 364 | 365 | 366 | #Concatenate the batch_id of the image to the detection 367 | #this helps us identify which image does the detection correspond to 368 | #We use a linear straucture to hold ALL the detections from the batch 369 | #the batch_dim is flattened 370 | #batch is identified by extra batch column 371 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 372 | seq = batch_ind, image_pred_class 373 | 374 | if not write: 375 | output = torch.cat(seq,1) 376 | write = True 377 | else: 378 | out = torch.cat(seq,1) 379 | output = torch.cat((output,out)) 380 | 381 | return output 382 | -------------------------------------------------------------------------------- /utils/app_utils.py: -------------------------------------------------------------------------------- 1 | # import the necessary packages 2 | from threading import Thread 3 | import datetime 4 | import cv2 5 | 6 | class FPS: 7 | def __init__(self): 8 | # store the start time, end time, and total number of frames 9 | # that were examined between the start and end intervals 10 | self._start = None 11 | self._end = None 12 | self._numFrames = 0 13 | 14 | def start(self): 15 | # start the timer 16 | self._start = datetime.datetime.now() 17 | return self 18 | 19 | def stop(self): 20 | # stop the timer 21 | self._end = datetime.datetime.now() 22 | 23 | def update(self): 24 | # increment the total number of frames examined during the 25 | # start and end intervals 26 | self._numFrames += 1 27 | 28 | def elapsed(self): 29 | # return the total number of seconds between the start and 30 | # end interval 31 | return (self._end - self._start).total_seconds() 32 | 33 | def fps(self): 34 | # compute the (approximate) frames per second 35 | return self._numFrames / self.elapsed() 36 | 37 | 38 | class WebcamVideoStream: 39 | def __init__(self, src=0): 40 | # initialize the video camera stream and read the first frame 41 | # from the stream 42 | self.stream = cv2.VideoCapture(src) 43 | (self.grabbed, self.frame) = self.stream.read() 44 | 45 | # initialize the variable used to indicate if the thread should 46 | # be stopped 47 | self.stopped = False 48 | 49 | def start(self): 50 | # start the thread to read frames from the video stream 51 | Thread(target=self.update, args=()).start() 52 | return self 53 | 54 | def 
update(self): 55 | # keep looping infinitely until the thread is stopped 56 | while True: 57 | # if the thread indicator variable is set, stop the thread 58 | if self.stopped: 59 | return 60 | 61 | # otherwise, read the next frame from the stream 62 | (self.grabbed, self.frame) = self.stream.read() 63 | 64 | def read(self): 65 | # return the frame most recently read 66 | return self.grabbed, self.frame 67 | 68 | def stop(self): 69 | # indicate that the thread should be stopped 70 | self.stopped = True 71 | 72 | def getWidth(self): 73 | # Get the width of the frames 74 | return int(self.stream.get(cv2.CAP_PROP_FRAME_WIDTH)) 75 | 76 | def getHeight(self): 77 | # Get the height of the frames 78 | return int(self.stream.get(cv2.CAP_PROP_FRAME_HEIGHT)) 79 | 80 | def getFPS(self): 81 | # Get the frame rate of the frames 82 | return int(self.stream.get(cv2.CAP_PROP_FPS)) 83 | 84 | def isOpen(self): 85 | # Get the frame rate of the frames 86 | return self.stream.isOpened() 87 | 88 | def setFramePosition(self, framePos): 89 | self.stream.set(cv2.CAP_PROP_POS_FRAMES, framePos) 90 | 91 | def getFramePosition(self): 92 | return int(self.stream.get(cv2.CAP_PROP_POS_FRAMES)) 93 | 94 | def getFrameCount(self): 95 | return int(self.stream.get(cv2.CAP_PROP_FRAME_COUNT)) 96 | -------------------------------------------------------------------------------- /utils/objDet_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from utils.app_utils import * 3 | import numpy as np 4 | import tensorflow as tf 5 | import sys 6 | 7 | sys.path.append("C:\\Users\\okaya\\Documents\\tensorflow\\models\\research\\") 8 | sys.path.append("C:\\Users\\okaya\\Documents\\tensorflow\\models\\research\\object_detection\\utils") 9 | 10 | from object_detection.utils import label_map_util 11 | from object_detection.utils import visualization_utils as vis_util 12 | 13 | 14 | # Path to frozen detection graph. This is the actual model that is used for the object detection. 15 | PATH_TO_CKPT = 'model/frozen_inference_graph.pb' 16 | 17 | # List of the strings that is used to add correct label for each box. 18 | PATH_TO_LABELS = 'model/mscoco_label_map.pbtxt' 19 | 20 | NUM_CLASSES = 90 21 | 22 | # Loading label map 23 | label_map = label_map_util.load_labelmap(PATH_TO_LABELS) 24 | categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, 25 | use_display_name=True) 26 | category_index = label_map_util.create_category_index(categories) 27 | 28 | def detect_objects(image_np, sess, detection_graph): 29 | # Expand dimensions since the model expects images to have shape: [1, None, None, 3] 30 | image_np_expanded = np.expand_dims(image_np, axis=0) 31 | image_tensor = detection_graph.get_tensor_by_name('image_tensor:0') 32 | 33 | # Each box represents a part of the image where a particular object was detected. 34 | boxes = detection_graph.get_tensor_by_name('detection_boxes:0') 35 | 36 | # Each score represent how level of confidence for each of the objects. 37 | # Score is shown on the result image, together with the class label. 38 | scores = detection_graph.get_tensor_by_name('detection_scores:0') 39 | classes = detection_graph.get_tensor_by_name('detection_classes:0') 40 | num_detections = detection_graph.get_tensor_by_name('num_detections:0') 41 | 42 | # Actual detection. 
43 | (boxes, scores, classes, num_detections) = sess.run( 44 | [boxes, scores, classes, num_detections], 45 | feed_dict={image_tensor: image_np_expanded}) 46 | 47 | # Visualization of the results of a detection. 48 | vis_util.visualize_boxes_and_labels_on_image_array( 49 | image_np, 50 | np.squeeze(boxes), 51 | np.squeeze(classes).astype(np.int32), 52 | np.squeeze(scores), 53 | category_index, 54 | use_normalized_coordinates=True, 55 | line_thickness=4) 56 | 57 | return image_np 58 | 59 | 60 | 61 | def worker(input_q, output_q): 62 | # Load a (frozen) Tensorflow model into memory. 63 | detection_graph = tf.Graph() 64 | with detection_graph.as_default(): 65 | od_graph_def = tf.GraphDef() 66 | with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid: 67 | serialized_graph = fid.read() 68 | od_graph_def.ParseFromString(serialized_graph) 69 | tf.import_graph_def(od_graph_def, name='') 70 | sess = tf.Session(graph=detection_graph) 71 | 72 | fps = FPS().start() 73 | while True: 74 | fps.update() 75 | frame = input_q.get() 76 | 77 | # Check frame object is a 2-D array (video) or 1-D (webcam) 78 | if len(frame) == 2: 79 | frame_rgb = cv2.cvtColor(frame[1], cv2.COLOR_BGR2RGB) 80 | output_q.put((frame[0], detect_objects(frame_rgb, sess, detection_graph))) 81 | else: 82 | frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 83 | output_q.put(detect_objects(frame_rgb, sess, detection_graph)) 84 | fps.stop() 85 | sess.close() 86 | --------------------------------------------------------------------------------
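The `worker()` function in utils/objDet_utils.py pulls frames from an input queue, runs the frozen TensorFlow graph, and pushes annotated frames to an output queue, so it has to be launched inside a process pool that shares those two queues. The sketch below is a minimal, hypothetical driver, not part of the repository and independent of its own entry points, showing one way `worker()` could be wired to `WebcamVideoStream` from utils/app_utils.py. The queue sizes, the two-process pool, and the window title are assumptions, and it presumes the model and label paths hard-coded in objDet_utils.py resolve on your machine.

``` python
# Hypothetical driver (not part of this repo): feeds webcam frames through the
# queue-based TensorFlow worker defined in utils/objDet_utils.py.
from multiprocessing import Queue, Pool

import cv2

from utils.app_utils import WebcamVideoStream
from utils.objDet_utils import worker

if __name__ == "__main__":
    input_q = Queue(maxsize=5)    # raw frames waiting for inference (size is an assumption)
    output_q = Queue(maxsize=5)   # frames returned with boxes drawn by detect_objects()

    # Each pool process runs worker(), which loops forever pulling from input_q.
    pool = Pool(2, worker, (input_q, output_q))

    stream = WebcamVideoStream(src=0).start()
    try:
        while True:
            grabbed, frame = stream.read()      # read() returns (grabbed, frame)
            if not grabbed or frame is None:
                break
            input_q.put(frame)                  # a plain ndarray takes the webcam branch in worker()
            annotated = output_q.get()          # blocks until a worker has processed the frame
            cv2.imshow("Detections", annotated)
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        pool.terminate()
        stream.stop()
        cv2.destroyAllWindows()
```

Because the loop blocks on `output_q.get()` after every `put()`, frames are processed strictly one at a time; bounded queues and a larger pool would be the natural next step if throughput matters.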