├── README.md ├── bbox.py ├── coco_labels.txt ├── common.py ├── data_processing.py ├── images ├── dog.jpg └── person.jpg ├── onnx_to_tensorrt.py ├── util.py ├── yolov3-608.cfg └── yolov3_to_onnx.py /README.md: -------------------------------------------------------------------------------- 1 | # Update on 2019-04-19 2 | - This project has been optimized and upgraded, so: 3 | - If you are seeing this project for the first time, you can jump directly to [this project](https://github.com/Cw-zero/TensorRT_yolo3_module). 4 | - If you run into bugs with this project, you can try [this project](https://github.com/Cw-zero/TensorRT_yolo3_module) instead. 5 | 6 | # Use TensorRT to accelerate YOLOv3 7 | --- 8 | ## 1. How to run this project 9 | - a. Download the weights from [here](https://pjreddie.com/media/files/yolov3.weights) and rename the file to **yolov3-608.weights**. 10 | - b. Run `python yolov3_to_onnx.py`; this produces a file named **yolov3-608.onnx**. 11 | - c. Run `python onnx_to_tensorrt.py` to get the detection results. 12 | 13 | ## 2. Performance comparison 14 | - a. You can download and run [this project](https://github.com/ayooshkathuria/pytorch-yolo-v3), from which our project is derived. Its detection speed is about **100 ms** per image. 15 | 16 | - b. Our project runs at about **62 ms** per image. 17 | 18 | ## 3. Others 19 | - If you are more comfortable with Chinese, you can refer to [this blog](https://www.cnblogs.com/justcoder/), which has more details. 20 | -------------------------------------------------------------------------------- /bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import random 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | def confidence_filter(result, confidence): 10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) 11 | result = result*conf_mask 12 | 13 | return result 14 | 15 | def confidence_filter_cls(result, confidence): 16 | max_scores = torch.max(result[:,:,5:25], 2)[0] 17 | res = torch.cat((result, max_scores),2) 18 | print(res.shape) 19 | 20 | 21 | cond_1 = (res[:,:,4] > confidence).float() 22 | cond_2 = (res[:,:,25] > 0.995).float() 23 | 24 | conf = cond_1 + cond_2 25 | conf = torch.clamp(conf, 0.0, 1.0) 26 | conf = conf.unsqueeze(2) 27 | result = result*conf 28 | return result 29 | 30 | 31 | 32 | def get_abs_coord(box): 33 | box[2], box[3] = abs(box[2]), abs(box[3]) 34 | x1 = (box[0] - box[2]/2) - 1 35 | y1 = (box[1] - box[3]/2) - 1 36 | x2 = (box[0] + box[2]/2) - 1 37 | y2 = (box[1] + box[3]/2) - 1 38 | return x1, y1, x2, y2 39 | 40 | 41 | 42 | def sanity_fix(box): 43 | if (box[0] > box[2]): 44 | box[0], box[2] = box[2], box[0] 45 | 46 | if (box[1] > box[3]): 47 | box[1], box[3] = box[3], box[1] 48 | 49 | return box 50 | 51 | def bbox_iou(box1, box2): 52 | """ 53 | Returns the IoU of two bounding boxes 54 | 55 | 56 | """ 57 | #Get the coordinates of bounding boxes 58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3] 59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3] 60 | 61 | #get the coordinates of the intersection rectangle 62 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 63 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 64 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 65 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 66 | 67 | #Intersection area 68 | 69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, 
torch.zeros(inter_rect_x2.shape).cuda()) 70 | # inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape)) 71 | 72 | #Union Area 73 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) 74 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) 75 | 76 | iou = inter_area / (b1_area + b2_area - inter_area) 77 | 78 | return iou 79 | 80 | 81 | def pred_corner_coord(prediction): 82 | #Get indices of non-zero confidence bboxes 83 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 84 | 85 | box = prediction[ind_nz[0], ind_nz[1]] 86 | 87 | 88 | box_a = box.new(box.shape) 89 | box_a[:,0] = (box[:,0] - box[:,2]/2) 90 | box_a[:,1] = (box[:,1] - box[:,3]/2) 91 | box_a[:,2] = (box[:,0] + box[:,2]/2) 92 | box_a[:,3] = (box[:,1] + box[:,3]/2) 93 | box[:,:4] = box_a[:,:4] 94 | 95 | prediction[ind_nz[0], ind_nz[1]] = box 96 | 97 | return prediction 98 | 99 | 100 | 101 | 102 | def write(x, batches, results, colors, classes): 103 | c1 = tuple(x[1:3].int()) 104 | c2 = tuple(x[3:5].int()) 105 | img = results[int(x[0])] 106 | cls = int(x[-1]) 107 | label = "{0}".format(classes[cls]) 108 | color = random.choice(colors) 109 | cv2.rectangle(img, c1, c2,color, 1) 110 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 111 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 112 | cv2.rectangle(img, c1, c2,color, -1) 113 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 114 | return img 115 | -------------------------------------------------------------------------------- /coco_labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 1993-2018 NVIDIA Corporation. All rights reserved. 3 | # 4 | # NOTICE TO LICENSEE: 5 | # 6 | # This source code and/or documentation ("Licensed Deliverables") are 7 | # subject to NVIDIA intellectual property rights under U.S. and 8 | # international Copyright laws. 
9 | # 10 | # These Licensed Deliverables contained herein is PROPRIETARY and 11 | # CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | # conditions of a form of NVIDIA software license agreement by and 13 | # between NVIDIA and Licensee ("License Agreement") or electronically 14 | # accepted by Licensee. Notwithstanding any terms or conditions to 15 | # the contrary in the License Agreement, reproduction or disclosure 16 | # of the Licensed Deliverables to any third party without the express 17 | # written consent of NVIDIA is prohibited. 18 | # 19 | # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | # LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | # SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | # PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 23 | # NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | # DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | # NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 26 | # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | # LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | # SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | # OF THESE LICENSED DELIVERABLES. 33 | # 34 | # U.S. Government End Users. These Licensed Deliverables are a 35 | # "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 36 | # 1995), consisting of "commercial computer software" and "commercial 37 | # computer software documentation" as such terms are used in 48 38 | # C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | # only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | # 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | # U.S. Government End Users acquire the Licensed Deliverables with 42 | # only those rights set forth herein. 43 | # 44 | # Any use of the Licensed Deliverables in individual and commercial 45 | # software must include, in the user documentation and internal 46 | # comments to the code, the above Disclaimer and U.S. Government End 47 | # Users Notice. 48 | # 49 | 50 | import os 51 | import argparse 52 | import numpy as np 53 | import pycuda.driver as cuda 54 | import tensorrt as trt 55 | 56 | try: 57 | # Sometimes python2 does not understand FileNotFoundError 58 | FileNotFoundError 59 | except NameError: 60 | FileNotFoundError = IOError 61 | 62 | def GiB(val): 63 | return val * 1 << 30 64 | 65 | def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]): 66 | ''' 67 | Parses sample arguments. 68 | Args: 69 | description (str): Description of the sample. 70 | subfolder (str): The subfolder containing data relevant to this sample 71 | find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path. 72 | Returns: 73 | str: Path of data directory. 74 | Raises: 75 | FileNotFoundError 76 | ''' 77 | kDEFAULT_DATA_ROOT = os.path.abspath("/usr/src/tensorrt/data") 78 | 79 | # Standard command-line arguments for all samples. 
80 | parser = argparse.ArgumentParser(description=description) 81 | parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory.") 82 | args, unknown_args = parser.parse_known_args() 83 | 84 | # If data directory is not specified, use the default. 85 | data_root = args.datadir if args.datadir else kDEFAULT_DATA_ROOT 86 | # If the subfolder exists, append it to the path, otherwise use the provided path as-is. 87 | subfolder_path = os.path.join(data_root, subfolder) 88 | if not os.path.exists(subfolder_path): 89 | print("WARNING: " + subfolder_path + " does not exist. Using " + data_root + " instead.") 90 | data_path = subfolder_path if os.path.exists(subfolder_path) else data_root 91 | 92 | # Make sure data directory exists. 93 | if not (os.path.exists(data_path)): 94 | raise FileNotFoundError(data_path + " does not exist. Please provide the correct data path with the -d option.") 95 | 96 | # Find all requested files. 97 | for index, f in enumerate(find_files): 98 | find_files[index] = os.path.abspath(os.path.join(data_path, f)) 99 | if not os.path.exists(find_files[index]): 100 | raise FileNotFoundError(find_files[index] + " does not exist. Please provide the correct data path with the -d option.") 101 | if find_files: 102 | return data_path, find_files 103 | else: 104 | return data_path 105 | 106 | # Simple helper data class that's a little nicer to use than a 2-tuple. 107 | class HostDeviceMem(object): 108 | def __init__(self, host_mem, device_mem): 109 | self.host = host_mem 110 | self.device = device_mem 111 | 112 | def __str__(self): 113 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 114 | 115 | def __repr__(self): 116 | return self.__str__() 117 | 118 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 119 | def allocate_buffers(engine): 120 | inputs = [] 121 | outputs = [] 122 | bindings = [] 123 | stream = cuda.Stream() 124 | for binding in engine: 125 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 126 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 127 | # Allocate host and device buffers 128 | host_mem = cuda.pagelocked_empty(size, dtype) 129 | device_mem = cuda.mem_alloc(host_mem.nbytes) 130 | # Append the device buffer to device bindings. 131 | bindings.append(int(device_mem)) 132 | # Append to the appropriate list. 133 | if engine.binding_is_input(binding): 134 | inputs.append(HostDeviceMem(host_mem, device_mem)) 135 | else: 136 | outputs.append(HostDeviceMem(host_mem, device_mem)) 137 | return inputs, outputs, bindings, stream 138 | 139 | # This function is generalized for multiple inputs/outputs. 140 | # inputs and outputs are expected to be lists of HostDeviceMem objects. 141 | def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): 142 | # Transfer input data to the GPU. 143 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 144 | # Run inference. 145 | context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) 146 | # Transfer predictions back from the GPU. 147 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 148 | # Synchronize the stream 149 | stream.synchronize() 150 | # Return only the host outputs. 
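# A minimal usage sketch of these helpers as comments (variable names are illustrative; see onnx_to_tensorrt.py for the actual call site):
#   inputs, outputs, bindings, stream = allocate_buffers(engine)
#   with engine.create_execution_context() as context:
#       inputs[0].host = preprocessed_image
#       trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)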
151 | return [out.host for out in outputs] 152 | -------------------------------------------------------------------------------- /data_processing.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | from PIL import Image 4 | import numpy as np 5 | import torch 6 | from torch.autograd import Variable 7 | 8 | 9 | # YOLOv3-608 has been trained with these 80 categories from COCO: 10 | # Lin, Tsung-Yi, et al. "Microsoft COCO: Common Objects in Context." 11 | # European Conference on Computer Vision. Springer, Cham, 2014. 12 | 13 | def load_label_categories(label_file_path): 14 | categories = [line.rstrip('\n') for line in open(label_file_path)] 15 | return categories 16 | 17 | LABEL_FILE_PATH = 'coco_labels.txt' 18 | ALL_CATEGORIES = load_label_categories(LABEL_FILE_PATH) 19 | 20 | # Let's make sure that there are 80 classes, as expected for the COCO data set: 21 | CATEGORY_NUM = len(ALL_CATEGORIES) 22 | assert CATEGORY_NUM == 80 23 | 24 | 25 | class PreprocessYOLO(object): 26 | """A simple class for loading images with PIL and reshaping them to the specified 27 | input resolution for YOLOv3-608. 28 | """ 29 | 30 | def __init__(self, yolo_input_resolution): 31 | """Initialize with the input resolution for YOLOv3, which will stay fixed in this sample. 32 | 33 | Keyword arguments: 34 | yolo_input_resolution -- two-dimensional tuple with the target network's (spatial) 35 | input resolution in HW order 36 | """ 37 | self.yolo_input_resolution = yolo_input_resolution 38 | 39 | def process(self, input_image_path): 40 | """Load an image from the specified input path, 41 | and return it together with a pre-processed version required for feeding it into a 42 | YOLOv3 network. 43 | 44 | Keyword arguments: 45 | input_image_path -- string path of the image to be loaded 46 | """ 47 | image_raw, image_resized = self._load_and_resize(input_image_path) 48 | image_preprocessed = self._shuffle_and_normalize(image_resized) 49 | return image_raw, image_preprocessed 50 | 51 | def _load_and_resize(self, input_image_path): 52 | """Load an image from the specified path and resize it to the input resolution. 53 | Return the input image before resizing as a PIL Image (required for visualization), 54 | and the resized image as a NumPy float array. 55 | 56 | Keyword arguments: 57 | input_image_path -- string path of the image to be loaded 58 | """ 59 | 60 | image_raw = Image.open(input_image_path) 61 | # Expecting yolo_input_resolution in (height, width) format, adjusting to PIL 62 | # convention (width, height) in PIL: 63 | new_resolution = ( 64 | self.yolo_input_resolution[1], 65 | self.yolo_input_resolution[0]) 66 | image_resized = image_raw.resize( 67 | new_resolution, resample=Image.BICUBIC) 68 | image_resized = np.array(image_resized, dtype=np.float32, order='C') 69 | return image_raw, image_resized 70 | 71 | def _shuffle_and_normalize(self, image): 72 | """Normalize a NumPy array representing an image to the range [0, 1], and 73 | convert it from HWC format ("channels last") to NCHW format ("channels first" 74 | with leading batch dimension). 
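For example, with a 608x608 input resolution, a (608, 608, 3) HWC array becomes a (1, 3, 608, 608) float32 array with values scaled to [0, 1].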
75 | 76 | Keyword arguments: 77 | image -- image as three-dimensional NumPy float array, in HWC format 78 | """ 79 | image /= 255.0 80 | # HWC to CHW format: 81 | image = np.transpose(image, [2, 0, 1]) 82 | # CHW to NCHW format 83 | image = np.expand_dims(image, axis=0) 84 | # Convert the image to row-major order, also known as "C order": 85 | image = np.array(image, dtype=np.float32, order='C') 86 | return image 87 | 88 | 89 | class PostprocessYOLO(object): 90 | """Class for post-processing the three outputs tensors from YOLOv3-608.""" 91 | 92 | def __init__(self, 93 | yolo_masks, 94 | yolo_anchors, 95 | obj_threshold, 96 | nms_threshold, 97 | yolo_input_resolution): 98 | """Initialize with all values that will be kept when processing several frames. 99 | Assuming 3 outputs of the network in the case of (large) YOLOv3. 100 | 101 | Keyword arguments: 102 | yolo_masks -- a list of 3 three-dimensional tuples for the YOLO masks 103 | yolo_anchors -- a list of 9 two-dimensional tuples for the YOLO anchors 104 | object_threshold -- threshold for object coverage, float value between 0 and 1 105 | nms_threshold -- threshold for non-max suppression algorithm, 106 | float value between 0 and 1 107 | input_resolution_yolo -- two-dimensional tuple with the target network's (spatial) 108 | input resolution in HW order 109 | """ 110 | self.masks = yolo_masks 111 | self.anchors = yolo_anchors 112 | self.object_threshold = obj_threshold 113 | self.nms_threshold = nms_threshold 114 | self.input_resolution_yolo = yolo_input_resolution 115 | 116 | def process(self, outputs): 117 | out_boxes = [] 118 | num_anchors = 3 119 | num_classes = 80 120 | # for output in outputs: 121 | for output, mask in zip(outputs, self.masks): 122 | anchors = [self.anchors[i] for i in mask] 123 | anchors = list(np.reshape(anchors,(6,-1))) 124 | 125 | anchor_step = len(anchors)/num_anchors 126 | 127 | output = torch.from_numpy(output) 128 | if output.dim() == 3: 129 | output = output.unsqueeze(0) 130 | batch = output.size(0) 131 | assert(output.size(1) == (5+num_classes)*num_anchors) 132 | h = output.size(2) 133 | w = output.size(3) 134 | 135 | t0 = time.time() 136 | all_boxes = [] #only output,not outputs 137 | output = output.view(batch*num_anchors, 5+num_classes, h*w).transpose(0,1).contiguous().view(5+num_classes, batch*num_anchors*h*w) 138 | 139 | #use CPU, so GPU more faster? 140 | grid_x = torch.linspace(0, w-1, w).repeat(h,1).repeat(batch*num_anchors, 1, 1).view(batch*num_anchors*h*w).type_as(output) #cuda() 141 | grid_y = torch.linspace(0, h-1, h).repeat(w,1).t().repeat(batch*num_anchors, 1, 1).view(batch*num_anchors*h*w).type_as(output) #cuda() 142 | xs = torch.sigmoid(output[0]) + grid_x 143 | ys = torch.sigmoid(output[1]) + grid_y 144 | 145 | #use CPU, so GPU more faster? 
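# The index_select/repeat calls below broadcast each anchor's width and height to every grid cell, so ws and hs can be computed element-wise over all batch*num_anchors*h*w predictions: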
146 | anchor_w = torch.Tensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([0])) 147 | anchor_h = torch.Tensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([1])) 148 | anchor_w = anchor_w.repeat(batch, 1).repeat(1, 1, h*w).view(batch*num_anchors*h*w).type_as(output) #cuda() 149 | anchor_h = anchor_h.repeat(batch, 1).repeat(1, 1, h*w).view(batch*num_anchors*h*w).type_as(output) #cuda() 150 | ws = torch.exp(output[2]) * anchor_w 151 | hs = torch.exp(output[3]) * anchor_h 152 | 153 | det_confs = torch.sigmoid(output[4]) 154 | cls_confs = torch.nn.Softmax()(Variable(output[5:5+num_classes].transpose(0,1))).data 155 | cls_max_confs, cls_max_ids = torch.max(cls_confs, 1) 156 | cls_max_confs = cls_max_confs.view(-1) 157 | cls_max_ids = cls_max_ids.view(-1) 158 | t1 = time.time() 159 | 160 | sz_hw = h*w 161 | sz_hwa = sz_hw*num_anchors 162 | t2 = time.time() 163 | for b in range(batch): 164 | boxes = [] 165 | for cy in range(h): 166 | for cx in range(w): 167 | for i in range(num_anchors): 168 | ind = b*sz_hwa + i*sz_hw + cy*w + cx 169 | det_conf = det_confs[ind] 170 | conf = det_confs[ind] 171 | 172 | if conf > self.object_threshold: 173 | bcx = xs[ind] 174 | bcy = ys[ind] 175 | bw = ws[ind] 176 | bh = hs[ind] 177 | cls_max_conf = cls_max_confs[ind] 178 | cls_max_id = cls_max_ids[ind] 179 | box = [bcx/w, bcy/h, bw/w, bh/h, det_conf, cls_max_conf, cls_max_id] 180 | boxes.append(box) 181 | all_boxes.append(boxes) 182 | t3 = time.time() 183 | out_boxes.append(all_boxes) 184 | 185 | 186 | if True: 187 | print('---------------------------------') 188 | print('matrix computation : %f' % (t1-t0)) 189 | print(' gpu to cpu : %f' % (t2-t1)) 190 | print(' boxes filter : %f' % (t3-t2)) 191 | # print(' boxes filter : %f' % (t3-t0)) 192 | print('---------------------------------') 193 | return out_boxes 194 | 195 | 196 | def process1(self, outputs, resolution_raw): 197 | """Take the YOLOv3 outputs generated from a TensorRT forward pass, post-process them 198 | and return a list of bounding boxes for detected object together with their category 199 | and their confidences in separate lists. 200 | 201 | Keyword arguments: 202 | outputs -- outputs from a TensorRT engine in NCHW format 203 | resolution_raw -- the original spatial resolution from the input PIL image in WH order 204 | """ 205 | outputs_reshaped = list() 206 | for output in outputs: 207 | outputs_reshaped.append(self._reshape_output(output)) 208 | 209 | start = time.time() 210 | boxes, categories, confidences = self._process_yolo_output(outputs_reshaped, resolution_raw) 211 | end = time.time() 212 | print("_process_yolo_output") 213 | print(end - start) 214 | 215 | return boxes, categories, confidences 216 | 217 | def _reshape_output(self, output): 218 | """Reshape a TensorRT output from NCHW to NHWC format (with expected C=255), 219 | and then return it in (height,width,3,85) dimensionality after further reshaping. 
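For example, the coarsest YOLOv3-608 output of shape (1, 255, 19, 19) becomes (19, 19, 3, 85), i.e. 3 anchors times (4 box coordinates + 1 objectness score + CATEGORY_NUM=80 class scores) per grid cell.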
220 | 221 | Keyword argument: 222 | output -- an output from a TensorRT engine after inference 223 | """ 224 | output = np.transpose(output, [0, 2, 3, 1]) 225 | _, height, width, _ = output.shape 226 | dim1, dim2 = height, width 227 | dim3 = 3 228 | # There are CATEGORY_NUM=80 object categories: 229 | dim4 = (4 + 1 + CATEGORY_NUM) 230 | return np.reshape(output, (dim1, dim2, dim3, dim4)) 231 | 232 | def _process_yolo_output(self, outputs_reshaped, resolution_raw): 233 | """Take in a list of three reshaped YOLO outputs in (height,width,3,85) shape and return 234 | return a list of bounding boxes for detected object together with their category and their 235 | confidences in separate lists. 236 | 237 | Keyword arguments: 238 | outputs_reshaped -- list of three reshaped YOLO outputs as NumPy arrays 239 | with shape (height,width,3,85) 240 | resolution_raw -- the original spatial resolution from the input PIL image in WH order 241 | """ 242 | 243 | # E.g. in YOLOv3-608, there are three output tensors, which we associate with their 244 | # respective masks. Then we iterate through all output-mask pairs and generate candidates 245 | # for bounding boxes, their corresponding category predictions and their confidences: 246 | boxes, categories, confidences = list(), list(), list() 247 | 248 | for output, mask in zip(outputs_reshaped, self.masks): 249 | start = time.time() 250 | box, category, confidence = self._process_feats(output, mask) 251 | end = time.time() 252 | print("_process_feats") 253 | print(end - start) 254 | box, category, confidence = self._filter_boxes(box, category, confidence) 255 | boxes.append(box) 256 | categories.append(category) 257 | confidences.append(confidence) 258 | 259 | 260 | boxes = np.concatenate(boxes) 261 | categories = np.concatenate(categories) 262 | confidences = np.concatenate(confidences) 263 | 264 | # Scale boxes back to original image shape: 265 | width, height = resolution_raw 266 | image_dims = [width, height, width, height] 267 | boxes = boxes * image_dims 268 | 269 | # Using the candidates from the previous (loop) step, we apply the non-max suppression 270 | # algorithm that clusters adjacent bounding boxes to a single bounding box: 271 | nms_boxes, nms_categories, nscores = list(), list(), list() 272 | for category in set(categories): 273 | idxs = np.where(categories == category) 274 | box = boxes[idxs] 275 | category = categories[idxs] 276 | confidence = confidences[idxs] 277 | 278 | keep = self._nms_boxes(box, confidence) 279 | 280 | nms_boxes.append(box[keep]) 281 | nms_categories.append(category[keep]) 282 | nscores.append(confidence[keep]) 283 | 284 | if not nms_categories and not nscores: 285 | return None, None, None 286 | 287 | boxes = np.concatenate(nms_boxes) 288 | categories = np.concatenate(nms_categories) 289 | confidences = np.concatenate(nscores) 290 | 291 | return boxes, categories, confidences 292 | 293 | def _process_feats(self, output_reshaped, mask): 294 | """Take in a reshaped YOLO output in height,width,3,85 format together with its 295 | corresponding YOLO mask and return the detected bounding boxes, the confidence, 296 | and the class probability in each cell/pixel. 
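The outputs are decoded as in the YOLOv3 paper: box centers are sigmoid(t_x), sigmoid(t_y) plus the grid-cell offsets (normalized by the grid size), and box sizes are the anchor dimensions scaled by exp(t_w), exp(t_h) (normalized by the network input resolution).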
297 | 298 | Keyword arguments: 299 | output_reshaped -- reshaped YOLO output as NumPy arrays with shape (height,width,3,85) 300 | mask -- 2-dimensional tuple with mask specification for this output 301 | """ 302 | 303 | # Two in-line functions required for calculating the bounding box 304 | # descriptors: 305 | def sigmoid(value): 306 | """Return the sigmoid of the input.""" 307 | return 1.0 / (1.0 + math.exp(-value)) 308 | 309 | def exponential(value): 310 | """Return the exponential of the input.""" 311 | return math.exp(value) 312 | 313 | # Vectorized calculation of above two functions: 314 | sigmoid_v = np.vectorize(sigmoid) 315 | exponential_v = np.vectorize(exponential) 316 | 317 | grid_h, grid_w, _, _ = output_reshaped.shape 318 | 319 | anchors = [self.anchors[i] for i in mask] 320 | 321 | # Reshape to N, height, width, num_anchors, box_params: 322 | anchors_tensor = np.reshape(anchors, [1, 1, len(anchors), 2]) 323 | box_xy = sigmoid_v(output_reshaped[..., :2]) 324 | box_wh = exponential_v(output_reshaped[..., 2:4]) * anchors_tensor 325 | box_confidence = sigmoid_v(output_reshaped[..., 4]) 326 | 327 | box_confidence = np.expand_dims(box_confidence, axis=-1) 328 | box_class_probs = sigmoid_v(output_reshaped[..., 5:]) 329 | 330 | col = np.tile(np.arange(0, grid_w), grid_w).reshape(-1, grid_w) 331 | row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_h) 332 | 333 | col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) 334 | row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) 335 | grid = np.concatenate((col, row), axis=-1) 336 | 337 | box_xy += grid 338 | box_xy /= (grid_w, grid_h) 339 | box_wh /= self.input_resolution_yolo 340 | box_xy -= (box_wh / 2.) 341 | boxes = np.concatenate((box_xy, box_wh), axis=-1) 342 | 343 | # boxes: centroids, box_confidence: confidence level, box_class_probs: 344 | # class confidence 345 | return boxes, box_confidence, box_class_probs 346 | 347 | def _filter_boxes(self, boxes, box_confidences, box_class_probs): 348 | """Take in the unfiltered bounding box descriptors and discard each cell 349 | whose score is lower than the object threshold set during class initialization. 350 | 351 | Keyword arguments: 352 | boxes -- bounding box coordinates with shape (height,width,3,4); 4 for 353 | x,y,height,width coordinates of the boxes 354 | box_confidences -- bounding box confidences with shape (height,width,3,1); 1 for as 355 | confidence scalar per element 356 | box_class_probs -- class probabilities with shape (height,width,3,CATEGORY_NUM) 357 | 358 | """ 359 | box_scores = box_confidences * box_class_probs 360 | box_classes = np.argmax(box_scores, axis=-1) 361 | box_class_scores = np.max(box_scores, axis=-1) 362 | pos = np.where(box_class_scores >= self.object_threshold) 363 | 364 | boxes = boxes[pos] 365 | classes = box_classes[pos] 366 | scores = box_class_scores[pos] 367 | 368 | return boxes, classes, scores 369 | 370 | def _nms_boxes(self, boxes, box_confidences): 371 | """Apply the Non-Maximum Suppression (NMS) algorithm on the bounding boxes with their 372 | confidence scores and return an array with the indexes of the bounding boxes we want to 373 | keep (and display later). 
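Boxes are processed greedily in descending order of confidence; any remaining box whose IoU with the currently kept box exceeds nms_threshold is discarded.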
374 | 375 | Keyword arguments: 376 | boxes -- a NumPy array containing N bounding-box coordinates that survived filtering, 377 | with shape (N,4); 4 for x,y,height,width coordinates of the boxes 378 | box_confidences -- a Numpy array containing the corresponding confidences with shape N 379 | """ 380 | x_coord = boxes[:, 0] 381 | y_coord = boxes[:, 1] 382 | width = boxes[:, 2] 383 | height = boxes[:, 3] 384 | 385 | areas = width * height 386 | ordered = box_confidences.argsort()[::-1] 387 | 388 | keep = list() 389 | while ordered.size > 0: 390 | # Index of the current element: 391 | i = ordered[0] 392 | keep.append(i) 393 | xx1 = np.maximum(x_coord[i], x_coord[ordered[1:]]) 394 | yy1 = np.maximum(y_coord[i], y_coord[ordered[1:]]) 395 | xx2 = np.minimum(x_coord[i] + width[i], x_coord[ordered[1:]] + width[ordered[1:]]) 396 | yy2 = np.minimum(y_coord[i] + height[i], y_coord[ordered[1:]] + height[ordered[1:]]) 397 | 398 | width1 = np.maximum(0.0, xx2 - xx1 + 1) 399 | height1 = np.maximum(0.0, yy2 - yy1 + 1) 400 | intersection = width1 * height1 401 | union = (areas[i] + areas[ordered[1:]] - intersection) 402 | 403 | # Compute the Intersection over Union (IoU) score: 404 | iou = intersection / union 405 | 406 | # The goal of the NMS algorithm is to reduce the number of adjacent bounding-box 407 | # candidates to a minimum. In this step, we keep only those elements whose overlap 408 | # with the current bounding box is lower than the threshold: 409 | indexes = np.where(iou <= self.nms_threshold)[0] 410 | ordered = ordered[indexes + 1] 411 | 412 | keep = np.array(keep) 413 | return keep 414 | -------------------------------------------------------------------------------- /images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cw-zero/TensorRT_yolo3/bf904fe07498ad0f75aa10e6ef3769dbb9b72b69/images/dog.jpg -------------------------------------------------------------------------------- /images/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cw-zero/TensorRT_yolo3/bf904fe07498ad0f75aa10e6ef3769dbb9b72b69/images/person.jpg -------------------------------------------------------------------------------- /onnx_to_tensorrt.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import torch 4 | import numpy as np 5 | import tensorrt as trt 6 | import pycuda.driver as cuda 7 | import pycuda.autoinit 8 | from PIL import ImageDraw 9 | import time 10 | from util import * 11 | 12 | from data_processing import PreprocessYOLO 13 | 14 | import sys, os 15 | sys.path.insert(1, os.path.join(sys.path[0], "..")) 16 | import common 17 | 18 | TRT_LOGGER = trt.Logger() 19 | 20 | def get_engine(onnx_file_path, engine_file_path=""): 21 | """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.""" 22 | def build_engine(): 23 | """Takes an ONNX file and creates a TensorRT engine to run inference with""" 24 | with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser: 25 | builder.max_workspace_size = 1 << 30 # 1GB 26 | builder.max_batch_size = 1 27 | # Parse model file 28 | if not os.path.exists(onnx_file_path): 29 | print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path)) 30 | exit(0) 31 | print('Loading ONNX file from path 
{}...'.format(onnx_file_path)) 32 | with open(onnx_file_path, 'rb') as model: 33 | print('Beginning ONNX file parsing') 34 | parser.parse(model.read()) 35 | print('Completed parsing of ONNX file') 36 | print('Building an engine from file {}; this may take a while...'.format(onnx_file_path)) 37 | engine = builder.build_cuda_engine(network) 38 | print("Completed creating Engine") 39 | with open(engine_file_path, "wb") as f: 40 | f.write(engine.serialize()) 41 | return engine 42 | 43 | if os.path.exists(engine_file_path): 44 | # If a serialized engine exists, use it instead of building an engine. 45 | print("Reading engine from file {}".format(engine_file_path)) 46 | with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 47 | return runtime.deserialize_cuda_engine(f.read()) 48 | else: 49 | return build_engine() 50 | 51 | def main(): 52 | 53 | """Create a TensorRT engine for ONNX-based YOLOv3-608 and run inference.""" 54 | 55 | # Try to load a previously generated YOLOv3-608 network graph in ONNX format: 56 | onnx_file_path = 'yolov3-608.onnx' 57 | engine_file_path = "yolov3-608.trt" 58 | input_image_path = "./images/b.jpg" 59 | 60 | # Two-dimensional tuple with the target network's (spatial) input resolution in HW ordered 61 | input_resolution_yolov3_HW = (608, 608) 62 | 63 | # Create a pre-processor object by specifying the required input resolution for YOLOv3 64 | preprocessor = PreprocessYOLO(input_resolution_yolov3_HW) 65 | 66 | # Load an image from the specified input path, and return it together with a pre-processed version 67 | image_raw, image = preprocessor.process(input_image_path) 68 | 69 | # Store the shape of the original input image in WH format, we will need it for later 70 | shape_orig_WH = image_raw.size 71 | 72 | # Output shapes expected by the post-processor 73 | output_shapes = [(1, 255, 19, 19), (1, 255, 38, 38), (1, 255, 76, 76)] 74 | # output_shapes = [(1, 255, 13, 13), (1, 255, 26, 26), (1, 255, 52, 52)] 75 | 76 | # Do inference with TensorRT 77 | trt_outputs = [] 78 | a = torch.cuda.FloatTensor() 79 | average_inference_time = 0 80 | average_yolo_time = 0 81 | counter = 10 82 | with get_engine(onnx_file_path, engine_file_path) as engine, engine.create_execution_context() as context: 83 | inputs, outputs, bindings, stream = common.allocate_buffers(engine) 84 | while counter: 85 | # Do inference 86 | print('Running inference on image {}...'.format(input_image_path)) 87 | # Set host input to the image. The common.do_inference function will copy the input to the GPU before executing. 
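# Note: each element of trt_outputs comes back as a flat array; it is reshaped later in the loop to the matching entry of output_shapes, where 255 channels = 3 anchors * (5 + 80 classes).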
88 | inference_start = time.time() 89 | inputs[0].host = image 90 | trt_outputs = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) 91 | inference_end = time.time() 92 | inference_time = inference_end-inference_start 93 | average_inference_time = average_inference_time + inference_time 94 | print('inference time : %f' % (inference_end-inference_start)) 95 | 96 | # Do yolo_layer with pytorch 97 | inp_dim = 608 98 | num_classes = 80 99 | CUDA = True 100 | yolo_anchors = [[(116, 90), (156, 198), (373, 326)], 101 | [(30, 61), (62, 45), (59, 119)], 102 | [(10, 13), (16, 30), (33, 23)]] 103 | write = 0 104 | yolo_start = time.time() 105 | for output, shape, anchors in zip(trt_outputs, output_shapes, yolo_anchors): 106 | output = output.reshape(shape) 107 | trt_output = torch.from_numpy(output).cuda() 108 | trt_output = trt_output.data 109 | trt_output = predict_transform(trt_output, inp_dim, anchors, num_classes, CUDA) 110 | 111 | if type(trt_output) == int: 112 | continue 113 | 114 | if not write: 115 | detections = trt_output 116 | write = 1 117 | 118 | else: 119 | detections = torch.cat((detections, trt_output), 1) 120 | dets = dynamic_write_results(detections, 0.5, num_classes, nms=True, nms_conf=0.45) #0.008 121 | yolo_end = time.time() 122 | yolo_time = yolo_end-yolo_start 123 | average_yolo_time = average_yolo_time + yolo_time 124 | print('yolo time : %f' % (yolo_end-yolo_start)) 125 | print('all time : %f' % (yolo_end-inference_start)) 126 | counter = counter -1 127 | 128 | average_yolo_time = average_yolo_time/10 129 | average_inference_time = average_inference_time/10 130 | print("--------------------------------------------------------") 131 | print('average yolo time : %f' % (average_yolo_time)) 132 | print('average inference time : %f' % (average_inference_time)) 133 | print("--------------------------------------------------------") 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | import cv2 10 | #import matplotlib.pyplot as plt 11 | try: 12 | from bbox import bbox_iou 13 | except ImportError: 14 | from yolo.bbox import bbox_iou 15 | 16 | 17 | def count_parameters(model): 18 | return sum(p.numel() for p in model.parameters()) 19 | 20 | def count_learnable_parameters(model): 21 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 22 | 23 | def convert2cpu(matrix): 24 | if matrix.is_cuda: 25 | return torch.FloatTensor(matrix.size()).copy_(matrix) 26 | else: 27 | return matrix 28 | 29 | def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True): 30 | batch_size = prediction.size(0) 31 | stride = inp_dim // prediction.size(2) 32 | grid_size = inp_dim // stride 33 | bbox_attrs = 5 + num_classes 34 | num_anchors = len(anchors) 35 | 36 | anchors = [(a[0]/stride, a[1]/stride) for a in anchors] 37 | 38 | 39 | 40 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 41 | prediction = prediction.transpose(1,2).contiguous() 42 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 43 | 44 | 45 | #Sigmoid the centre_X, centre_Y. 
and object confidencce 46 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 47 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 48 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 49 | 50 | 51 | 52 | #Add the center offsets 53 | grid_len = np.arange(grid_size) 54 | a,b = np.meshgrid(grid_len, grid_len) 55 | 56 | x_offset = torch.FloatTensor(a).view(-1,1) 57 | y_offset = torch.FloatTensor(b).view(-1,1) 58 | 59 | if CUDA: 60 | x_offset = x_offset.cuda() 61 | y_offset = y_offset.cuda() 62 | 63 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 64 | 65 | prediction[:,:,:2] += x_y_offset 66 | 67 | #log space transform height and the width 68 | anchors = torch.FloatTensor(anchors) 69 | 70 | if CUDA: 71 | anchors = anchors.cuda() 72 | 73 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 74 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 75 | 76 | #Softmax the class scores 77 | prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes])) 78 | 79 | prediction[:,:,:4] *= stride 80 | 81 | 82 | return prediction 83 | 84 | def load_classes(namesfile): 85 | fp = open(namesfile, "r") 86 | names = fp.read().split("\n")[:-1] 87 | return names 88 | 89 | def get_im_dim(im): 90 | im = cv2.imread(im) 91 | w,h = im.shape[1], im.shape[0] 92 | return w,h 93 | 94 | def unique(tensor): 95 | tensor_np = tensor.cpu().numpy() 96 | unique_np = np.unique(tensor_np) 97 | unique_tensor = torch.from_numpy(unique_np) 98 | 99 | tensor_res = tensor.new(unique_tensor.shape) 100 | tensor_res.copy_(unique_tensor) 101 | return tensor_res 102 | 103 | def dynamic_write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4): 104 | prediction_bak = prediction.clone() 105 | dets = write_results(prediction.clone(), confidence, num_classes, nms, nms_conf) 106 | if isinstance(dets, int): 107 | return dets 108 | 109 | if dets.shape[0] > 100: 110 | nms_conf -= 0.05 111 | dets = write_results(prediction_bak.clone(), confidence, num_classes, nms, nms_conf) 112 | 113 | return dets 114 | 115 | 116 | def write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4): 117 | conf_mask = (prediction[:, :, 4] > confidence).float().float().unsqueeze(2) 118 | prediction = prediction * conf_mask 119 | 120 | try: 121 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 122 | except: 123 | return 0 124 | 125 | box_a = prediction.new(prediction.shape) 126 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 127 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 128 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 129 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 130 | prediction[:,:,:4] = box_a[:,:,:4] 131 | 132 | batch_size = prediction.size(0) 133 | 134 | output = prediction.new(1, prediction.size(2) + 1) 135 | write = False 136 | num = 0 137 | for ind in range(batch_size): 138 | #select the image from the batch 139 | image_pred = prediction[ind] 140 | 141 | #Get the class having maximum score, and the index of that class 142 | #Get rid of num_classes softmax scores 143 | #Add the class index and the class score of class having maximum score 144 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 145 | max_conf = max_conf.float().unsqueeze(1) 146 | max_conf_score = max_conf_score.float().unsqueeze(1) 147 | seq = (image_pred[:,:5], max_conf, max_conf_score) 148 | image_pred = torch.cat(seq, 1) 149 | 150 | #Get rid of the zero 
entries 151 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 152 | 153 | image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7) 154 | 155 | #Get the various classes detected in the image 156 | try: 157 | img_classes = unique(image_pred_[:,-1]) 158 | except: 159 | continue 160 | 161 | #WE will do NMS classwise 162 | #print(img_classes) 163 | for cls in img_classes: 164 | # if cls != 0: #0 is the person 165 | # continue 166 | #get the detections with one particular class 167 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1) 168 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 169 | 170 | image_pred_class = image_pred_[class_mask_ind].view(-1,7) 171 | 172 | #sort the detections such that the entry with the maximum objectness 173 | #confidence is at the top 174 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 175 | image_pred_class = image_pred_class[conf_sort_index] 176 | idx = image_pred_class.size(0) 177 | 178 | #if nms has to be done 179 | if nms: 180 | #For each detection 181 | for i in range(idx): 182 | #Get the IOUs of all boxes that come after the one we are looking at 183 | #in the loop 184 | try: 185 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 186 | except ValueError: 187 | break 188 | 189 | except IndexError: 190 | break 191 | 192 | #Zero out all the detections that have IoU > treshhold 193 | iou_mask = (ious < nms_conf).float().unsqueeze(1) 194 | image_pred_class[i+1:] *= iou_mask 195 | 196 | #Remove the non-zero entries 197 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 198 | image_pred_class = image_pred_class[non_zero_ind].view(-1,7) 199 | 200 | #if nms has to be done 201 | # if nms: 202 | # # Perform non-maximum suppression 203 | # max_detections = [] 204 | # while image_pred_class.size(0): 205 | # # Get detection with highest confidence and save as max detection 206 | # max_detections.append(image_pred_class[0].unsqueeze(0)) 207 | # # Stop if we're at the last detection 208 | # if len(image_pred_class) == 1: 209 | # break 210 | # # Get the IOUs for all boxes with lower confidence 211 | # ious = bbox_iou(max_detections[-1], image_pred_class[1:]) 212 | # # Remove detections with IoU >= NMS threshold 213 | # image_pred_class = image_pred_class[1:][ious < nms_conf] 214 | 215 | # image_pred_class = torch.cat(max_detections).data 216 | 217 | 218 | #Concatenate the batch_id of the image to the detection 219 | #this helps us identify which image does the detection correspond to 220 | #We use a linear straucture to hold ALL the detections from the batch 221 | #the batch_dim is flattened 222 | #batch is identified by extra batch column 223 | 224 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 225 | seq = batch_ind, image_pred_class 226 | if not write: 227 | output = torch.cat(seq,1) 228 | write = True 229 | else: 230 | out = torch.cat(seq,1) 231 | output = torch.cat((output,out)) 232 | num += 1 233 | 234 | if not num: 235 | return 0 236 | 237 | return output 238 | 239 | #!/usr/bin/env python3 240 | # -*- coding: utf-8 -*- 241 | """ 242 | Created on Sat Mar 24 00:12:16 2018 243 | 244 | @author: ayooshmac 245 | """ 246 | 247 | def predict_transform_half(prediction, inp_dim, anchors, num_classes, CUDA = True): 248 | batch_size = prediction.size(0) 249 | stride = inp_dim // prediction.size(2) 250 | 251 | bbox_attrs = 5 + num_classes 252 | num_anchors = len(anchors) 253 | grid_size = inp_dim // stride 254 | 255 | 256 | prediction = 
prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 257 | prediction = prediction.transpose(1,2).contiguous() 258 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 259 | 260 | 261 | #Sigmoid the centre_X, centre_Y. and object confidencce 262 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 263 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 264 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 265 | 266 | 267 | #Add the center offsets 268 | grid_len = np.arange(grid_size) 269 | a,b = np.meshgrid(grid_len, grid_len) 270 | 271 | x_offset = torch.FloatTensor(a).view(-1,1) 272 | y_offset = torch.FloatTensor(b).view(-1,1) 273 | 274 | if CUDA: 275 | x_offset = x_offset.cuda().half() 276 | y_offset = y_offset.cuda().half() 277 | 278 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 279 | 280 | prediction[:,:,:2] += x_y_offset 281 | 282 | #log space transform height and the width 283 | anchors = torch.HalfTensor(anchors) 284 | 285 | if CUDA: 286 | anchors = anchors.cuda() 287 | 288 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 289 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 290 | 291 | #Softmax the class scores 292 | prediction[:,:,5: 5 + num_classes] = nn.Softmax(-1)(Variable(prediction[:,:, 5 : 5 + num_classes])).data 293 | 294 | prediction[:,:,:4] *= stride 295 | 296 | 297 | return prediction 298 | 299 | 300 | def write_results_half(prediction, confidence, num_classes, nms = True, nms_conf = 0.4): 301 | conf_mask = (prediction[:,:,4] > confidence).half().unsqueeze(2) 302 | prediction = prediction*conf_mask 303 | 304 | try: 305 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 306 | except: 307 | return 0 308 | 309 | 310 | 311 | box_a = prediction.new(prediction.shape) 312 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 313 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 314 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 315 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 316 | prediction[:,:,:4] = box_a[:,:,:4] 317 | 318 | 319 | 320 | batch_size = prediction.size(0) 321 | 322 | output = prediction.new(1, prediction.size(2) + 1) 323 | write = False 324 | 325 | for ind in range(batch_size): 326 | #select the image from the batch 327 | image_pred = prediction[ind] 328 | 329 | 330 | #Get the class having maximum score, and the index of that class 331 | #Get rid of num_classes softmax scores 332 | #Add the class index and the class score of class having maximum score 333 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 334 | max_conf = max_conf.half().unsqueeze(1) 335 | max_conf_score = max_conf_score.half().unsqueeze(1) 336 | seq = (image_pred[:,:5], max_conf, max_conf_score) 337 | image_pred = torch.cat(seq, 1) 338 | 339 | 340 | #Get rid of the zero entries 341 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 342 | try: 343 | image_pred_ = image_pred[non_zero_ind.squeeze(),:] 344 | except: 345 | continue 346 | 347 | #Get the various classes detected in the image 348 | img_classes = unique(image_pred_[:,-1].long()).half() 349 | 350 | 351 | 352 | 353 | #WE will do NMS classwise 354 | for cls in img_classes: 355 | #get the detections with one particular class 356 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).half().unsqueeze(1) 357 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 358 | 359 | 360 | image_pred_class = 
image_pred_[class_mask_ind] 361 | 362 | 363 | #sort the detections such that the entry with the maximum objectness 364 | #confidence is at the top 365 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 366 | image_pred_class = image_pred_class[conf_sort_index] 367 | idx = image_pred_class.size(0) 368 | 369 | #if nms has to be done 370 | if nms: 371 | #For each detection 372 | for i in range(idx): 373 | #Get the IOUs of all boxes that come after the one we are looking at 374 | #in the loop 375 | try: 376 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 377 | except ValueError: 378 | break 379 | 380 | except IndexError: 381 | break 382 | 383 | #Zero out all the detections that have IoU > treshhold 384 | iou_mask = (ious < nms_conf).half().unsqueeze(1) 385 | image_pred_class[i+1:] *= iou_mask 386 | 387 | #Remove the non-zero entries 388 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 389 | image_pred_class = image_pred_class[non_zero_ind] 390 | 391 | 392 | 393 | #Concatenate the batch_id of the image to the detection 394 | #this helps us identify which image does the detection correspond to 395 | #We use a linear straucture to hold ALL the detections from the batch 396 | #the batch_dim is flattened 397 | #batch is identified by extra batch column 398 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 399 | seq = batch_ind, image_pred_class 400 | 401 | if not write: 402 | output = torch.cat(seq,1) 403 | write = True 404 | else: 405 | out = torch.cat(seq,1) 406 | output = torch.cat((output,out)) 407 | 408 | return output 409 | -------------------------------------------------------------------------------- /yolov3-608.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | 
activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 
330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 
550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 
766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /yolov3_to_onnx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | from __future__ import print_function 4 | from collections import OrderedDict 5 | import hashlib 6 | import os.path 7 | 8 | import wget 9 | 10 | import onnx 11 | from onnx import helper 12 | from onnx import TensorProto 13 | import numpy as np 14 | 15 | import sys 16 | 17 | class DarkNetParser(object): 18 | """Definition of a parser for DarkNet-based YOLOv3-608 (only tested for this topology).""" 19 | 20 | def __init__(self, supported_layers): 21 | """Initializes a DarkNetParser object. 22 | 23 | Keyword argument: 24 | supported_layers -- a string list of supported layers in DarkNet naming convention, 25 | parameters are only added to the class dictionary if a parsed layer is included. 26 | """ 27 | 28 | # A list of YOLOv3 layers containing dictionaries with all layer 29 | # parameters: 30 | self.layer_configs = OrderedDict() 31 | self.supported_layers = supported_layers 32 | self.layer_counter = 0 33 | 34 | def parse_cfg_file(self, cfg_file_path): 35 | """Takes the yolov3.cfg file and parses it layer by layer, 36 | appending each layer's parameters as a dictionary to layer_configs. 37 | 38 | Keyword argument: 39 | cfg_file_path -- path to the yolov3.cfg file as string 40 | """ 41 | with open(cfg_file_path, 'rb') as cfg_file: 42 | remainder = cfg_file.read() 43 | while remainder is not None: 44 | layer_dict, layer_name, remainder = self._next_layer(remainder) 45 | if layer_dict is not None: 46 | self.layer_configs[layer_name] = layer_dict 47 | return self.layer_configs 48 | 49 | def _next_layer(self, remainder): 50 | """Takes in a string and segments it by looking for DarkNet delimiters. 51 | Returns the layer parameters and the remaining string after the last delimiter. 52 | Example for the first Conv layer in yolo.cfg ... 53 | 54 | [convolutional] 55 | batch_normalize=1 56 | filters=32 57 | size=3 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | ... becomes the following layer_dict return value: 63 | {'activation': 'leaky', 'stride': 1, 'pad': 1, 'filters': 32, 64 | 'batch_normalize': 1, 'type': 'convolutional', 'size': 3}. 65 | 66 | '001_convolutional' is returned as layer_name, and all lines that follow in yolo.cfg 67 | are returned as the next remainder. 
68 | 69 | Keyword argument: 70 | remainder -- a string with all raw text after the previously parsed layer 71 | """ 72 | remainder = remainder.split('[', 1) 73 | if len(remainder) == 2: 74 | remainder = remainder[1] 75 | else: 76 | return None, None, None 77 | remainder = remainder.split(']', 1) 78 | if len(remainder) == 2: 79 | layer_type, remainder = remainder 80 | else: 81 | return None, None, None 82 | if remainder.replace(' ', '')[0] == '#': 83 | remainder = remainder.split('\n', 1)[1] 84 | 85 | layer_param_block, remainder = remainder.split('\n\n', 1) 86 | layer_param_lines = layer_param_block.split('\n')[1:] 87 | layer_name = str(self.layer_counter).zfill(3) + '_' + layer_type 88 | layer_dict = dict(type=layer_type) 89 | if layer_type in self.supported_layers: 90 | for param_line in layer_param_lines: 91 | if param_line[0] == '#': 92 | continue 93 | param_type, param_value = self._parse_params(param_line) 94 | layer_dict[param_type] = param_value 95 | self.layer_counter += 1 96 | return layer_dict, layer_name, remainder 97 | 98 | def _parse_params(self, param_line): 99 | """Identifies the parameters contained in one of the cfg file and returns 100 | them in the required format for each parameter type, e.g. as a list, an int or a float. 101 | 102 | Keyword argument: 103 | param_line -- one parsed line within a layer block 104 | """ 105 | param_line = param_line.replace(' ', '') 106 | param_type, param_value_raw = param_line.split('=') 107 | param_value = None 108 | if param_type == 'layers': 109 | layer_indexes = list() 110 | for index in param_value_raw.split(','): 111 | layer_indexes.append(int(index)) 112 | param_value = layer_indexes 113 | elif isinstance(param_value_raw, str) and not param_value_raw.isalpha(): 114 | condition_param_value_positive = param_value_raw.isdigit() 115 | condition_param_value_negative = param_value_raw[0] == '-' and \ 116 | param_value_raw[1:].isdigit() 117 | if condition_param_value_positive or condition_param_value_negative: 118 | param_value = int(param_value_raw) 119 | else: 120 | param_value = float(param_value_raw) 121 | else: 122 | param_value = str(param_value_raw) 123 | return param_type, param_value 124 | 125 | 126 | class MajorNodeSpecs(object): 127 | """Helper class used to store the names of ONNX output names, 128 | corresponding to the output of a DarkNet layer and its output channels. 129 | Some DarkNet layers are not created and there is no corresponding ONNX node, 130 | but we still need to track them in order to set up skip connections. 131 | """ 132 | 133 | def __init__(self, name, channels): 134 | """ Initialize a MajorNodeSpecs object. 135 | 136 | Keyword arguments: 137 | name -- name of the ONNX node 138 | channels -- number of output channels of this node 139 | """ 140 | self.name = name 141 | self.channels = channels 142 | self.created_onnx_node = False 143 | if name is not None and isinstance(channels, int) and channels > 0: 144 | self.created_onnx_node = True 145 | 146 | 147 | class ConvParams(object): 148 | """Helper class to store the hyper parameters of a Conv layer, 149 | including its prefix name in the ONNX graph and the expected dimensions 150 | of weights for convolution, bias, and batch normalization. 151 | 152 | Additionally acts as a wrapper for generating safe names for all 153 | weights, checking on feasible combinations. 154 | """ 155 | 156 | def __init__(self, node_name, batch_normalize, conv_weight_dims): 157 | """Constructor based on the base node name (e.g. 
101_convolutional), the batch 158 | normalization setting, and the convolutional weights shape. 159 | 160 | Keyword arguments: 161 | node_name -- base name of this YOLO convolutional layer 162 | batch_normalize -- bool value if batch normalization is used 163 | conv_weight_dims -- the dimensions of this layer's convolutional weights 164 | """ 165 | self.node_name = node_name 166 | self.batch_normalize = batch_normalize 167 | assert len(conv_weight_dims) == 4 168 | self.conv_weight_dims = conv_weight_dims 169 | 170 | def generate_param_name(self, param_category, suffix): 171 | """Generates a name based on two string inputs, 172 | and checks if the combination is valid.""" 173 | assert suffix 174 | assert param_category in ['bn', 'conv'] 175 | assert(suffix in ['scale', 'mean', 'var', 'weights', 'bias']) 176 | if param_category == 'bn': 177 | assert self.batch_normalize 178 | assert suffix in ['scale', 'bias', 'mean', 'var'] 179 | elif param_category == 'conv': 180 | assert suffix in ['weights', 'bias'] 181 | if suffix == 'bias': 182 | assert not self.batch_normalize 183 | param_name = self.node_name + '_' + param_category + '_' + suffix 184 | return param_name 185 | 186 | 187 | class WeightLoader(object): 188 | """Helper class used for loading the serialized weights of a binary file stream 189 | and returning the initializers and the input tensors required for populating 190 | the ONNX graph with weights. 191 | """ 192 | 193 | def __init__(self, weights_file_path): 194 | """Initialized with a path to the YOLOv3 .weights file. 195 | 196 | Keyword argument: 197 | weights_file_path -- path to the weights file. 198 | """ 199 | self.weights_file = self._open_weights_file(weights_file_path) 200 | 201 | def load_conv_weights(self, conv_params): 202 | """Returns the initializers with weights from the weights file and 203 | the input tensors of a convolutional layer for all corresponding ONNX nodes. 204 | 205 | Keyword argument: 206 | conv_params -- a ConvParams object 207 | """ 208 | initializer = list() 209 | inputs = list() 210 | if conv_params.batch_normalize: 211 | bias_init, bias_input = self._create_param_tensors( 212 | conv_params, 'bn', 'bias') 213 | bn_scale_init, bn_scale_input = self._create_param_tensors( 214 | conv_params, 'bn', 'scale') 215 | bn_mean_init, bn_mean_input = self._create_param_tensors( 216 | conv_params, 'bn', 'mean') 217 | bn_var_init, bn_var_input = self._create_param_tensors( 218 | conv_params, 'bn', 'var') 219 | initializer.extend( 220 | [bn_scale_init, bias_init, bn_mean_init, bn_var_init]) 221 | inputs.extend([bn_scale_input, bias_input, 222 | bn_mean_input, bn_var_input]) 223 | else: 224 | bias_init, bias_input = self._create_param_tensors( 225 | conv_params, 'conv', 'bias') 226 | initializer.append(bias_init) 227 | inputs.append(bias_input) 228 | conv_init, conv_input = self._create_param_tensors( 229 | conv_params, 'conv', 'weights') 230 | initializer.append(conv_init) 231 | inputs.append(conv_input) 232 | return initializer, inputs 233 | 234 | def _open_weights_file(self, weights_file_path): 235 | """Opens a YOLOv3 DarkNet file stream and skips the header. 236 | 237 | Keyword argument: 238 | weights_file_path -- path to the weights file. 
239 | """ 240 | weights_file = open(weights_file_path, 'rb') 241 | length_header = 5 242 | np.ndarray( 243 | shape=(length_header, ), dtype='int32', buffer=weights_file.read( 244 | length_header * 4)) 245 | return weights_file 246 | 247 | def _create_param_tensors(self, conv_params, param_category, suffix): 248 | """Creates the initializers with weights from the weights file together with 249 | the input tensors. 250 | 251 | Keyword arguments: 252 | conv_params -- a ConvParams object 253 | param_category -- the category of parameters to be created ('bn' or 'conv') 254 | suffix -- a string determining the sub-type of above param_category (e.g., 255 | 'weights' or 'bias') 256 | """ 257 | param_name, param_data, param_data_shape = self._load_one_param_type( 258 | conv_params, param_category, suffix) 259 | 260 | initializer_tensor = helper.make_tensor( 261 | param_name, TensorProto.FLOAT, param_data_shape, param_data) 262 | input_tensor = helper.make_tensor_value_info( 263 | param_name, TensorProto.FLOAT, param_data_shape) 264 | return initializer_tensor, input_tensor 265 | 266 | def _load_one_param_type(self, conv_params, param_category, suffix): 267 | """Deserializes the weights from a file stream in the DarkNet order. 268 | 269 | Keyword arguments: 270 | conv_params -- a ConvParams object 271 | param_category -- the category of parameters to be created ('bn' or 'conv') 272 | suffix -- a string determining the sub-type of above param_category (e.g., 273 | 'weights' or 'bias') 274 | """ 275 | param_name = conv_params.generate_param_name(param_category, suffix) 276 | channels_out, channels_in, filter_h, filter_w = conv_params.conv_weight_dims 277 | if param_category == 'bn': 278 | param_shape = [channels_out] 279 | elif param_category == 'conv': 280 | if suffix == 'weights': 281 | param_shape = [channels_out, channels_in, filter_h, filter_w] 282 | elif suffix == 'bias': 283 | param_shape = [channels_out] 284 | param_size = np.product(np.array(param_shape)) 285 | param_data = np.ndarray( 286 | shape=param_shape, 287 | dtype='float32', 288 | buffer=self.weights_file.read(param_size * 4)) 289 | param_data = param_data.flatten().astype(float) 290 | return param_name, param_data, param_shape 291 | 292 | 293 | class GraphBuilderONNX(object): 294 | """Class for creating an ONNX graph from a previously generated list of layer dictionaries.""" 295 | 296 | def __init__(self, output_tensors): 297 | """Initialize with all DarkNet default parameters used creating YOLOv3, 298 | and specify the output tensors as an OrderedDict for their output dimensions 299 | with their names as keys. 300 | 301 | Keyword argument: 302 | output_tensors -- the output tensors as an OrderedDict containing the keys' 303 | output dimensions 304 | """ 305 | self.output_tensors = output_tensors 306 | self._nodes = list() 307 | self.graph_def = None 308 | self.input_tensor = None 309 | self.epsilon_bn = 1e-5 310 | self.momentum_bn = 0.99 311 | self.alpha_lrelu = 0.1 312 | self.param_dict = OrderedDict() 313 | self.major_node_specs = list() 314 | self.batch_size = 1 315 | 316 | def build_onnx_graph( 317 | self, 318 | layer_configs, 319 | weights_file_path, 320 | verbose=True): 321 | """Iterate over all layer configs (parsed from the DarkNet representation 322 | of YOLOv3-608), create an ONNX graph, populate it with weights from the weights 323 | file and return the graph definition. 
324 | 325 | Keyword arguments: 326 | layer_configs -- an OrderedDict object with all parsed layers' configurations 327 | weights_file_path -- location of the weights file 328 | verbose -- toggles if the graph is printed after creation (default: True) 329 | """ 330 | for layer_name in layer_configs.keys(): 331 | layer_dict = layer_configs[layer_name] 332 | major_node_specs = self._make_onnx_node(layer_name, layer_dict) 333 | if major_node_specs.name is not None: 334 | self.major_node_specs.append(major_node_specs) 335 | outputs = list() 336 | for tensor_name in self.output_tensors.keys(): 337 | output_dims = [self.batch_size, ] + \ 338 | self.output_tensors[tensor_name] 339 | output_tensor = helper.make_tensor_value_info( 340 | tensor_name, TensorProto.FLOAT, output_dims) 341 | outputs.append(output_tensor) 342 | inputs = [self.input_tensor] 343 | weight_loader = WeightLoader(weights_file_path) 344 | initializer = list() 345 | for layer_name in self.param_dict.keys(): 346 | _, layer_type = layer_name.split('_', 1) 347 | conv_params = self.param_dict[layer_name] 348 | assert layer_type == 'convolutional' 349 | initializer_layer, inputs_layer = weight_loader.load_conv_weights( 350 | conv_params) 351 | initializer.extend(initializer_layer) 352 | inputs.extend(inputs_layer) 353 | del weight_loader 354 | self.graph_def = helper.make_graph( 355 | nodes=self._nodes, 356 | name='YOLOv3-608', 357 | inputs=inputs, 358 | outputs=outputs, 359 | initializer=initializer 360 | ) 361 | if verbose: 362 | print(helper.printable_graph(self.graph_def)) 363 | model_def = helper.make_model(self.graph_def, 364 | producer_name='NVIDIA TensorRT sample') 365 | return model_def 366 | 367 | def _make_onnx_node(self, layer_name, layer_dict): 368 | """Take in a layer parameter dictionary, choose the correct function for 369 | creating an ONNX node and store the information important to graph creation 370 | as a MajorNodeSpec object. 371 | 372 | Keyword arguments: 373 | layer_name -- the layer's name (also the corresponding key in layer_configs) 374 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 375 | """ 376 | layer_type = layer_dict['type'] 377 | if self.input_tensor is None: 378 | if layer_type == 'net': 379 | major_node_output_name, major_node_output_channels = self._make_input_tensor( 380 | layer_name, layer_dict) 381 | major_node_specs = MajorNodeSpecs(major_node_output_name, 382 | major_node_output_channels) 383 | else: 384 | raise ValueError('The first node has to be of type "net".') 385 | else: 386 | node_creators = dict() 387 | node_creators['convolutional'] = self._make_conv_node 388 | node_creators['shortcut'] = self._make_shortcut_node 389 | node_creators['route'] = self._make_route_node 390 | node_creators['upsample'] = self._make_upsample_node 391 | 392 | if layer_type in node_creators.keys(): 393 | major_node_output_name, major_node_output_channels = \ 394 | node_creators[layer_type](layer_name, layer_dict) 395 | major_node_specs = MajorNodeSpecs(major_node_output_name, 396 | major_node_output_channels) 397 | else: 398 | print( 399 | 'Layer of type %s not supported, skipping ONNX node generation.' % 400 | layer_type) 401 | major_node_specs = MajorNodeSpecs(layer_name, 402 | None) 403 | return major_node_specs 404 | 405 | def _make_input_tensor(self, layer_name, layer_dict): 406 | """Create an ONNX input tensor from a 'net' layer and store the batch size. 
407 | 408 | Keyword arguments: 409 | layer_name -- the layer's name (also the corresponding key in layer_configs) 410 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 411 | """ 412 | batch_size = layer_dict['batch'] 413 | channels = layer_dict['channels'] 414 | height = layer_dict['height'] 415 | width = layer_dict['width'] 416 | self.batch_size = batch_size 417 | input_tensor = helper.make_tensor_value_info( 418 | str(layer_name), TensorProto.FLOAT, [ 419 | batch_size, channels, height, width]) 420 | self.input_tensor = input_tensor 421 | return layer_name, channels 422 | 423 | def _get_previous_node_specs(self, target_index=-1): 424 | """Get a previously generated ONNX node (skip those that were not generated). 425 | Target index can be passed for jumping to a specific index. 426 | 427 | Keyword arguments: 428 | target_index -- optional for jumping to a specific index (default: -1 for jumping 429 | to previous element) 430 | """ 431 | previous_node = None 432 | for node in self.major_node_specs[target_index::-1]: 433 | if node.created_onnx_node: 434 | previous_node = node 435 | break 436 | assert previous_node is not None 437 | return previous_node 438 | 439 | def _make_conv_node(self, layer_name, layer_dict): 440 | """Create an ONNX Conv node with optional batch normalization and 441 | activation nodes. 442 | 443 | Keyword arguments: 444 | layer_name -- the layer's name (also the corresponding key in layer_configs) 445 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 446 | """ 447 | previous_node_specs = self._get_previous_node_specs() 448 | inputs = [previous_node_specs.name] 449 | previous_channels = previous_node_specs.channels 450 | kernel_size = layer_dict['size'] 451 | stride = layer_dict['stride'] 452 | filters = layer_dict['filters'] 453 | batch_normalize = False 454 | if 'batch_normalize' in layer_dict.keys( 455 | ) and layer_dict['batch_normalize'] == 1: 456 | batch_normalize = True 457 | 458 | kernel_shape = [kernel_size, kernel_size] 459 | weights_shape = [filters, previous_channels] + kernel_shape 460 | conv_params = ConvParams(layer_name, batch_normalize, weights_shape) 461 | 462 | strides = [stride, stride] 463 | dilations = [1, 1] 464 | weights_name = conv_params.generate_param_name('conv', 'weights') 465 | inputs.append(weights_name) 466 | if not batch_normalize: 467 | bias_name = conv_params.generate_param_name('conv', 'bias') 468 | inputs.append(bias_name) 469 | 470 | conv_node = helper.make_node( 471 | 'Conv', 472 | inputs=inputs, 473 | outputs=[layer_name], 474 | kernel_shape=kernel_shape, 475 | strides=strides, 476 | auto_pad='SAME_LOWER', 477 | dilations=dilations, 478 | name=layer_name 479 | ) 480 | self._nodes.append(conv_node) 481 | inputs = [layer_name] 482 | layer_name_output = layer_name 483 | 484 | if batch_normalize: 485 | layer_name_bn = layer_name + '_bn' 486 | bn_param_suffixes = ['scale', 'bias', 'mean', 'var'] 487 | for suffix in bn_param_suffixes: 488 | bn_param_name = conv_params.generate_param_name('bn', suffix) 489 | inputs.append(bn_param_name) 490 | batchnorm_node = helper.make_node( 491 | 'BatchNormalization', 492 | inputs=inputs, 493 | outputs=[layer_name_bn], 494 | epsilon=self.epsilon_bn, 495 | momentum=self.momentum_bn, 496 | name=layer_name_bn 497 | ) 498 | self._nodes.append(batchnorm_node) 499 | inputs = [layer_name_bn] 500 | layer_name_output = layer_name_bn 501 | 502 | if layer_dict['activation'] == 'leaky': 503 | layer_name_lrelu = layer_name + '_lrelu' 504 | 505 | lrelu_node = 
helper.make_node( 506 | 'LeakyRelu', 507 | inputs=inputs, 508 | outputs=[layer_name_lrelu], 509 | name=layer_name_lrelu, 510 | alpha=self.alpha_lrelu 511 | ) 512 | self._nodes.append(lrelu_node) 513 | inputs = [layer_name_lrelu] 514 | layer_name_output = layer_name_lrelu 515 | elif layer_dict['activation'] == 'linear': 516 | pass 517 | else: 518 | print('Activation not supported.') 519 | 520 | self.param_dict[layer_name] = conv_params 521 | return layer_name_output, filters 522 | 523 | def _make_shortcut_node(self, layer_name, layer_dict): 524 | """Create an ONNX Add node with the shortcut properties from 525 | the DarkNet-based graph. 526 | 527 | Keyword arguments: 528 | layer_name -- the layer's name (also the corresponding key in layer_configs) 529 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 530 | """ 531 | shortcut_index = layer_dict['from'] 532 | activation = layer_dict['activation'] 533 | assert activation == 'linear' 534 | 535 | first_node_specs = self._get_previous_node_specs() 536 | second_node_specs = self._get_previous_node_specs( 537 | target_index=shortcut_index) 538 | assert first_node_specs.channels == second_node_specs.channels 539 | channels = first_node_specs.channels 540 | inputs = [first_node_specs.name, second_node_specs.name] 541 | shortcut_node = helper.make_node( 542 | 'Add', 543 | inputs=inputs, 544 | outputs=[layer_name], 545 | name=layer_name, 546 | ) 547 | self._nodes.append(shortcut_node) 548 | return layer_name, channels 549 | 550 | def _make_route_node(self, layer_name, layer_dict): 551 | """If the 'layers' parameter from the DarkNet configuration is only one index, continue 552 | node creation at the indicated (negative) index. Otherwise, create an ONNX Concat node 553 | with the route properties from the DarkNet-based graph. 554 | 555 | Keyword arguments: 556 | layer_name -- the layer's name (also the corresponding key in layer_configs) 557 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 558 | """ 559 | route_node_indexes = layer_dict['layers'] 560 | if len(route_node_indexes) == 1: 561 | split_index = route_node_indexes[0] 562 | assert split_index < 0 563 | # Increment by one because we skipped the YOLO layer: 564 | split_index += 1 565 | self.major_node_specs = self.major_node_specs[:split_index] 566 | layer_name = None 567 | channels = None 568 | else: 569 | inputs = list() 570 | channels = 0 571 | for index in route_node_indexes: 572 | if index > 0: 573 | # Increment by one because we count the input as a node (DarkNet 574 | # does not) 575 | index += 1 576 | route_node_specs = self._get_previous_node_specs( 577 | target_index=index) 578 | inputs.append(route_node_specs.name) 579 | channels += route_node_specs.channels 580 | assert inputs 581 | assert channels > 0 582 | 583 | route_node = helper.make_node( 584 | 'Concat', 585 | axis=1, 586 | inputs=inputs, 587 | outputs=[layer_name], 588 | name=layer_name, 589 | ) 590 | self._nodes.append(route_node) 591 | return layer_name, channels 592 | 593 | def _make_upsample_node(self, layer_name, layer_dict): 594 | """Create an ONNX Upsample node with the properties from 595 | the DarkNet-based graph. 
596 | 597 | Keyword arguments: 598 | layer_name -- the layer's name (also the corresponding key in layer_configs) 599 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 600 | """ 601 | upsample_factor = float(layer_dict['stride']) 602 | previous_node_specs = self._get_previous_node_specs() 603 | inputs = [previous_node_specs.name] 604 | channels = previous_node_specs.channels 605 | assert channels > 0 606 | upsample_node = helper.make_node( 607 | 'Upsample', 608 | mode='nearest', 609 | # For ONNX versions <0.7.0, Upsample nodes accept different parameters than 'scales': 610 | scales=[1.0, 1.0, upsample_factor, upsample_factor], 611 | inputs=inputs, 612 | outputs=[layer_name], 613 | name=layer_name, 614 | ) 615 | self._nodes.append(upsample_node) 616 | return layer_name, channels 617 | 618 | def main(): 619 | """Run the DarkNet-to-ONNX conversion for YOLOv3-608.""" 620 | # Have to use python 2 due to hashlib compatibility 621 | if sys.version_info[0] > 2: 622 | raise Exception("This script is only compatible with python2, please re-run this script \ 623 | with python2. The rest of this sample can be run with either version of python") 624 | 625 | cfg_file_path = "yolov3-608.cfg" 626 | 627 | # These are the only layers DarkNetParser will extract parameters from. The three layers of 628 | # type 'yolo' are not parsed in detail because they are included in the post-processing later: 629 | supported_layers = ['net', 'convolutional', 'shortcut', 'route', 'upsample'] 630 | 631 | # Create a DarkNetParser object, and then use it to generate an OrderedDict with all 632 | # layers' configs from the cfg file: 633 | parser = DarkNetParser(supported_layers) 634 | layer_configs = parser.parse_cfg_file(cfg_file_path) 635 | # We do not need the parser anymore once we have layer_configs: 636 | del parser 637 | 638 | # The above layer_configs contain three output tensors whose shapes we need to 639 | # know (in CHW format): 640 | output_tensor_dims = OrderedDict() 641 | #yolo-v3(608*608) 642 | output_tensor_dims['082_convolutional'] = [255, 19, 19] 643 | output_tensor_dims['094_convolutional'] = [255, 38, 38] 644 | output_tensor_dims['106_convolutional'] = [255, 76, 76] 645 | #yolo-v3(416*416) 646 | # output_tensor_dims['082_convolutional'] = [255, 13, 13] 647 | # output_tensor_dims['094_convolutional'] = [255, 26, 26] 648 | # output_tensor_dims['106_convolutional'] = [255, 52, 52] 649 | 650 | # Create a GraphBuilderONNX object with the known output tensor dimensions: 651 | builder = GraphBuilderONNX(output_tensor_dims) 652 | 653 | weights_file_path = "yolov3-608.weights" 654 | 655 | # Now generate an ONNX graph with weights from the previously parsed layer configurations 656 | # and the weights file: 657 | yolov3_model_def = builder.build_onnx_graph( 658 | layer_configs=layer_configs, 659 | weights_file_path=weights_file_path, 660 | verbose=True) 661 | # Once we have the model definition, we do not need the builder anymore: 662 | del builder 663 | 664 | # Perform a sanity check on the ONNX model definition: 665 | onnx.checker.check_model(yolov3_model_def) 666 | 667 | # Serialize the generated ONNX graph to this file: 668 | output_file_path = 'yolov3-608.onnx' 669 | onnx.save(yolov3_model_def, output_file_path) 670 | 671 | if __name__ == '__main__': 672 | main() --------------------------------------------------------------------------------
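A quick way to sanity-check the exported model before running `onnx_to_tensorrt.py` is to reload `yolov3-608.onnx` and print its input and output shapes. The sketch below is not part of the original sample; it assumes the default file name written by `main()` above and relies only on the `onnx` package that the converter already imports. The three printed outputs should match `output_tensor_dims` from `main()` plus the batch dimension: `[1, 255, 19, 19]`, `[1, 255, 38, 38]`, and `[1, 255, 76, 76]`.

```python
# inspect_onnx.py -- minimal sketch (not part of the original sample), assuming
# yolov3-608.onnx was written to the current directory by yolov3_to_onnx.py.
from __future__ import print_function

import onnx

model = onnx.load('yolov3-608.onnx')   # parse the serialized ModelProto
onnx.checker.check_model(model)        # same sanity check main() performs

def dims(value_info):
    # Static dimensions of a graph input/output tensor (NCHW for this model).
    return [d.dim_value for d in value_info.type.tensor_type.shape.dim]

# GraphBuilderONNX lists every weight tensor as a graph input, so skip the
# initializers to find the single image input ('000_net').
initializer_names = {init.name for init in model.graph.initializer}
for inp in model.graph.input:
    if inp.name not in initializer_names:
        print('input :', inp.name, dims(inp))

for out in model.graph.output:
    # Expect the three YOLO heads: 082/094/106_convolutional.
    print('output:', out.name, dims(out))
```

If these shapes look right, the model is ready to be handed to `onnx_to_tensorrt.py` for engine building and detection.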