├── images
│   ├── dog.jpg
│   ├── person.jpg
│   └── person2.jpg
├── base_module.py
├── README.md
├── alpha_yolo3_module_drawing.py
├── onnx_to_trt_1batch.py
├── onnx_to_trt_multibatch.py
├── bbox.py
├── common.py
├── trt_yolo3_module_1batch.py
├── trt_yolo3_module_multibatch.py
├── yolov3-608.cfg
├── util.py
└── weight_to_onnx.py
/images/dog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cw-zero/TensorRT_yolo3_module/HEAD/images/dog.jpg
--------------------------------------------------------------------------------
/images/person.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cw-zero/TensorRT_yolo3_module/HEAD/images/person.jpg
--------------------------------------------------------------------------------
/images/person2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cw-zero/TensorRT_yolo3_module/HEAD/images/person2.jpg
--------------------------------------------------------------------------------
/base_module.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | class BaseModule(ABC):
4 | 
5 |     @abstractmethod
6 |     def process_frame(self):
7 |         pass
8 | 
9 |     @abstractmethod
10 |     def process_frame_batch(self):
11 |         pass
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TensorRT_yolo3_module
2 | 
3 | ------
4 | 
5 | ## 1. Install TensorRT on Ubuntu
6 | ## 2. Test TensorRT_yolo3_module
7 | - a. Download yolov3.weights from [this link](https://pjreddie.com/media/files/yolov3.weights) and rename it to **yolov3-608.weights**.
8 | - b. Run `python2 weight_to_onnx.py`. This script must be run with Python 2.7; it produces a file named **yolov3-608.onnx**.
9 | - c1. Run `python3 onnx_to_trt_1batch.py` if you only need to process one image at a time (for example, you have a single camera). This script requires Python 3.x and produces **yolov3-608.trt**, which is the file we ultimately need.
10 | - c2. Run `python3 onnx_to_trt_multibatch.py` if you need to process several images at a time (for example, you have multiple cameras). This script also requires Python 3.x and produces **yolov3-608.trt**, which is the file we ultimately need. The engine is built with **FP16** precision, so the speed-up is more pronounced.
11 | - d1. Run `python3 trt_yolo3_module_1batch.py` if you chose **c1**.
12 | - d2. Run `python3 trt_yolo3_module_multibatch.py` if you chose **c2**. It detects 4 images at a time.
13 | 
14 | ## 3. Import TensorRT_yolo3_module
15 | - The detector is packaged as a **class**, so you can use it directly with an `import` statement, as sketched below.
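A minimal usage sketch for section 3, based on the `__main__` block of `trt_yolo3_module_1batch.py` and the frame-dict convention used by `process_frame_batch`. It assumes the engine file **yolov3-608.trt** has already been built (step c1) and that the snippet runs from the repository root; it is an illustration, not an additional file of the repository.

```python
# Usage sketch (assumes yolov3-608.trt was already built via step c1
# and that this snippet sits next to trt_yolo3_module_1batch.py).
import cv2
from trt_yolo3_module_1batch import trt_yolo3_module

# The module is configured with the engine path and a CUDA flag.
detector = trt_yolo3_module({'trt': 'yolov3-608.trt', 'use_cuda': True})

# Each frame is wrapped in a dict with 'img', 'data' and 'info' fields;
# the detector writes its results back into the 'data' field.
frame = {'img': cv2.imread('./images/person.jpg'), 'data': {}, 'info': {}}
results = detector.process_frame_batch([frame])

for r in results:
    print(r['data']['number'])      # number of detections
    print(r['data']['box_list'])    # boxes as [x1, x2, y1, y2]
    print(r['data']['class_list'])  # class labels (currently 'person' only)
    print(r['data']['conf_list'])   # confidence scores
```

For the multi-batch engine the same pattern applies with `trt_yolo3_module_multibatch` and a list of 4 frame dicts, as in its `__main__` block.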
16 | 
--------------------------------------------------------------------------------
/alpha_yolo3_module_drawing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import cv2
4 | """
5 | Drawing helpers for the alpha_yolo3 module.
6 | 
7 | The module must provide the following function:
8 | drawing(frame,result_dict)
9 | 
10 | Note that the dict passed in should follow this form:
11 | dict = {}
12 | dict['info']={'frame_id':1,'camera_id':0} # the 'info' field holds basic information about the image
13 | dict['img']=@#@#@#@##@# # a numpy.array opened with OpenCV
14 | dict['data']={'number':1,'box_list':[[30,10,123,23]]} # the 'data' field holds the data produced by the algorithm
15 | 
16 | """
17 | 
18 | 
19 | def drawing(frame,result_dict):
20 |     """
21 |     Draw on the input image according to the information in result_dict.
22 | 
23 |     Parameters:
24 |         frame: image # an image opened with OpenCV, in numpy.array format
25 |         result_dict: dict containing the 'info' and 'data' fields used for drawing
26 | 
27 |     Returns:
28 |         None
29 |     """
30 |     draw_bbx(frame,result_dict)
31 | 
32 | def draw_bbx(frame,result_dict):
33 |     class_color_scheme = {'person':(0,255,255),'motorbike':(72,118,255),'car':(255,191,0),'bicycle':(0,128,255),'umbrella':(72,118,255),'truck':(152,251,152),'handbag':(255,165,0),'backpack':(160,32,240)}
34 |     if 'box_list' in result_dict['data']:
35 |         if 'class_list' in result_dict['data']:
36 |             box_list = result_dict['data']['box_list']
37 |             class_list = result_dict['data']['class_list']
38 |             i = 0
39 |             for j in range(len(box_list)):
40 |                 xmin = box_list[j][0]
41 |                 xmax = box_list[j][1]
42 |                 ymin = box_list[j][2]
43 |                 ymax = box_list[j][3]
44 |                 class_name = class_list[j]
45 |                 cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), class_color_scheme[class_name], 2)
46 |                 cv2.putText(frame, '{}_{}'.format(class_name,i), (xmin+10, ymin+10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, class_color_scheme[class_name])
47 |                 i+=1
--------------------------------------------------------------------------------
/onnx_to_trt_1batch.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | 
3 | import torch
4 | import numpy as np
5 | import tensorrt as trt
6 | import pycuda.driver as cuda
7 | import pycuda.autoinit
8 | 
9 | import sys, os
10 | sys.path.insert(1, os.path.join(sys.path[0], ".."))
11 | import common
12 | 
13 | TRT_LOGGER = trt.Logger()
14 | 
15 | def get_engine(onnx_file_path, engine_file_path=""):
16 |     """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
17 |     def build_engine():
18 |         """Takes an ONNX file and creates a TensorRT engine to run inference with"""
19 |         with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
20 |             # builder.fp16_mode = True
21 |             # builder.strict_type_constraints = True
22 | 
23 |             builder.max_workspace_size = 1 << 30 # 1GB
24 |             builder.max_batch_size = 1
25 |             # Parse model file
26 |             if not os.path.exists(onnx_file_path):
27 |                 print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path))
28 |                 exit(0)
29 |             print('Loading ONNX file from path {}...'.format(onnx_file_path))
30 |             with open(onnx_file_path, 'rb') as model:
31 |                 print('Beginning ONNX file parsing')
32 |                 parser.parse(model.read())
33 |             print('Completed parsing of ONNX file')
34 |             print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
35 |             engine = builder.build_cuda_engine(network)
36 |             print("Completed creating Engine")
37 |             with open(engine_file_path, "wb") as f:
38 |                 f.write(engine.serialize())
39 |             # return engine
40 | 
41 |     if os.path.exists(engine_file_path):
42 |         print("Please 
delete yolov3-608.trt firstly, otherwise you can not get a new file") 43 | else: 44 | build_engine() 45 | 46 | def main(): 47 | 48 | """Create a TensorRT engine for ONNX-based YOLOv3-608 and run inference.""" 49 | 50 | # Try to load a previously generated YOLOv3-608 network graph in ONNX format: 51 | onnx_file_path = 'yolov3-608.onnx' 52 | engine_file_path = "yolov3-608.trt" 53 | get_engine(onnx_file_path, engine_file_path) 54 | 55 | if __name__ == '__main__': 56 | main() 57 | -------------------------------------------------------------------------------- /onnx_to_trt_multibatch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import torch 4 | import numpy as np 5 | import tensorrt as trt 6 | import pycuda.driver as cuda 7 | import pycuda.autoinit 8 | 9 | import sys, os 10 | sys.path.insert(1, os.path.join(sys.path[0], "..")) 11 | import common 12 | 13 | TRT_LOGGER = trt.Logger() 14 | 15 | def get_engine(onnx_file_path, engine_file_path=""): 16 | """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.""" 17 | def build_engine(): 18 | """Takes an ONNX file and creates a TensorRT engine to run inference with""" 19 | with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser: 20 | builder.fp16_mode = True 21 | builder.strict_type_constraints = True 22 | 23 | builder.max_workspace_size = 1 << 30 # 1GB 24 | builder.max_batch_size = 4 25 | # Parse model file 26 | if not os.path.exists(onnx_file_path): 27 | print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path)) 28 | exit(0) 29 | print('Loading ONNX file from path {}...'.format(onnx_file_path)) 30 | with open(onnx_file_path, 'rb') as model: 31 | print('Beginning ONNX file parsing') 32 | parser.parse(model.read()) 33 | print('Completed parsing of ONNX file') 34 | print('Building an engine from file {}; this may take a while...'.format(onnx_file_path)) 35 | engine = builder.build_cuda_engine(network) 36 | print("Completed creating Engine") 37 | with open(engine_file_path, "wb") as f: 38 | f.write(engine.serialize()) 39 | # return engine 40 | 41 | if os.path.exists(engine_file_path): 42 | print("Please delete yolov3-608.trt firstly, otherwise you can not get a new file") 43 | else: 44 | build_engine() 45 | 46 | def main(): 47 | 48 | """Create a TensorRT engine for ONNX-based YOLOv3-608 and run inference.""" 49 | 50 | # Try to load a previously generated YOLOv3-608 network graph in ONNX format: 51 | onnx_file_path = 'yolov3-608.onnx' 52 | engine_file_path = "yolov3-608.trt" 53 | get_engine(onnx_file_path, engine_file_path) 54 | 55 | if __name__ == '__main__': 56 | main() 57 | -------------------------------------------------------------------------------- /bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import random 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | def confidence_filter(result, confidence): 10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) 11 | result = result*conf_mask 12 | 13 | return result 14 | 15 | def confidence_filter_cls(result, confidence): 16 | max_scores = torch.max(result[:,:,5:25], 2)[0] 17 | res = torch.cat((result, max_scores),2) 18 | print(res.shape) 19 | 20 | 21 | cond_1 = (res[:,:,4] > confidence).float() 22 | cond_2 = (res[:,:,25] > 0.995).float() 
23 | 24 | conf = cond_1 + cond_2 25 | conf = torch.clamp(conf, 0.0, 1.0) 26 | conf = conf.unsqueeze(2) 27 | result = result*conf 28 | return result 29 | 30 | 31 | 32 | def get_abs_coord(box): 33 | box[2], box[3] = abs(box[2]), abs(box[3]) 34 | x1 = (box[0] - box[2]/2) - 1 35 | y1 = (box[1] - box[3]/2) - 1 36 | x2 = (box[0] + box[2]/2) - 1 37 | y2 = (box[1] + box[3]/2) - 1 38 | return x1, y1, x2, y2 39 | 40 | 41 | 42 | def sanity_fix(box): 43 | if (box[0] > box[2]): 44 | box[0], box[2] = box[2], box[0] 45 | 46 | if (box[1] > box[3]): 47 | box[1], box[3] = box[3], box[1] 48 | 49 | return box 50 | 51 | def bbox_iou(box1, box2): 52 | """ 53 | Returns the IoU of two bounding boxes 54 | 55 | 56 | """ 57 | #Get the coordinates of bounding boxes 58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3] 59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3] 60 | 61 | #get the corrdinates of the intersection rectangle 62 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 63 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 64 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 65 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 66 | 67 | #Intersection area 68 | 69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda()) 70 | # inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape)) 71 | 72 | #Union Area 73 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) 74 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) 75 | 76 | iou = inter_area / (b1_area + b2_area - inter_area) 77 | 78 | return iou 79 | 80 | 81 | def pred_corner_coord(prediction): 82 | #Get indices of non-zero confidence bboxes 83 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 84 | 85 | box = prediction[ind_nz[0], ind_nz[1]] 86 | 87 | 88 | box_a = box.new(box.shape) 89 | box_a[:,0] = (box[:,0] - box[:,2]/2) 90 | box_a[:,1] = (box[:,1] - box[:,3]/2) 91 | box_a[:,2] = (box[:,0] + box[:,2]/2) 92 | box_a[:,3] = (box[:,1] + box[:,3]/2) 93 | box[:,:4] = box_a[:,:4] 94 | 95 | prediction[ind_nz[0], ind_nz[1]] = box 96 | 97 | return prediction 98 | 99 | 100 | 101 | 102 | def write(x, batches, results, colors, classes): 103 | c1 = tuple(x[1:3].int()) 104 | c2 = tuple(x[3:5].int()) 105 | img = results[int(x[0])] 106 | cls = int(x[-1]) 107 | label = "{0}".format(classes[cls]) 108 | color = random.choice(colors) 109 | cv2.rectangle(img, c1, c2,color, 1) 110 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 111 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 112 | cv2.rectangle(img, c1, c2,color, -1) 113 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 114 | return img 115 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 1993-2018 NVIDIA Corporation. All rights reserved. 3 | # 4 | # NOTICE TO LICENSEE: 5 | # 6 | # This source code and/or documentation ("Licensed Deliverables") are 7 | # subject to NVIDIA intellectual property rights under U.S. and 8 | # international Copyright laws. 
9 | # 10 | # These Licensed Deliverables contained herein is PROPRIETARY and 11 | # CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | # conditions of a form of NVIDIA software license agreement by and 13 | # between NVIDIA and Licensee ("License Agreement") or electronically 14 | # accepted by Licensee. Notwithstanding any terms or conditions to 15 | # the contrary in the License Agreement, reproduction or disclosure 16 | # of the Licensed Deliverables to any third party without the express 17 | # written consent of NVIDIA is prohibited. 18 | # 19 | # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | # LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | # SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | # PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 23 | # NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | # DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | # NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 26 | # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | # LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | # SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | # OF THESE LICENSED DELIVERABLES. 33 | # 34 | # U.S. Government End Users. These Licensed Deliverables are a 35 | # "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 36 | # 1995), consisting of "commercial computer software" and "commercial 37 | # computer software documentation" as such terms are used in 48 38 | # C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | # only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | # 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | # U.S. Government End Users acquire the Licensed Deliverables with 42 | # only those rights set forth herein. 43 | # 44 | # Any use of the Licensed Deliverables in individual and commercial 45 | # software must include, in the user documentation and internal 46 | # comments to the code, the above Disclaimer and U.S. Government End 47 | # Users Notice. 48 | # 49 | 50 | import os 51 | import argparse 52 | import numpy as np 53 | import pycuda.driver as cuda 54 | import tensorrt as trt 55 | 56 | try: 57 | # Sometimes python2 does not understand FileNotFoundError 58 | FileNotFoundError 59 | except NameError: 60 | FileNotFoundError = IOError 61 | 62 | def GiB(val): 63 | return val * 1 << 30 64 | 65 | def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]): 66 | ''' 67 | Parses sample arguments. 68 | Args: 69 | description (str): Description of the sample. 70 | subfolder (str): The subfolder containing data relevant to this sample 71 | find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path. 72 | Returns: 73 | str: Path of data directory. 74 | Raises: 75 | FileNotFoundError 76 | ''' 77 | kDEFAULT_DATA_ROOT = os.path.abspath("/usr/src/tensorrt/data") 78 | 79 | # Standard command-line arguments for all samples. 
80 | parser = argparse.ArgumentParser(description=description) 81 | parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory.") 82 | args, unknown_args = parser.parse_known_args() 83 | 84 | # If data directory is not specified, use the default. 85 | data_root = args.datadir if args.datadir else kDEFAULT_DATA_ROOT 86 | # If the subfolder exists, append it to the path, otherwise use the provided path as-is. 87 | subfolder_path = os.path.join(data_root, subfolder) 88 | if not os.path.exists(subfolder_path): 89 | print("WARNING: " + subfolder_path + " does not exist. Using " + data_root + " instead.") 90 | data_path = subfolder_path if os.path.exists(subfolder_path) else data_root 91 | 92 | # Make sure data directory exists. 93 | if not (os.path.exists(data_path)): 94 | raise FileNotFoundError(data_path + " does not exist. Please provide the correct data path with the -d option.") 95 | 96 | # Find all requested files. 97 | for index, f in enumerate(find_files): 98 | find_files[index] = os.path.abspath(os.path.join(data_path, f)) 99 | if not os.path.exists(find_files[index]): 100 | raise FileNotFoundError(find_files[index] + " does not exist. Please provide the correct data path with the -d option.") 101 | if find_files: 102 | return data_path, find_files 103 | else: 104 | return data_path 105 | 106 | # Simple helper data class that's a little nicer to use than a 2-tuple. 107 | class HostDeviceMem(object): 108 | def __init__(self, host_mem, device_mem): 109 | self.host = host_mem 110 | self.device = device_mem 111 | 112 | def __str__(self): 113 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 114 | 115 | def __repr__(self): 116 | return self.__str__() 117 | 118 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 119 | def allocate_buffers(engine): 120 | inputs = [] 121 | outputs = [] 122 | bindings = [] 123 | stream = cuda.Stream() 124 | for binding in engine: 125 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 126 | # dtype = trt.nptype(engine.get_binding_dtype(binding)) 127 | dtype = np.float32 128 | # Allocate host and device buffers 129 | host_mem = cuda.pagelocked_empty(size, dtype) 130 | device_mem = cuda.mem_alloc(host_mem.nbytes) 131 | # Append the device buffer to device bindings. 132 | bindings.append(int(device_mem)) 133 | # Append to the appropriate list. 134 | if engine.binding_is_input(binding): 135 | inputs.append(HostDeviceMem(host_mem, device_mem)) 136 | else: 137 | outputs.append(HostDeviceMem(host_mem, device_mem)) 138 | return inputs, outputs, bindings, stream 139 | 140 | # This function is generalized for multiple inputs/outputs. 141 | # inputs and outputs are expected to be lists of HostDeviceMem objects. 142 | def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): 143 | # Transfer input data to the GPU. 144 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 145 | # Run inference. 146 | context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) 147 | # Transfer predictions back from the GPU. 148 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 149 | # Synchronize the stream 150 | stream.synchronize() 151 | # Return only the host outputs. 
152 | return [out.host for out in outputs] 153 | -------------------------------------------------------------------------------- /trt_yolo3_module_1batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import tensorrt as trt 4 | import pycuda.driver as cuda 5 | import pycuda.autoinit 6 | import time 7 | from base_module import BaseModule 8 | from util import * 9 | from alpha_yolo3_module_drawing import drawing 10 | 11 | # from data_processing import PreprocessYOLO 12 | 13 | import sys, os 14 | sys.path.insert(1, os.path.join(sys.path[0], "..")) 15 | import common 16 | 17 | TRT_LOGGER = trt.Logger() 18 | 19 | def get_engine(engine_file_path): 20 | if os.path.exists(engine_file_path): 21 | print("Reading engine from file {}".format(engine_file_path)) 22 | with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 23 | return runtime.deserialize_cuda_engine(f.read()) 24 | else: 25 | print("TRT file not found") 26 | 27 | 28 | def prep_image(orig_im, inp_dim): 29 | dim = orig_im.shape[1], orig_im.shape[0] 30 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 31 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() #(3 608 608) 32 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 33 | img_ = img_.numpy() 34 | return img_, orig_im, dim 35 | 36 | def letterbox_image(img, inp_dim): 37 | '''resize image with unchanged aspect ratio using padding''' 38 | img_w, img_h = img.shape[1], img.shape[0] 39 | w, h = inp_dim 40 | new_w = int(img_w * min(w / img_w, h / img_h)) 41 | new_h = int(img_h * min(w / img_w, h / img_h)) 42 | resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC) 43 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 44 | canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image 45 | return canvas 46 | 47 | class trt_yolo3_module(BaseModule): 48 | def __init__(self, init_dict): 49 | a = torch.cuda.FloatTensor() #pytorch必须首先占用部分CUDA 50 | builder = trt.Builder(TRT_LOGGER) 51 | builder.fp16_mode = True 52 | builder.strict_type_constraints = True 53 | self.trt_file = init_dict['trt'] 54 | self.use_cuda = init_dict['use_cuda'] 55 | self.inp_dim = 608 56 | self.num_classes = 80 57 | self.output_shapes = [(1, 255, 19, 19), (1, 255, 38, 38), (1, 255, 76, 76)] #yolo3-608 58 | self.yolo_anchors = [[(116, 90), (156, 198), (373, 326)], 59 | [(30, 61), (62, 45), (59, 119)], 60 | [(10, 13), (16, 30), (33, 23)]] 61 | 62 | self.engine = get_engine(self.trt_file) 63 | self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(self.engine) 64 | self.context = self.engine.create_execution_context() 65 | 66 | def preparing(self,orig_img_list): 67 | img = [] 68 | orig_img = [] 69 | im_name = [] 70 | im_dim_list = [] 71 | batch = 1 72 | for im in orig_img_list: 73 | im_name_k = '' 74 | img_k, orig_img_k, im_dim_list_k = prep_image(im, self.inp_dim) 75 | img.append(img_k) 76 | orig_img.append(orig_img_k) 77 | im_name.append(im_name_k) 78 | im_dim_list.append(im_dim_list_k) 79 | 80 | with torch.no_grad(): 81 | im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2) 82 | im_dim_list_ = im_dim_list 83 | 84 | procession_tuple = (img, orig_img, im_name, im_dim_list) 85 | return procession_tuple 86 | 87 | def detection(self,procession_tuple): 88 | (img, orig_img, im_name, im_dim_list) = procession_tuple 89 | # with get_engine(self.trt_file) as engine, engine.create_execution_context() as context: 90 | if 
1: 91 | # inputs, outputs, bindings, stream = common.allocate_buffers(self.engine) 92 | inference_start = time.time() 93 | self.inputs[0].host = img[0] #waiting fix bug 94 | trt_outputs = common.do_inference(self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream) 95 | inference_end = time.time() 96 | # print('inference time : %f' % (inference_end-inference_start)) 97 | write = 0 98 | for output, shape, anchors in zip(trt_outputs, self.output_shapes, self.yolo_anchors): 99 | output = output.reshape(shape) 100 | trt_output = torch.from_numpy(output).cuda().data 101 | # trt_output = trt_output.data 102 | # cuda_time1 = time.time() 103 | trt_output = predict_transform(trt_output, self.inp_dim, anchors, self.num_classes, self.use_cuda) 104 | # cuda_time2 = time.time() 105 | # print('CUDA time : %f' % (cuda_time2 - cuda_time1)) 106 | if type(trt_output) == int: 107 | continue 108 | 109 | if not write: 110 | detections = trt_output 111 | write = 1 112 | 113 | else: 114 | detections = torch.cat((detections, trt_output), 1) 115 | 116 | o_time1 = time.time() 117 | print('TensorRT inference time : %f' % (o_time1-inference_start)) 118 | dets = dynamic_write_results(detections, 0.5, self.num_classes, nms=True, nms_conf=0.45) 119 | o_time2 = time.time() 120 | print('After process time : %f' %(o_time2-o_time1)) 121 | class_list_all = [] 122 | box_list_all = [] 123 | conf_list_all = [] 124 | if not isinstance(dets,int): 125 | dets = dets.cpu() 126 | im_dim_list = torch.index_select(im_dim_list,0, dets[:, 0].long()) 127 | scaling_factor = torch.min(self.inp_dim / im_dim_list, 1)[0].view(-1, 1) 128 | dets[:, [1, 3]] -= (self.inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2 129 | dets[:, [2, 4]] -= (self.inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2 130 | dets[:, 1:5] /= scaling_factor 131 | for j in range(dets.shape[0]): 132 | dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0, im_dim_list[j, 0]) 133 | dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0, im_dim_list[j, 1]) 134 | boxes = dets[:, 1:5] 135 | scores = dets[:, 5:6] 136 | for k in range(len(orig_img)): 137 | boxes_k = boxes[dets[:,0]==k] 138 | scores_k = scores[dets[:,0]==k] 139 | class_list = [] 140 | box_list = [] 141 | for b in boxes_k: 142 | x1=int(b[0]) 143 | x2=int(b[2]) 144 | y1=int(b[1]) 145 | y2=int(b[3]) 146 | box_list.append([x1,x2,y1,y2]) 147 | class_list.append('person') 148 | 149 | score_list = scores_k.numpy().tolist() 150 | s_list = [] 151 | for s in score_list: 152 | s_list.append(s[0]) 153 | box_list_all.append(box_list) 154 | conf_list_all.append(s_list) 155 | class_list_all.append(class_list) 156 | 157 | return (class_list_all,box_list_all,conf_list_all) 158 | 159 | 160 | 161 | def dict_checkup(self,dict): 162 | if 'img' not in dict: 163 | dict['img']= '' 164 | print('no img in dict') 165 | if 'data' not in dict: 166 | dict['data']={} 167 | print('no data in dict') 168 | if 'info' not in dict: 169 | dict['info']={} 170 | print('no info in dict') 171 | 172 | def process_frame(self, frame_dic): 173 | pass 174 | 175 | def process_frame_batch(self, frame_dic_list): 176 | for dic in frame_dic_list: 177 | self.dict_checkup(dic) 178 | 179 | img_list = [] 180 | for dic in frame_dic_list: 181 | img_list.append(dic['img']) 182 | 183 | procession_tuple = self.preparing(img_list) 184 | # (img, orig_img, im_name, im_dim_list) = procession_tuple 185 | (class_list_all,box_list_all,conf_list_all) = self.detection(procession_tuple) 186 | if len(class_list_all) == 0: 187 | 
for frame_dic in frame_dic_list: 188 | frame_dic['data']['number'] = 0 189 | frame_dic['data']['box_list'] = [] 190 | frame_dic['data']['class_list'] = [] 191 | frame_dic['data']['conf_list'] = [] 192 | else: 193 | for i,frame_dic in enumerate(frame_dic_list): 194 | frame_dic['data']['number'] = len(class_list_all[i]) 195 | frame_dic['data']['box_list'] = box_list_all[i] 196 | frame_dic['data']['class_list'] = class_list_all[i] 197 | frame_dic['data']['conf_list'] = conf_list_all[i] 198 | 199 | return frame_dic_list 200 | 201 | 202 | 203 | 204 | if __name__ == '__main__': 205 | init_dict = {'trt':"yolov3-608.trt", 'use_cuda':True} 206 | alpha_yolo3_unit = trt_yolo3_module(init_dict) 207 | 208 | input_dic_list = [] 209 | img_path = './images/person.jpg' 210 | dic = {'img':cv2.imread(img_path),'data':{},'info':{}} 211 | input_dic_list.append(dic) 212 | 213 | while True: 214 | output_dic_list = alpha_yolo3_unit.process_frame_batch(input_dic_list) 215 | for dic in output_dic_list: 216 | img_array = dic['img'] 217 | drawing(img_array,dic) 218 | cv2.imshow('show',img_array) 219 | cv2.waitKey(5000) 220 | -------------------------------------------------------------------------------- /trt_yolo3_module_multibatch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import tensorrt as trt 4 | import pycuda.driver as cuda 5 | import pycuda.autoinit 6 | import time 7 | from base_module import BaseModule 8 | from util import * 9 | from alpha_yolo3_module_drawing import drawing 10 | 11 | # from data_processing import PreprocessYOLO 12 | 13 | import sys, os 14 | sys.path.insert(1, os.path.join(sys.path[0], "..")) 15 | import common 16 | 17 | TRT_LOGGER = trt.Logger() 18 | 19 | def get_engine(engine_file_path): 20 | if os.path.exists(engine_file_path): 21 | print("Reading engine from file {}".format(engine_file_path)) 22 | with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 23 | return runtime.deserialize_cuda_engine(f.read()) 24 | else: 25 | print("TRT file not found") 26 | 27 | 28 | def prep_image(orig_im, inp_dim): 29 | dim = orig_im.shape[1], orig_im.shape[0] 30 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 31 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() #(3 608 608) 32 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 33 | img_ = img_.numpy() 34 | return img_, orig_im, dim 35 | 36 | def letterbox_image(img, inp_dim): 37 | '''resize image with unchanged aspect ratio using padding''' 38 | img_w, img_h = img.shape[1], img.shape[0] 39 | w, h = inp_dim 40 | new_w = int(img_w * min(w / img_w, h / img_h)) 41 | new_h = int(img_h * min(w / img_w, h / img_h)) 42 | resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC) 43 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 44 | canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image 45 | return canvas 46 | 47 | class trt_yolo3_module(BaseModule): 48 | def __init__(self, init_dict): 49 | a = torch.cuda.FloatTensor() #pytorch必须首先占用部分CUDA 50 | builder = trt.Builder(TRT_LOGGER) 51 | builder.max_batch_size = 4 52 | builder.fp16_mode = True 53 | builder.strict_type_constraints = True 54 | self.trt_file = init_dict['trt'] 55 | self.use_cuda = init_dict['use_cuda'] 56 | self.inp_dim = 608 57 | self.num_classes = 80 58 | self.output_shapes = [(4, 255, 19, 19), (4, 255, 38, 38), (4, 255, 76, 76)] #yolo3-608 59 | self.yolo_anchors = [[(116, 90), (156, 198), (373, 
326)], 60 | [(30, 61), (62, 45), (59, 119)], 61 | [(10, 13), (16, 30), (33, 23)]] 62 | 63 | self.engine = get_engine(self.trt_file) 64 | self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(self.engine) 65 | self.context = self.engine.create_execution_context() 66 | 67 | def preparing(self,orig_img_list): 68 | img = [] 69 | orig_img = [] 70 | im_name = [] 71 | im_dim_list = [] 72 | for im in orig_img_list: 73 | im_name_k = '' 74 | img_k, orig_img_k, im_dim_list_k = prep_image(im, self.inp_dim) 75 | img.append(img_k) 76 | orig_img.append(orig_img_k) 77 | im_name.append(im_name_k) 78 | im_dim_list.append(im_dim_list_k) 79 | 80 | with torch.no_grad(): 81 | # img = torch.cat(img) 82 | im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2) 83 | im_dim_list_ = im_dim_list 84 | 85 | procession_tuple = (img, orig_img, im_name, im_dim_list) 86 | return procession_tuple 87 | 88 | def detection(self,procession_tuple): 89 | (img, orig_img, im_name, im_dim_list) = procession_tuple 90 | # aaa = np.array(img) 91 | # with get_engine(self.trt_file) as engine, engine.create_execution_context() as context: 92 | if 1: 93 | # inputs, outputs, bindings, stream = common.allocate_buffers(self.engine) 94 | inference_start = time.time() 95 | self.inputs[0].host = np.array(img) #img[0] 96 | trt_outputs = common.do_inference(self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream, batch_size=4) 97 | inference_end = time.time() 98 | # print('inference time : %f' % (inference_end-inference_start)) 99 | write = 0 100 | for output, shape, anchors in zip(trt_outputs, self.output_shapes, self.yolo_anchors): 101 | output = output.reshape(shape) 102 | trt_output = torch.from_numpy(output).cuda().data 103 | # trt_output = trt_output.data 104 | # cuda_time1 = time.time() 105 | trt_output = predict_transform(trt_output, self.inp_dim, anchors, self.num_classes, self.use_cuda) 106 | # cuda_time2 = time.time() 107 | # print('CUDA time : %f' % (cuda_time2 - cuda_time1)) 108 | if type(trt_output) == int: 109 | continue 110 | 111 | if not write: 112 | detections = trt_output 113 | write = 1 114 | 115 | else: 116 | detections = torch.cat((detections, trt_output), 1) 117 | 118 | o_time1 = time.time() 119 | print('TensorRT inference time : %f' % (o_time1-inference_start)) 120 | dets = dynamic_write_results(detections, 0.5, self.num_classes, nms=True, nms_conf=0.45) 121 | o_time2 = time.time() 122 | print('After process time : %f' %(o_time2-o_time1)) 123 | class_list_all = [] 124 | box_list_all = [] 125 | conf_list_all = [] 126 | if not isinstance(dets,int): 127 | dets = dets.cpu() 128 | im_dim_list = torch.index_select(im_dim_list,0, dets[:, 0].long()) 129 | scaling_factor = torch.min(self.inp_dim / im_dim_list, 1)[0].view(-1, 1) 130 | dets[:, [1, 3]] -= (self.inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2 131 | dets[:, [2, 4]] -= (self.inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2 132 | dets[:, 1:5] /= scaling_factor 133 | for j in range(dets.shape[0]): 134 | dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0, im_dim_list[j, 0]) 135 | dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0, im_dim_list[j, 1]) 136 | boxes = dets[:, 1:5] 137 | scores = dets[:, 5:6] 138 | for k in range(len(orig_img)): 139 | boxes_k = boxes[dets[:,0]==k] 140 | scores_k = scores[dets[:,0]==k] 141 | class_list = [] 142 | box_list = [] 143 | for b in boxes_k: 144 | x1=int(b[0]) 145 | x2=int(b[2]) 146 | y1=int(b[1]) 147 | y2=int(b[3]) 148 | 
box_list.append([x1,x2,y1,y2]) 149 | class_list.append('person') 150 | 151 | score_list = scores_k.numpy().tolist() 152 | s_list = [] 153 | for s in score_list: 154 | s_list.append(s[0]) 155 | box_list_all.append(box_list) 156 | conf_list_all.append(s_list) 157 | class_list_all.append(class_list) 158 | 159 | return (class_list_all,box_list_all,conf_list_all) 160 | 161 | 162 | 163 | def dict_checkup(self,dict): 164 | if 'img' not in dict: 165 | dict['img']= '' 166 | print('no img in dict') 167 | if 'data' not in dict: 168 | dict['data']={} 169 | print('no data in dict') 170 | if 'info' not in dict: 171 | dict['info']={} 172 | print('no info in dict') 173 | 174 | def process_frame(self, frame_dic): 175 | pass 176 | 177 | def process_frame_batch(self, frame_dic_list): 178 | for dic in frame_dic_list: 179 | self.dict_checkup(dic) 180 | 181 | img_list = [] 182 | for dic in frame_dic_list: 183 | img_list.append(dic['img']) 184 | 185 | procession_tuple = self.preparing(img_list) 186 | # (img, orig_img, im_name, im_dim_list) = procession_tuple 187 | (class_list_all,box_list_all,conf_list_all) = self.detection(procession_tuple) 188 | if len(class_list_all) == 0: 189 | for frame_dic in frame_dic_list: 190 | frame_dic['data']['number'] = 0 191 | frame_dic['data']['box_list'] = [] 192 | frame_dic['data']['class_list'] = [] 193 | frame_dic['data']['conf_list'] = [] 194 | else: 195 | for i,frame_dic in enumerate(frame_dic_list): 196 | frame_dic['data']['number'] = len(class_list_all[i]) 197 | frame_dic['data']['box_list'] = box_list_all[i] 198 | frame_dic['data']['class_list'] = class_list_all[i] 199 | frame_dic['data']['conf_list'] = conf_list_all[i] 200 | 201 | return frame_dic_list 202 | 203 | 204 | 205 | 206 | if __name__ == '__main__': 207 | init_dict = {'trt':"yolov3-608.trt", 'use_cuda':True} 208 | alpha_yolo3_unit = trt_yolo3_module(init_dict) 209 | 210 | input_dic_list = [] 211 | 212 | img_path = './images/person.jpg' 213 | dic = {'img':cv2.imread(img_path),'data':{},'info':{}} 214 | input_dic_list.append(dic) 215 | 216 | img_path = './images/person2.jpg' 217 | dic = {'img':cv2.imread(img_path),'data':{},'info':{}} 218 | input_dic_list.append(dic) 219 | 220 | img_path = './images/person.jpg' 221 | dic = {'img':cv2.imread(img_path),'data':{},'info':{}} 222 | input_dic_list.append(dic) 223 | 224 | img_path = './images/person2.jpg' 225 | dic = {'img':cv2.imread(img_path),'data':{},'info':{}} 226 | input_dic_list.append(dic) 227 | 228 | while True: 229 | output_dic_list = alpha_yolo3_unit.process_frame_batch(input_dic_list) 230 | # for dic in output_dic_list: 231 | # img_array = dic['img'] 232 | # drawing(img_array,dic) 233 | # cv2.imshow('show',img_array) 234 | # cv2.waitKey(5000) 235 | -------------------------------------------------------------------------------- /yolov3-608.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | 
activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 
264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 
| 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | 
num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | import sys 10 | if '/opt/ros/kinetic/lib/python2.7/dist-packages' in sys.path: 11 | sys.path.remove('/opt/ros/kinetic/lib/python2.7/dist-packages') 12 | import cv2 13 | #import matplotlib.pyplot as plt 14 | try: 15 | from bbox import bbox_iou 16 | except ImportError: 17 | from yolo.bbox import bbox_iou 18 | 19 | 20 | def count_parameters(model): 21 | return sum(p.numel() for p in model.parameters()) 22 | 23 | def count_learnable_parameters(model): 24 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 25 | 26 | def convert2cpu(matrix): 27 | if matrix.is_cuda: 28 | return torch.FloatTensor(matrix.size()).copy_(matrix) 29 | else: 30 | return matrix 31 | 32 | def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True): 33 | batch_size = prediction.size(0) 34 | stride = inp_dim // prediction.size(2) 35 | grid_size = inp_dim // stride 36 | bbox_attrs = 5 + num_classes 37 | num_anchors = len(anchors) 38 | 39 | anchors = [(a[0]/stride, a[1]/stride) for a in anchors] 40 | 41 | 42 | 43 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 44 | prediction = prediction.transpose(1,2).contiguous() 45 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 46 | 47 | 48 | #Sigmoid the centre_X, centre_Y. 
and object confidencce 49 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 50 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 51 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 52 | 53 | 54 | 55 | #Add the center offsets 56 | grid_len = np.arange(grid_size) 57 | a,b = np.meshgrid(grid_len, grid_len) 58 | 59 | x_offset = torch.FloatTensor(a).view(-1,1) 60 | y_offset = torch.FloatTensor(b).view(-1,1) 61 | 62 | if CUDA: 63 | x_offset = x_offset.cuda() 64 | y_offset = y_offset.cuda() 65 | 66 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 67 | 68 | prediction[:,:,:2] += x_y_offset 69 | 70 | #log space transform height and the width 71 | anchors = torch.FloatTensor(anchors) 72 | 73 | if CUDA: 74 | anchors = anchors.cuda() 75 | 76 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 77 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 78 | 79 | #Softmax the class scores 80 | prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes])) 81 | 82 | prediction[:,:,:4] *= stride 83 | 84 | 85 | return prediction 86 | 87 | def load_classes(namesfile): 88 | fp = open(namesfile, "r") 89 | names = fp.read().split("\n")[:-1] 90 | return names 91 | 92 | def get_im_dim(im): 93 | im = cv2.imread(im) 94 | w,h = im.shape[1], im.shape[0] 95 | return w,h 96 | 97 | def unique(tensor): 98 | tensor_np = tensor.cpu().numpy() 99 | unique_np = np.unique(tensor_np) 100 | unique_tensor = torch.from_numpy(unique_np) 101 | 102 | tensor_res = tensor.new(unique_tensor.shape) 103 | tensor_res.copy_(unique_tensor) 104 | return tensor_res 105 | 106 | def dynamic_write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4): 107 | prediction_bak = prediction.clone() 108 | dets = write_results(prediction.clone(), confidence, num_classes, nms, nms_conf) 109 | if isinstance(dets, int): 110 | return dets 111 | 112 | if dets.shape[0] > 100: 113 | nms_conf -= 0.05 114 | dets = write_results(prediction_bak.clone(), confidence, num_classes, nms, nms_conf) 115 | 116 | return dets 117 | 118 | 119 | def write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4): 120 | conf_mask = (prediction[:, :, 4] > confidence).float().float().unsqueeze(2) 121 | prediction = prediction * conf_mask 122 | 123 | try: 124 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 125 | except: 126 | return 0 127 | 128 | box_a = prediction.new(prediction.shape) 129 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 130 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 131 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 132 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 133 | prediction[:,:,:4] = box_a[:,:,:4] 134 | 135 | batch_size = prediction.size(0) 136 | 137 | output = prediction.new(1, prediction.size(2) + 1) 138 | write = False 139 | num = 0 140 | for ind in range(batch_size): 141 | #select the image from the batch 142 | image_pred = prediction[ind] 143 | 144 | #Get the class having maximum score, and the index of that class 145 | #Get rid of num_classes softmax scores 146 | #Add the class index and the class score of class having maximum score 147 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 148 | max_conf = max_conf.float().unsqueeze(1) 149 | max_conf_score = max_conf_score.float().unsqueeze(1) 150 | seq = (image_pred[:,:5], max_conf, max_conf_score) 151 | image_pred = torch.cat(seq, 1) 152 | 153 | #Get rid of the zero 
entries 154 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 155 | 156 | image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7) 157 | 158 | #Get the various classes detected in the image 159 | try: 160 | img_classes = unique(image_pred_[:,-1]) 161 | except: 162 | continue 163 | 164 | #WE will do NMS classwise 165 | #print(img_classes) 166 | for cls in img_classes: 167 | if cls != 0: #0 is the person 168 | continue 169 | #get the detections with one particular class 170 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1) 171 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 172 | 173 | image_pred_class = image_pred_[class_mask_ind].view(-1,7) 174 | 175 | #sort the detections such that the entry with the maximum objectness 176 | #confidence is at the top 177 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 178 | image_pred_class = image_pred_class[conf_sort_index] 179 | idx = image_pred_class.size(0) 180 | 181 | #if nms has to be done 182 | if nms: 183 | #For each detection 184 | for i in range(idx): 185 | #Get the IOUs of all boxes that come after the one we are looking at 186 | #in the loop 187 | try: 188 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 189 | except ValueError: 190 | break 191 | 192 | except IndexError: 193 | break 194 | 195 | #Zero out all the detections that have IoU > treshhold 196 | iou_mask = (ious < nms_conf).float().unsqueeze(1) 197 | image_pred_class[i+1:] *= iou_mask 198 | 199 | #Remove the non-zero entries 200 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 201 | image_pred_class = image_pred_class[non_zero_ind].view(-1,7) 202 | 203 | #if nms has to be done 204 | # if nms: 205 | # # Perform non-maximum suppression 206 | # max_detections = [] 207 | # while image_pred_class.size(0): 208 | # # Get detection with highest confidence and save as max detection 209 | # max_detections.append(image_pred_class[0].unsqueeze(0)) 210 | # # Stop if we're at the last detection 211 | # if len(image_pred_class) == 1: 212 | # break 213 | # # Get the IOUs for all boxes with lower confidence 214 | # ious = bbox_iou(max_detections[-1], image_pred_class[1:]) 215 | # # Remove detections with IoU >= NMS threshold 216 | # image_pred_class = image_pred_class[1:][ious < nms_conf] 217 | 218 | # image_pred_class = torch.cat(max_detections).data 219 | 220 | 221 | #Concatenate the batch_id of the image to the detection 222 | #this helps us identify which image does the detection correspond to 223 | #We use a linear straucture to hold ALL the detections from the batch 224 | #the batch_dim is flattened 225 | #batch is identified by extra batch column 226 | 227 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 228 | seq = batch_ind, image_pred_class 229 | if not write: 230 | output = torch.cat(seq,1) 231 | write = True 232 | else: 233 | out = torch.cat(seq,1) 234 | output = torch.cat((output,out)) 235 | num += 1 236 | 237 | if not num: 238 | return 0 239 | 240 | return output 241 | 242 | #!/usr/bin/env python3 243 | # -*- coding: utf-8 -*- 244 | """ 245 | Created on Sat Mar 24 00:12:16 2018 246 | 247 | @author: ayooshmac 248 | """ 249 | 250 | def predict_transform_half(prediction, inp_dim, anchors, num_classes, CUDA = True): 251 | batch_size = prediction.size(0) 252 | stride = inp_dim // prediction.size(2) 253 | 254 | bbox_attrs = 5 + num_classes 255 | num_anchors = len(anchors) 256 | grid_size = inp_dim // stride 257 | 258 | 259 | prediction = 
prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 260 | prediction = prediction.transpose(1,2).contiguous() 261 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 262 | 263 | 264 | #Sigmoid the centre_X, centre_Y. and object confidencce 265 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 266 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 267 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 268 | 269 | 270 | #Add the center offsets 271 | grid_len = np.arange(grid_size) 272 | a,b = np.meshgrid(grid_len, grid_len) 273 | 274 | x_offset = torch.FloatTensor(a).view(-1,1) 275 | y_offset = torch.FloatTensor(b).view(-1,1) 276 | 277 | if CUDA: 278 | x_offset = x_offset.cuda().half() 279 | y_offset = y_offset.cuda().half() 280 | 281 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 282 | 283 | prediction[:,:,:2] += x_y_offset 284 | 285 | #log space transform height and the width 286 | anchors = torch.HalfTensor(anchors) 287 | 288 | if CUDA: 289 | anchors = anchors.cuda() 290 | 291 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 292 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 293 | 294 | #Softmax the class scores 295 | prediction[:,:,5: 5 + num_classes] = nn.Softmax(-1)(Variable(prediction[:,:, 5 : 5 + num_classes])).data 296 | 297 | prediction[:,:,:4] *= stride 298 | 299 | 300 | return prediction 301 | 302 | 303 | def write_results_half(prediction, confidence, num_classes, nms = True, nms_conf = 0.4): 304 | conf_mask = (prediction[:,:,4] > confidence).half().unsqueeze(2) 305 | prediction = prediction*conf_mask 306 | 307 | try: 308 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 309 | except: 310 | return 0 311 | 312 | 313 | 314 | box_a = prediction.new(prediction.shape) 315 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 316 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 317 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 318 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 319 | prediction[:,:,:4] = box_a[:,:,:4] 320 | 321 | 322 | 323 | batch_size = prediction.size(0) 324 | 325 | output = prediction.new(1, prediction.size(2) + 1) 326 | write = False 327 | 328 | for ind in range(batch_size): 329 | #select the image from the batch 330 | image_pred = prediction[ind] 331 | 332 | 333 | #Get the class having maximum score, and the index of that class 334 | #Get rid of num_classes softmax scores 335 | #Add the class index and the class score of class having maximum score 336 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 337 | max_conf = max_conf.half().unsqueeze(1) 338 | max_conf_score = max_conf_score.half().unsqueeze(1) 339 | seq = (image_pred[:,:5], max_conf, max_conf_score) 340 | image_pred = torch.cat(seq, 1) 341 | 342 | 343 | #Get rid of the zero entries 344 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 345 | try: 346 | image_pred_ = image_pred[non_zero_ind.squeeze(),:] 347 | except: 348 | continue 349 | 350 | #Get the various classes detected in the image 351 | img_classes = unique(image_pred_[:,-1].long()).half() 352 | 353 | 354 | 355 | 356 | #WE will do NMS classwise 357 | for cls in img_classes: 358 | #get the detections with one particular class 359 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).half().unsqueeze(1) 360 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 361 | 362 | 363 | image_pred_class = 
image_pred_[class_mask_ind] 364 | 365 | 366 | #sort the detections such that the entry with the maximum objectness 367 | #confidence is at the top 368 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 369 | image_pred_class = image_pred_class[conf_sort_index] 370 | idx = image_pred_class.size(0) 371 | 372 | #if nms has to be done 373 | if nms: 374 | #For each detection 375 | for i in range(idx): 376 | #Get the IOUs of all boxes that come after the one we are looking at 377 | #in the loop 378 | try: 379 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 380 | except ValueError: 381 | break 382 | 383 | except IndexError: 384 | break 385 | 386 | #Zero out all the detections that have IoU > treshhold 387 | iou_mask = (ious < nms_conf).half().unsqueeze(1) 388 | image_pred_class[i+1:] *= iou_mask 389 | 390 | #Remove the non-zero entries 391 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 392 | image_pred_class = image_pred_class[non_zero_ind] 393 | 394 | 395 | 396 | #Concatenate the batch_id of the image to the detection 397 | #this helps us identify which image does the detection correspond to 398 | #We use a linear straucture to hold ALL the detections from the batch 399 | #the batch_dim is flattened 400 | #batch is identified by extra batch column 401 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 402 | seq = batch_ind, image_pred_class 403 | 404 | if not write: 405 | output = torch.cat(seq,1) 406 | write = True 407 | else: 408 | out = torch.cat(seq,1) 409 | output = torch.cat((output,out)) 410 | 411 | return output 412 | -------------------------------------------------------------------------------- /weight_to_onnx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | from __future__ import print_function 4 | from collections import OrderedDict 5 | import hashlib 6 | import os.path 7 | 8 | import onnx 9 | from onnx import helper 10 | from onnx import TensorProto 11 | import numpy as np 12 | 13 | import sys 14 | 15 | class DarkNetParser(object): 16 | """Definition of a parser for DarkNet-based YOLOv3-608 (only tested for this topology).""" 17 | 18 | def __init__(self, supported_layers): 19 | """Initializes a DarkNetParser object. 20 | 21 | Keyword argument: 22 | supported_layers -- a string list of supported layers in DarkNet naming convention, 23 | parameters are only added to the class dictionary if a parsed layer is included. 24 | """ 25 | 26 | # A list of YOLOv3 layers containing dictionaries with all layer 27 | # parameters: 28 | self.layer_configs = OrderedDict() 29 | self.supported_layers = supported_layers 30 | self.layer_counter = 0 31 | 32 | def parse_cfg_file(self, cfg_file_path): 33 | """Takes the yolov3.cfg file and parses it layer by layer, 34 | appending each layer's parameters as a dictionary to layer_configs. 35 | 36 | Keyword argument: 37 | cfg_file_path -- path to the yolov3.cfg file as string 38 | """ 39 | with open(cfg_file_path, 'rb') as cfg_file: 40 | remainder = cfg_file.read() 41 | while remainder is not None: 42 | layer_dict, layer_name, remainder = self._next_layer(remainder) 43 | if layer_dict is not None: 44 | self.layer_configs[layer_name] = layer_dict 45 | return self.layer_configs 46 | 47 | def _next_layer(self, remainder): 48 | """Takes in a string and segments it by looking for DarkNet delimiters. 49 | Returns the layer parameters and the remaining string after the last delimiter. 
50 | Example for the first Conv layer in yolo.cfg ... 51 | 52 | [convolutional] 53 | batch_normalize=1 54 | filters=32 55 | size=3 56 | stride=1 57 | pad=1 58 | activation=leaky 59 | 60 | ... becomes the following layer_dict return value: 61 | {'activation': 'leaky', 'stride': 1, 'pad': 1, 'filters': 32, 62 | 'batch_normalize': 1, 'type': 'convolutional', 'size': 3}. 63 | 64 | '001_convolutional' is returned as layer_name, and all lines that follow in yolo.cfg 65 | are returned as the next remainder. 66 | 67 | Keyword argument: 68 | remainder -- a string with all raw text after the previously parsed layer 69 | """ 70 | remainder = remainder.split('[', 1) 71 | if len(remainder) == 2: 72 | remainder = remainder[1] 73 | else: 74 | return None, None, None 75 | remainder = remainder.split(']', 1) 76 | if len(remainder) == 2: 77 | layer_type, remainder = remainder 78 | else: 79 | return None, None, None 80 | if remainder.replace(' ', '')[0] == '#': 81 | remainder = remainder.split('\n', 1)[1] 82 | 83 | layer_param_block, remainder = remainder.split('\n\n', 1) 84 | layer_param_lines = layer_param_block.split('\n')[1:] 85 | layer_name = str(self.layer_counter).zfill(3) + '_' + layer_type 86 | layer_dict = dict(type=layer_type) 87 | if layer_type in self.supported_layers: 88 | for param_line in layer_param_lines: 89 | if param_line[0] == '#': 90 | continue 91 | param_type, param_value = self._parse_params(param_line) 92 | layer_dict[param_type] = param_value 93 | self.layer_counter += 1 94 | return layer_dict, layer_name, remainder 95 | 96 | def _parse_params(self, param_line): 97 | """Identifies the parameters contained in one of the cfg file and returns 98 | them in the required format for each parameter type, e.g. as a list, an int or a float. 99 | 100 | Keyword argument: 101 | param_line -- one parsed line within a layer block 102 | """ 103 | param_line = param_line.replace(' ', '') 104 | param_type, param_value_raw = param_line.split('=') 105 | param_value = None 106 | if param_type == 'layers': 107 | layer_indexes = list() 108 | for index in param_value_raw.split(','): 109 | layer_indexes.append(int(index)) 110 | param_value = layer_indexes 111 | elif isinstance(param_value_raw, str) and not param_value_raw.isalpha(): 112 | condition_param_value_positive = param_value_raw.isdigit() 113 | condition_param_value_negative = param_value_raw[0] == '-' and \ 114 | param_value_raw[1:].isdigit() 115 | if condition_param_value_positive or condition_param_value_negative: 116 | param_value = int(param_value_raw) 117 | else: 118 | param_value = float(param_value_raw) 119 | else: 120 | param_value = str(param_value_raw) 121 | return param_type, param_value 122 | 123 | 124 | class MajorNodeSpecs(object): 125 | """Helper class used to store the names of ONNX output names, 126 | corresponding to the output of a DarkNet layer and its output channels. 127 | Some DarkNet layers are not created and there is no corresponding ONNX node, 128 | but we still need to track them in order to set up skip connections. 129 | """ 130 | 131 | def __init__(self, name, channels): 132 | """ Initialize a MajorNodeSpecs object. 
133 | 134 | Keyword arguments: 135 | name -- name of the ONNX node 136 | channels -- number of output channels of this node 137 | """ 138 | self.name = name 139 | self.channels = channels 140 | self.created_onnx_node = False 141 | if name is not None and isinstance(channels, int) and channels > 0: 142 | self.created_onnx_node = True 143 | 144 | 145 | class ConvParams(object): 146 | """Helper class to store the hyper parameters of a Conv layer, 147 | including its prefix name in the ONNX graph and the expected dimensions 148 | of weights for convolution, bias, and batch normalization. 149 | 150 | Additionally acts as a wrapper for generating safe names for all 151 | weights, checking on feasible combinations. 152 | """ 153 | 154 | def __init__(self, node_name, batch_normalize, conv_weight_dims): 155 | """Constructor based on the base node name (e.g. 101_convolutional), the batch 156 | normalization setting, and the convolutional weights shape. 157 | 158 | Keyword arguments: 159 | node_name -- base name of this YOLO convolutional layer 160 | batch_normalize -- bool value if batch normalization is used 161 | conv_weight_dims -- the dimensions of this layer's convolutional weights 162 | """ 163 | self.node_name = node_name 164 | self.batch_normalize = batch_normalize 165 | assert len(conv_weight_dims) == 4 166 | self.conv_weight_dims = conv_weight_dims 167 | 168 | def generate_param_name(self, param_category, suffix): 169 | """Generates a name based on two string inputs, 170 | and checks if the combination is valid.""" 171 | assert suffix 172 | assert param_category in ['bn', 'conv'] 173 | assert(suffix in ['scale', 'mean', 'var', 'weights', 'bias']) 174 | if param_category == 'bn': 175 | assert self.batch_normalize 176 | assert suffix in ['scale', 'bias', 'mean', 'var'] 177 | elif param_category == 'conv': 178 | assert suffix in ['weights', 'bias'] 179 | if suffix == 'bias': 180 | assert not self.batch_normalize 181 | param_name = self.node_name + '_' + param_category + '_' + suffix 182 | return param_name 183 | 184 | 185 | class WeightLoader(object): 186 | """Helper class used for loading the serialized weights of a binary file stream 187 | and returning the initializers and the input tensors required for populating 188 | the ONNX graph with weights. 189 | """ 190 | 191 | def __init__(self, weights_file_path): 192 | """Initialized with a path to the YOLOv3 .weights file. 193 | 194 | Keyword argument: 195 | weights_file_path -- path to the weights file. 196 | """ 197 | self.weights_file = self._open_weights_file(weights_file_path) 198 | 199 | def load_conv_weights(self, conv_params): 200 | """Returns the initializers with weights from the weights file and 201 | the input tensors of a convolutional layer for all corresponding ONNX nodes. 
202 | 203 | Keyword argument: 204 | conv_params -- a ConvParams object 205 | """ 206 | initializer = list() 207 | inputs = list() 208 | if conv_params.batch_normalize: 209 | bias_init, bias_input = self._create_param_tensors( 210 | conv_params, 'bn', 'bias') 211 | bn_scale_init, bn_scale_input = self._create_param_tensors( 212 | conv_params, 'bn', 'scale') 213 | bn_mean_init, bn_mean_input = self._create_param_tensors( 214 | conv_params, 'bn', 'mean') 215 | bn_var_init, bn_var_input = self._create_param_tensors( 216 | conv_params, 'bn', 'var') 217 | initializer.extend( 218 | [bn_scale_init, bias_init, bn_mean_init, bn_var_init]) 219 | inputs.extend([bn_scale_input, bias_input, 220 | bn_mean_input, bn_var_input]) 221 | else: 222 | bias_init, bias_input = self._create_param_tensors( 223 | conv_params, 'conv', 'bias') 224 | initializer.append(bias_init) 225 | inputs.append(bias_input) 226 | conv_init, conv_input = self._create_param_tensors( 227 | conv_params, 'conv', 'weights') 228 | initializer.append(conv_init) 229 | inputs.append(conv_input) 230 | return initializer, inputs 231 | 232 | def _open_weights_file(self, weights_file_path): 233 | """Opens a YOLOv3 DarkNet file stream and skips the header. 234 | 235 | Keyword argument: 236 | weights_file_path -- path to the weights file. 237 | """ 238 | weights_file = open(weights_file_path, 'rb') 239 | length_header = 5 240 | np.ndarray( 241 | shape=(length_header, ), dtype='int32', buffer=weights_file.read( 242 | length_header * 4)) 243 | return weights_file 244 | 245 | def _create_param_tensors(self, conv_params, param_category, suffix): 246 | """Creates the initializers with weights from the weights file together with 247 | the input tensors. 248 | 249 | Keyword arguments: 250 | conv_params -- a ConvParams object 251 | param_category -- the category of parameters to be created ('bn' or 'conv') 252 | suffix -- a string determining the sub-type of above param_category (e.g., 253 | 'weights' or 'bias') 254 | """ 255 | param_name, param_data, param_data_shape = self._load_one_param_type( 256 | conv_params, param_category, suffix) 257 | 258 | initializer_tensor = helper.make_tensor( 259 | param_name, TensorProto.FLOAT, param_data_shape, param_data) 260 | input_tensor = helper.make_tensor_value_info( 261 | param_name, TensorProto.FLOAT, param_data_shape) 262 | return initializer_tensor, input_tensor 263 | 264 | def _load_one_param_type(self, conv_params, param_category, suffix): 265 | """Deserializes the weights from a file stream in the DarkNet order. 
266 | 267 | Keyword arguments: 268 | conv_params -- a ConvParams object 269 | param_category -- the category of parameters to be created ('bn' or 'conv') 270 | suffix -- a string determining the sub-type of above param_category (e.g., 271 | 'weights' or 'bias') 272 | """ 273 | param_name = conv_params.generate_param_name(param_category, suffix) 274 | channels_out, channels_in, filter_h, filter_w = conv_params.conv_weight_dims 275 | if param_category == 'bn': 276 | param_shape = [channels_out] 277 | elif param_category == 'conv': 278 | if suffix == 'weights': 279 | param_shape = [channels_out, channels_in, filter_h, filter_w] 280 | elif suffix == 'bias': 281 | param_shape = [channels_out] 282 | param_size = np.product(np.array(param_shape)) 283 | param_data = np.ndarray( 284 | shape=param_shape, 285 | dtype='float32', 286 | buffer=self.weights_file.read(param_size * 4)) 287 | param_data = param_data.flatten().astype(float) 288 | return param_name, param_data, param_shape 289 | 290 | 291 | class GraphBuilderONNX(object): 292 | """Class for creating an ONNX graph from a previously generated list of layer dictionaries.""" 293 | 294 | def __init__(self, output_tensors): 295 | """Initialize with all DarkNet default parameters used creating YOLOv3, 296 | and specify the output tensors as an OrderedDict for their output dimensions 297 | with their names as keys. 298 | 299 | Keyword argument: 300 | output_tensors -- the output tensors as an OrderedDict containing the keys' 301 | output dimensions 302 | """ 303 | self.output_tensors = output_tensors 304 | self._nodes = list() 305 | self.graph_def = None 306 | self.input_tensor = None 307 | self.epsilon_bn = 1e-5 308 | self.momentum_bn = 0.99 309 | self.alpha_lrelu = 0.1 310 | self.param_dict = OrderedDict() 311 | self.major_node_specs = list() 312 | self.batch_size = 1 313 | 314 | def build_onnx_graph( 315 | self, 316 | layer_configs, 317 | weights_file_path, 318 | verbose=True): 319 | """Iterate over all layer configs (parsed from the DarkNet representation 320 | of YOLOv3-608), create an ONNX graph, populate it with weights from the weights 321 | file and return the graph definition. 
322 | 323 | Keyword arguments: 324 | layer_configs -- an OrderedDict object with all parsed layers' configurations 325 | weights_file_path -- location of the weights file 326 | verbose -- toggles if the graph is printed after creation (default: True) 327 | """ 328 | for layer_name in layer_configs.keys(): 329 | layer_dict = layer_configs[layer_name] 330 | major_node_specs = self._make_onnx_node(layer_name, layer_dict) 331 | if major_node_specs.name is not None: 332 | self.major_node_specs.append(major_node_specs) 333 | outputs = list() 334 | for tensor_name in self.output_tensors.keys(): 335 | output_dims = [self.batch_size, ] + \ 336 | self.output_tensors[tensor_name] 337 | output_tensor = helper.make_tensor_value_info( 338 | tensor_name, TensorProto.FLOAT, output_dims) 339 | outputs.append(output_tensor) 340 | inputs = [self.input_tensor] 341 | weight_loader = WeightLoader(weights_file_path) 342 | initializer = list() 343 | for layer_name in self.param_dict.keys(): 344 | _, layer_type = layer_name.split('_', 1) 345 | conv_params = self.param_dict[layer_name] 346 | assert layer_type == 'convolutional' 347 | initializer_layer, inputs_layer = weight_loader.load_conv_weights( 348 | conv_params) 349 | initializer.extend(initializer_layer) 350 | inputs.extend(inputs_layer) 351 | del weight_loader 352 | self.graph_def = helper.make_graph( 353 | nodes=self._nodes, 354 | name='YOLOv3-608', 355 | inputs=inputs, 356 | outputs=outputs, 357 | initializer=initializer 358 | ) 359 | if verbose: 360 | print(helper.printable_graph(self.graph_def)) 361 | model_def = helper.make_model(self.graph_def, 362 | producer_name='NVIDIA TensorRT sample') 363 | return model_def 364 | 365 | def _make_onnx_node(self, layer_name, layer_dict): 366 | """Take in a layer parameter dictionary, choose the correct function for 367 | creating an ONNX node and store the information important to graph creation 368 | as a MajorNodeSpec object. 369 | 370 | Keyword arguments: 371 | layer_name -- the layer's name (also the corresponding key in layer_configs) 372 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 373 | """ 374 | layer_type = layer_dict['type'] 375 | if self.input_tensor is None: 376 | if layer_type == 'net': 377 | major_node_output_name, major_node_output_channels = self._make_input_tensor( 378 | layer_name, layer_dict) 379 | major_node_specs = MajorNodeSpecs(major_node_output_name, 380 | major_node_output_channels) 381 | else: 382 | raise ValueError('The first node has to be of type "net".') 383 | else: 384 | node_creators = dict() 385 | node_creators['convolutional'] = self._make_conv_node 386 | node_creators['shortcut'] = self._make_shortcut_node 387 | node_creators['route'] = self._make_route_node 388 | node_creators['upsample'] = self._make_upsample_node 389 | 390 | if layer_type in node_creators.keys(): 391 | major_node_output_name, major_node_output_channels = \ 392 | node_creators[layer_type](layer_name, layer_dict) 393 | major_node_specs = MajorNodeSpecs(major_node_output_name, 394 | major_node_output_channels) 395 | else: 396 | print( 397 | 'Layer of type %s not supported, skipping ONNX node generation.' % 398 | layer_type) 399 | major_node_specs = MajorNodeSpecs(layer_name, 400 | None) 401 | return major_node_specs 402 | 403 | def _make_input_tensor(self, layer_name, layer_dict): 404 | """Create an ONNX input tensor from a 'net' layer and store the batch size. 
405 | 406 | Keyword arguments: 407 | layer_name -- the layer's name (also the corresponding key in layer_configs) 408 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 409 | """ 410 | batch_size = layer_dict['batch'] 411 | channels = layer_dict['channels'] 412 | height = layer_dict['height'] 413 | width = layer_dict['width'] 414 | self.batch_size = batch_size 415 | input_tensor = helper.make_tensor_value_info( 416 | str(layer_name), TensorProto.FLOAT, [ 417 | batch_size, channels, height, width]) 418 | self.input_tensor = input_tensor 419 | return layer_name, channels 420 | 421 | def _get_previous_node_specs(self, target_index=-1): 422 | """Get a previously generated ONNX node (skip those that were not generated). 423 | Target index can be passed for jumping to a specific index. 424 | 425 | Keyword arguments: 426 | target_index -- optional for jumping to a specific index (default: -1 for jumping 427 | to previous element) 428 | """ 429 | previous_node = None 430 | for node in self.major_node_specs[target_index::-1]: 431 | if node.created_onnx_node: 432 | previous_node = node 433 | break 434 | assert previous_node is not None 435 | return previous_node 436 | 437 | def _make_conv_node(self, layer_name, layer_dict): 438 | """Create an ONNX Conv node with optional batch normalization and 439 | activation nodes. 440 | 441 | Keyword arguments: 442 | layer_name -- the layer's name (also the corresponding key in layer_configs) 443 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 444 | """ 445 | previous_node_specs = self._get_previous_node_specs() 446 | inputs = [previous_node_specs.name] 447 | previous_channels = previous_node_specs.channels 448 | kernel_size = layer_dict['size'] 449 | stride = layer_dict['stride'] 450 | filters = layer_dict['filters'] 451 | batch_normalize = False 452 | if 'batch_normalize' in layer_dict.keys( 453 | ) and layer_dict['batch_normalize'] == 1: 454 | batch_normalize = True 455 | 456 | kernel_shape = [kernel_size, kernel_size] 457 | weights_shape = [filters, previous_channels] + kernel_shape 458 | conv_params = ConvParams(layer_name, batch_normalize, weights_shape) 459 | 460 | strides = [stride, stride] 461 | dilations = [1, 1] 462 | weights_name = conv_params.generate_param_name('conv', 'weights') 463 | inputs.append(weights_name) 464 | if not batch_normalize: 465 | bias_name = conv_params.generate_param_name('conv', 'bias') 466 | inputs.append(bias_name) 467 | 468 | conv_node = helper.make_node( 469 | 'Conv', 470 | inputs=inputs, 471 | outputs=[layer_name], 472 | kernel_shape=kernel_shape, 473 | strides=strides, 474 | auto_pad='SAME_LOWER', 475 | dilations=dilations, 476 | name=layer_name 477 | ) 478 | self._nodes.append(conv_node) 479 | inputs = [layer_name] 480 | layer_name_output = layer_name 481 | 482 | if batch_normalize: 483 | layer_name_bn = layer_name + '_bn' 484 | bn_param_suffixes = ['scale', 'bias', 'mean', 'var'] 485 | for suffix in bn_param_suffixes: 486 | bn_param_name = conv_params.generate_param_name('bn', suffix) 487 | inputs.append(bn_param_name) 488 | batchnorm_node = helper.make_node( 489 | 'BatchNormalization', 490 | inputs=inputs, 491 | outputs=[layer_name_bn], 492 | epsilon=self.epsilon_bn, 493 | momentum=self.momentum_bn, 494 | name=layer_name_bn 495 | ) 496 | self._nodes.append(batchnorm_node) 497 | inputs = [layer_name_bn] 498 | layer_name_output = layer_name_bn 499 | 500 | if layer_dict['activation'] == 'leaky': 501 | layer_name_lrelu = layer_name + '_lrelu' 502 | 503 | lrelu_node = 
helper.make_node( 504 | 'LeakyRelu', 505 | inputs=inputs, 506 | outputs=[layer_name_lrelu], 507 | name=layer_name_lrelu, 508 | alpha=self.alpha_lrelu 509 | ) 510 | self._nodes.append(lrelu_node) 511 | inputs = [layer_name_lrelu] 512 | layer_name_output = layer_name_lrelu 513 | elif layer_dict['activation'] == 'linear': 514 | pass 515 | else: 516 | print('Activation not supported.') 517 | 518 | self.param_dict[layer_name] = conv_params 519 | return layer_name_output, filters 520 | 521 | def _make_shortcut_node(self, layer_name, layer_dict): 522 | """Create an ONNX Add node with the shortcut properties from 523 | the DarkNet-based graph. 524 | 525 | Keyword arguments: 526 | layer_name -- the layer's name (also the corresponding key in layer_configs) 527 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 528 | """ 529 | shortcut_index = layer_dict['from'] 530 | activation = layer_dict['activation'] 531 | assert activation == 'linear' 532 | 533 | first_node_specs = self._get_previous_node_specs() 534 | second_node_specs = self._get_previous_node_specs( 535 | target_index=shortcut_index) 536 | assert first_node_specs.channels == second_node_specs.channels 537 | channels = first_node_specs.channels 538 | inputs = [first_node_specs.name, second_node_specs.name] 539 | shortcut_node = helper.make_node( 540 | 'Add', 541 | inputs=inputs, 542 | outputs=[layer_name], 543 | name=layer_name, 544 | ) 545 | self._nodes.append(shortcut_node) 546 | return layer_name, channels 547 | 548 | def _make_route_node(self, layer_name, layer_dict): 549 | """If the 'layers' parameter from the DarkNet configuration is only one index, continue 550 | node creation at the indicated (negative) index. Otherwise, create an ONNX Concat node 551 | with the route properties from the DarkNet-based graph. 552 | 553 | Keyword arguments: 554 | layer_name -- the layer's name (also the corresponding key in layer_configs) 555 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 556 | """ 557 | route_node_indexes = layer_dict['layers'] 558 | if len(route_node_indexes) == 1: 559 | split_index = route_node_indexes[0] 560 | assert split_index < 0 561 | # Increment by one because we skipped the YOLO layer: 562 | split_index += 1 563 | self.major_node_specs = self.major_node_specs[:split_index] 564 | layer_name = None 565 | channels = None 566 | else: 567 | inputs = list() 568 | channels = 0 569 | for index in route_node_indexes: 570 | if index > 0: 571 | # Increment by one because we count the input as a node (DarkNet 572 | # does not) 573 | index += 1 574 | route_node_specs = self._get_previous_node_specs( 575 | target_index=index) 576 | inputs.append(route_node_specs.name) 577 | channels += route_node_specs.channels 578 | assert inputs 579 | assert channels > 0 580 | 581 | route_node = helper.make_node( 582 | 'Concat', 583 | axis=1, 584 | inputs=inputs, 585 | outputs=[layer_name], 586 | name=layer_name, 587 | ) 588 | self._nodes.append(route_node) 589 | return layer_name, channels 590 | 591 | def _make_upsample_node(self, layer_name, layer_dict): 592 | """Create an ONNX Upsample node with the properties from 593 | the DarkNet-based graph. 
594 | 595 | Keyword arguments: 596 | layer_name -- the layer's name (also the corresponding key in layer_configs) 597 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 598 | """ 599 | upsample_factor = float(layer_dict['stride']) 600 | previous_node_specs = self._get_previous_node_specs() 601 | inputs = [previous_node_specs.name] 602 | channels = previous_node_specs.channels 603 | assert channels > 0 604 | upsample_node = helper.make_node( 605 | 'Upsample', 606 | mode='nearest', 607 | # For ONNX versions <0.7.0, Upsample nodes accept different parameters than 'scales': 608 | scales=[1.0, 1.0, upsample_factor, upsample_factor], 609 | inputs=inputs, 610 | outputs=[layer_name], 611 | name=layer_name, 612 | ) 613 | self._nodes.append(upsample_node) 614 | return layer_name, channels 615 | 616 | def main(): 617 | """Run the DarkNet-to-ONNX conversion for YOLOv3-608.""" 618 | # Have to use python 2 due to hashlib compatibility 619 | if sys.version_info[0] > 2: 620 | raise Exception("This script is only compatible with python2, please re-run it \ 621 | with python2. The rest of this sample can be run with either version of python") 622 | 623 | cfg_file_path = "yolov3-608.cfg" 624 | 625 | # These are the only layers DarkNetParser will extract parameters from. The three layers of 626 | # type 'yolo' are not parsed in detail because they are included in the post-processing later: 627 | supported_layers = ['net', 'convolutional', 'shortcut', 'route', 'upsample'] 628 | 629 | # Create a DarkNetParser object, and then use it to generate an OrderedDict with all 630 | # layers' configs from the cfg file: 631 | parser = DarkNetParser(supported_layers) 632 | layer_configs = parser.parse_cfg_file(cfg_file_path) 633 | # We do not need the parser anymore once we have layer_configs: 634 | del parser 635 | 636 | # In the above layer_configs, there are three outputs whose shapes we need to know 637 | # (in CHW format): 638 | output_tensor_dims = OrderedDict() 639 | #yolo-v3(608*608) 640 | output_tensor_dims['082_convolutional'] = [255, 19, 19] 641 | output_tensor_dims['094_convolutional'] = [255, 38, 38] 642 | output_tensor_dims['106_convolutional'] = [255, 76, 76] 643 | #yolo-v3(416*416) 644 | # output_tensor_dims['082_convolutional'] = [255, 13, 13] 645 | # output_tensor_dims['094_convolutional'] = [255, 26, 26] 646 | # output_tensor_dims['106_convolutional'] = [255, 52, 52] 647 | 648 | # Create a GraphBuilderONNX object with the known output tensor dimensions: 649 | builder = GraphBuilderONNX(output_tensor_dims) 650 | 651 | weights_file_path = "yolov3-608.weights" 652 | 653 | # Now generate an ONNX graph with weights from the previously parsed layer configurations 654 | # and the weights file: 655 | yolov3_model_def = builder.build_onnx_graph( 656 | layer_configs=layer_configs, 657 | weights_file_path=weights_file_path, 658 | verbose=True) 659 | # Once we have the model definition, we do not need the builder anymore: 660 | del builder 661 | 662 | # Perform a sanity check on the ONNX model definition: 663 | onnx.checker.check_model(yolov3_model_def) 664 | 665 | # Serialize the generated ONNX graph to this file: 666 | output_file_path = 'yolov3-608.onnx' 667 | onnx.save(yolov3_model_def, output_file_path) 668 | 669 | if __name__ == '__main__': 670 | main() --------------------------------------------------------------------------------
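
The `util.py` code above (`predict_transform*`, `write_results*`, `dynamic_write_results`) is the YOLOv3 post-processing: decode the raw feature maps into boxes, threshold on objectness, then run per-class NMS (`dynamic_write_results` simply retries with a slightly lower NMS threshold when more than 100 boxes survive). Below is a minimal usage sketch, not part of the repo: it assumes `util.py` and its `bbox_iou` dependency (`bbox.py`) import cleanly, uses a random tensor in place of a real decoded prediction, and note that `write_results` as written keeps only class index 0 (person).

```python
# Minimal sketch (not from the repo): drive util.py's write_results on a dummy
# tensor shaped like a decoded YOLOv3-608 prediction.
import torch

from util import write_results  # assumes bbox.py (bbox_iou) is also importable

num_classes = 80
num_boxes = (19 * 19 + 38 * 38 + 76 * 76) * 3   # 22743 boxes for a 608x608 input
# Columns: [center_x, center_y, w, h, objectness, 80 class scores]
prediction = torch.rand(1, num_boxes, 5 + num_classes)

dets = write_results(prediction, confidence=0.5, num_classes=num_classes,
                     nms=True, nms_conf=0.4)
if isinstance(dets, int):
    print("no detections above the confidence threshold")
else:
    # Rows: [batch_idx, x_min, y_min, x_max, y_max, objectness, class_score, class_idx]
    print(dets.shape)
```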
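
The shapes hard-coded in `output_tensor_dims` follow directly from the YOLOv3 head: each of the three output convolutions predicts 3 anchors × (4 box offsets + 1 objectness + 80 class scores) = 255 channels, on grids of input_size / 32, 16 and 8. A quick arithmetic check for the 608×608 configuration and the commented-out 416×416 one:

```python
# Check the registered output shapes against the YOLOv3 head layout.
num_anchors_per_scale = 3
num_classes = 80
channels = num_anchors_per_scale * (5 + num_classes)        # 255
for inp_dim in (608, 416):
    grids = [inp_dim // stride for stride in (32, 16, 8)]
    print(inp_dim, channels, grids)
# 608 -> 255 [19, 38, 76]
# 416 -> 255 [13, 26, 52]
```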
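
`ConvParams.generate_param_name` defines the naming convention that ties each DarkNet weight blob to an ONNX initializer (`<layer>_<category>_<suffix>`), with assertions rejecting combinations that cannot exist, such as a separate conv bias on a batch-normalized layer. A tiny illustration; the layer name and weight shape below are made up:

```python
# Illustrative only: the layer name and weight shape are made up.
from weight_to_onnx import ConvParams

params = ConvParams('001_convolutional', batch_normalize=True,
                    conv_weight_dims=(32, 3, 3, 3))
print(params.generate_param_name('conv', 'weights'))  # 001_convolutional_conv_weights
print(params.generate_param_name('bn', 'scale'))      # 001_convolutional_bn_scale
# generate_param_name('conv', 'bias') would raise an AssertionError here,
# because a batch-normalized DarkNet conv layer carries no separate bias.
```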
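
`main()` already runs `onnx.checker.check_model` before serializing, but it can be useful to re-open the saved **yolov3-608.onnx** and confirm that the three outputs registered in `output_tensor_dims` made it into the graph. A small sketch using the standard `onnx` Python API; the leading output dimension is the batch size read from the cfg's `[net]` section, so it is not necessarily 1:

```python
# Sketch: reload the saved ONNX model and list the graph outputs.
import onnx

model = onnx.load("yolov3-608.onnx")
onnx.checker.check_model(model)   # same sanity check main() performs

for out in model.graph.output:
    dims = [d.dim_value for d in out.type.tensor_type.shape.dim]
    print(out.name, dims)
# Expect 082_convolutional, 094_convolutional and 106_convolutional with 255 channels
# and 19x19, 38x38, 76x76 grids respectively.
```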