├── README.md ├── bbox.py ├── coco_labels.txt ├── common.py ├── data_processing.py ├── images ├── dog.jpg └── person.jpg ├── onnx_to_tensorrt.py ├── util.py ├── yolov3-608.cfg └── yolov3_to_onnx.py /README.md: -------------------------------------------------------------------------------- 1 | # Update on 2019-04-19 2 | - This project has been optimized and upgraded, so: 3 | - If you are seeing this project for the first time, you can jump directly to [this project](https://github.com/Cw-zero/TensorRT_yolo3_module). 4 | - If you run into bugs with this project, you can try [this project](https://github.com/Cw-zero/TensorRT_yolo3_module) instead. 5 | 6 | # Use TensorRT to accelerate YOLOv3 7 | --- 8 | ## 1. How to run this project 9 | - a. Download the weights from [here](https://pjreddie.com/media/files/yolov3.weights) and rename the file to **yolov3-608.weights**. 10 | - b. Run `python yolov3_to_onnx.py`; this produces a file named **yolov3-608.onnx**. 11 | - c. Run `python onnx_to_tensorrt.py` to get the detection results. 12 | 13 | ## 2. Performance comparison 14 | - a. You can download and run [this project](https://github.com/ayooshkathuria/pytorch-yolo-v3), from which our project is derived. Its detection speed is about **100 ms** per image. 15 | 16 | - b. Our project runs at about **62 ms** per image. 17 | 18 | ## 3. Others 19 | - If you are more comfortable with Chinese, you can refer to [this blog](https://www.cnblogs.com/justcoder/), which has more details. 20 | -------------------------------------------------------------------------------- /bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import random 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | def confidence_filter(result, confidence): 10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) 11 | result = result*conf_mask 12 | 13 | return result 14 | 15 | def confidence_filter_cls(result, confidence): 16 | max_scores = torch.max(result[:,:,5:25], 2)[0] 17 | res = torch.cat((result, max_scores),2) 18 | print(res.shape) 19 | 20 | 21 | cond_1 = (res[:,:,4] > confidence).float() 22 | cond_2 = (res[:,:,25] > 0.995).float() 23 | 24 | conf = cond_1 + cond_2 25 | conf = torch.clamp(conf, 0.0, 1.0) 26 | conf = conf.unsqueeze(2) 27 | result = result*conf 28 | return result 29 | 30 | 31 | 32 | def get_abs_coord(box): 33 | box[2], box[3] = abs(box[2]), abs(box[3]) 34 | x1 = (box[0] - box[2]/2) - 1 35 | y1 = (box[1] - box[3]/2) - 1 36 | x2 = (box[0] + box[2]/2) - 1 37 | y2 = (box[1] + box[3]/2) - 1 38 | return x1, y1, x2, y2 39 | 40 | 41 | 42 | def sanity_fix(box): 43 | if (box[0] > box[2]): 44 | box[0], box[2] = box[2], box[0] 45 | 46 | if (box[1] > box[3]): 47 | box[1], box[3] = box[3], box[1] 48 | 49 | return box 50 | 51 | def bbox_iou(box1, box2): 52 | """ 53 | Returns the IoU of two bounding boxes 54 | 55 | 56 | """ 57 | #Get the coordinates of bounding boxes 58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3] 59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3] 60 | 61 | #get the coordinates of the intersection rectangle 62 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 63 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 64 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 65 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 66 | 67 | #Intersection area 68 | 69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, 
torch.zeros(inter_rect_x2.shape).cuda()) 70 | # inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape)) 71 | 72 | #Union Area 73 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) 74 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) 75 | 76 | iou = inter_area / (b1_area + b2_area - inter_area) 77 | 78 | return iou 79 | 80 | 81 | def pred_corner_coord(prediction): 82 | #Get indices of non-zero confidence bboxes 83 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 84 | 85 | box = prediction[ind_nz[0], ind_nz[1]] 86 | 87 | 88 | box_a = box.new(box.shape) 89 | box_a[:,0] = (box[:,0] - box[:,2]/2) 90 | box_a[:,1] = (box[:,1] - box[:,3]/2) 91 | box_a[:,2] = (box[:,0] + box[:,2]/2) 92 | box_a[:,3] = (box[:,1] + box[:,3]/2) 93 | box[:,:4] = box_a[:,:4] 94 | 95 | prediction[ind_nz[0], ind_nz[1]] = box 96 | 97 | return prediction 98 | 99 | 100 | 101 | 102 | def write(x, batches, results, colors, classes): 103 | c1 = tuple(x[1:3].int()) 104 | c2 = tuple(x[3:5].int()) 105 | img = results[int(x[0])] 106 | cls = int(x[-1]) 107 | label = "{0}".format(classes[cls]) 108 | color = random.choice(colors) 109 | cv2.rectangle(img, c1, c2,color, 1) 110 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 111 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 112 | cv2.rectangle(img, c1, c2,color, -1) 113 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 114 | return img 115 | -------------------------------------------------------------------------------- /coco_labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 1993-2018 NVIDIA Corporation. All rights reserved. 3 | # 4 | # NOTICE TO LICENSEE: 5 | # 6 | # This source code and/or documentation ("Licensed Deliverables") are 7 | # subject to NVIDIA intellectual property rights under U.S. and 8 | # international Copyright laws. 
9 | # 10 | # These Licensed Deliverables contained herein is PROPRIETARY and 11 | # CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | # conditions of a form of NVIDIA software license agreement by and 13 | # between NVIDIA and Licensee ("License Agreement") or electronically 14 | # accepted by Licensee. Notwithstanding any terms or conditions to 15 | # the contrary in the License Agreement, reproduction or disclosure 16 | # of the Licensed Deliverables to any third party without the express 17 | # written consent of NVIDIA is prohibited. 18 | # 19 | # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | # LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | # SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | # PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 23 | # NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | # DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | # NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 26 | # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | # LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | # SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | # OF THESE LICENSED DELIVERABLES. 33 | # 34 | # U.S. Government End Users. These Licensed Deliverables are a 35 | # "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 36 | # 1995), consisting of "commercial computer software" and "commercial 37 | # computer software documentation" as such terms are used in 48 38 | # C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | # only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | # 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | # U.S. Government End Users acquire the Licensed Deliverables with 42 | # only those rights set forth herein. 43 | # 44 | # Any use of the Licensed Deliverables in individual and commercial 45 | # software must include, in the user documentation and internal 46 | # comments to the code, the above Disclaimer and U.S. Government End 47 | # Users Notice. 48 | # 49 | 50 | import os 51 | import argparse 52 | import numpy as np 53 | import pycuda.driver as cuda 54 | import tensorrt as trt 55 | 56 | try: 57 | # Sometimes python2 does not understand FileNotFoundError 58 | FileNotFoundError 59 | except NameError: 60 | FileNotFoundError = IOError 61 | 62 | def GiB(val): 63 | return val * 1 << 30 64 | 65 | def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]): 66 | ''' 67 | Parses sample arguments. 68 | Args: 69 | description (str): Description of the sample. 70 | subfolder (str): The subfolder containing data relevant to this sample 71 | find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path. 72 | Returns: 73 | str: Path of data directory. 74 | Raises: 75 | FileNotFoundError 76 | ''' 77 | kDEFAULT_DATA_ROOT = os.path.abspath("/usr/src/tensorrt/data") 78 | 79 | # Standard command-line arguments for all samples. 
80 | parser = argparse.ArgumentParser(description=description) 81 | parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory.") 82 | args, unknown_args = parser.parse_known_args() 83 | 84 | # If data directory is not specified, use the default. 85 | data_root = args.datadir if args.datadir else kDEFAULT_DATA_ROOT 86 | # If the subfolder exists, append it to the path, otherwise use the provided path as-is. 87 | subfolder_path = os.path.join(data_root, subfolder) 88 | if not os.path.exists(subfolder_path): 89 | print("WARNING: " + subfolder_path + " does not exist. Using " + data_root + " instead.") 90 | data_path = subfolder_path if os.path.exists(subfolder_path) else data_root 91 | 92 | # Make sure data directory exists. 93 | if not (os.path.exists(data_path)): 94 | raise FileNotFoundError(data_path + " does not exist. Please provide the correct data path with the -d option.") 95 | 96 | # Find all requested files. 97 | for index, f in enumerate(find_files): 98 | find_files[index] = os.path.abspath(os.path.join(data_path, f)) 99 | if not os.path.exists(find_files[index]): 100 | raise FileNotFoundError(find_files[index] + " does not exist. Please provide the correct data path with the -d option.") 101 | if find_files: 102 | return data_path, find_files 103 | else: 104 | return data_path 105 | 106 | # Simple helper data class that's a little nicer to use than a 2-tuple. 107 | class HostDeviceMem(object): 108 | def __init__(self, host_mem, device_mem): 109 | self.host = host_mem 110 | self.device = device_mem 111 | 112 | def __str__(self): 113 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 114 | 115 | def __repr__(self): 116 | return self.__str__() 117 | 118 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 119 | def allocate_buffers(engine): 120 | inputs = [] 121 | outputs = [] 122 | bindings = [] 123 | stream = cuda.Stream() 124 | for binding in engine: 125 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 126 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 127 | # Allocate host and device buffers 128 | host_mem = cuda.pagelocked_empty(size, dtype) 129 | device_mem = cuda.mem_alloc(host_mem.nbytes) 130 | # Append the device buffer to device bindings. 131 | bindings.append(int(device_mem)) 132 | # Append to the appropriate list. 133 | if engine.binding_is_input(binding): 134 | inputs.append(HostDeviceMem(host_mem, device_mem)) 135 | else: 136 | outputs.append(HostDeviceMem(host_mem, device_mem)) 137 | return inputs, outputs, bindings, stream 138 | 139 | # This function is generalized for multiple inputs/outputs. 140 | # inputs and outputs are expected to be lists of HostDeviceMem objects. 141 | def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): 142 | # Transfer input data to the GPU. 143 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 144 | # Run inference. 145 | context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) 146 | # Transfer predictions back from the GPU. 147 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 148 | # Synchronize the stream 149 | stream.synchronize() 150 | # Return only the host outputs. 
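# A minimal usage sketch of these helpers as comments (variable names are illustrative; see onnx_to_tensorrt.py for the actual call site):
#   inputs, outputs, bindings, stream = allocate_buffers(engine)
#   with engine.create_execution_context() as context:
#       inputs[0].host = preprocessed_image
#       trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)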
151 | return [out.host for out in outputs] 152 | -------------------------------------------------------------------------------- /data_processing.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | from PIL import Image 4 | import numpy as np 5 | import torch 6 | from torch.autograd import Variable 7 | 8 | 9 | # YOLOv3-608 has been trained with these 80 categories from COCO: 10 | # Lin, Tsung-Yi, et al. "Microsoft COCO: Common Objects in Context." 11 | # European Conference on Computer Vision. Springer, Cham, 2014. 12 | 13 | def load_label_categories(label_file_path): 14 | categories = [line.rstrip('\n') for line in open(label_file_path)] 15 | return categories 16 | 17 | LABEL_FILE_PATH = 'coco_labels.txt' 18 | ALL_CATEGORIES = load_label_categories(LABEL_FILE_PATH) 19 | 20 | # Let's make sure that there are 80 classes, as expected for the COCO data set: 21 | CATEGORY_NUM = len(ALL_CATEGORIES) 22 | assert CATEGORY_NUM == 80 23 | 24 | 25 | class PreprocessYOLO(object): 26 | """A simple class for loading images with PIL and reshaping them to the specified 27 | input resolution for YOLOv3-608. 28 | """ 29 | 30 | def __init__(self, yolo_input_resolution): 31 | """Initialize with the input resolution for YOLOv3, which will stay fixed in this sample. 32 | 33 | Keyword arguments: 34 | yolo_input_resolution -- two-dimensional tuple with the target network's (spatial) 35 | input resolution in HW order 36 | """ 37 | self.yolo_input_resolution = yolo_input_resolution 38 | 39 | def process(self, input_image_path): 40 | """Load an image from the specified input path, 41 | and return it together with a pre-processed version required for feeding it into a 42 | YOLOv3 network. 43 | 44 | Keyword arguments: 45 | input_image_path -- string path of the image to be loaded 46 | """ 47 | image_raw, image_resized = self._load_and_resize(input_image_path) 48 | image_preprocessed = self._shuffle_and_normalize(image_resized) 49 | return image_raw, image_preprocessed 50 | 51 | def _load_and_resize(self, input_image_path): 52 | """Load an image from the specified path and resize it to the input resolution. 53 | Return the input image before resizing as a PIL Image (required for visualization), 54 | and the resized image as a NumPy float array. 55 | 56 | Keyword arguments: 57 | input_image_path -- string path of the image to be loaded 58 | """ 59 | 60 | image_raw = Image.open(input_image_path) 61 | # Expecting yolo_input_resolution in (height, width) format, adjusting to PIL 62 | # convention (width, height) in PIL: 63 | new_resolution = ( 64 | self.yolo_input_resolution[1], 65 | self.yolo_input_resolution[0]) 66 | image_resized = image_raw.resize( 67 | new_resolution, resample=Image.BICUBIC) 68 | image_resized = np.array(image_resized, dtype=np.float32, order='C') 69 | return image_raw, image_resized 70 | 71 | def _shuffle_and_normalize(self, image): 72 | """Normalize a NumPy array representing an image to the range [0, 1], and 73 | convert it from HWC format ("channels last") to NCHW format ("channels first" 74 | with leading batch dimension). 
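For example, with a 608x608 input resolution, a (608, 608, 3) HWC array becomes a (1, 3, 608, 608) float32 array with values scaled to [0, 1].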
75 | 76 | Keyword arguments: 77 | image -- image as three-dimensional NumPy float array, in HWC format 78 | """ 79 | image /= 255.0 80 | # HWC to CHW format: 81 | image = np.transpose(image, [2, 0, 1]) 82 | # CHW to NCHW format 83 | image = np.expand_dims(image, axis=0) 84 | # Convert the image to row-major order, also known as "C order": 85 | image = np.array(image, dtype=np.float32, order='C') 86 | return image 87 | 88 | 89 | class PostprocessYOLO(object): 90 | """Class for post-processing the three outputs tensors from YOLOv3-608.""" 91 | 92 | def __init__(self, 93 | yolo_masks, 94 | yolo_anchors, 95 | obj_threshold, 96 | nms_threshold, 97 | yolo_input_resolution): 98 | """Initialize with all values that will be kept when processing several frames. 99 | Assuming 3 outputs of the network in the case of (large) YOLOv3. 100 | 101 | Keyword arguments: 102 | yolo_masks -- a list of 3 three-dimensional tuples for the YOLO masks 103 | yolo_anchors -- a list of 9 two-dimensional tuples for the YOLO anchors 104 | object_threshold -- threshold for object coverage, float value between 0 and 1 105 | nms_threshold -- threshold for non-max suppression algorithm, 106 | float value between 0 and 1 107 | input_resolution_yolo -- two-dimensional tuple with the target network's (spatial) 108 | input resolution in HW order 109 | """ 110 | self.masks = yolo_masks 111 | self.anchors = yolo_anchors 112 | self.object_threshold = obj_threshold 113 | self.nms_threshold = nms_threshold 114 | self.input_resolution_yolo = yolo_input_resolution 115 | 116 | def process(self, outputs): 117 | out_boxes = [] 118 | num_anchors = 3 119 | num_classes = 80 120 | # for output in outputs: 121 | for output, mask in zip(outputs, self.masks): 122 | anchors = [self.anchors[i] for i in mask] 123 | anchors = list(np.reshape(anchors,(6,-1))) 124 | 125 | anchor_step = len(anchors)/num_anchors 126 | 127 | output = torch.from_numpy(output) 128 | if output.dim() == 3: 129 | output = output.unsqueeze(0) 130 | batch = output.size(0) 131 | assert(output.size(1) == (5+num_classes)*num_anchors) 132 | h = output.size(2) 133 | w = output.size(3) 134 | 135 | t0 = time.time() 136 | all_boxes = [] #only output,not outputs 137 | output = output.view(batch*num_anchors, 5+num_classes, h*w).transpose(0,1).contiguous().view(5+num_classes, batch*num_anchors*h*w) 138 | 139 | #use CPU, so GPU more faster? 140 | grid_x = torch.linspace(0, w-1, w).repeat(h,1).repeat(batch*num_anchors, 1, 1).view(batch*num_anchors*h*w).type_as(output) #cuda() 141 | grid_y = torch.linspace(0, h-1, h).repeat(w,1).t().repeat(batch*num_anchors, 1, 1).view(batch*num_anchors*h*w).type_as(output) #cuda() 142 | xs = torch.sigmoid(output[0]) + grid_x 143 | ys = torch.sigmoid(output[1]) + grid_y 144 | 145 | #use CPU, so GPU more faster? 
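# The index_select/repeat calls below broadcast each anchor's width and height to every grid cell, so ws and hs can be computed element-wise over all batch*num_anchors*h*w predictions: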
146 | anchor_w = torch.Tensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([0])) 147 | anchor_h = torch.Tensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([1])) 148 | anchor_w = anchor_w.repeat(batch, 1).repeat(1, 1, h*w).view(batch*num_anchors*h*w).type_as(output) #cuda() 149 | anchor_h = anchor_h.repeat(batch, 1).repeat(1, 1, h*w).view(batch*num_anchors*h*w).type_as(output) #cuda() 150 | ws = torch.exp(output[2]) * anchor_w 151 | hs = torch.exp(output[3]) * anchor_h 152 | 153 | det_confs = torch.sigmoid(output[4]) 154 | cls_confs = torch.nn.Softmax()(Variable(output[5:5+num_classes].transpose(0,1))).data 155 | cls_max_confs, cls_max_ids = torch.max(cls_confs, 1) 156 | cls_max_confs = cls_max_confs.view(-1) 157 | cls_max_ids = cls_max_ids.view(-1) 158 | t1 = time.time() 159 | 160 | sz_hw = h*w 161 | sz_hwa = sz_hw*num_anchors 162 | t2 = time.time() 163 | for b in range(batch): 164 | boxes = [] 165 | for cy in range(h): 166 | for cx in range(w): 167 | for i in range(num_anchors): 168 | ind = b*sz_hwa + i*sz_hw + cy*w + cx 169 | det_conf = det_confs[ind] 170 | conf = det_confs[ind] 171 | 172 | if conf > self.object_threshold: 173 | bcx = xs[ind] 174 | bcy = ys[ind] 175 | bw = ws[ind] 176 | bh = hs[ind] 177 | cls_max_conf = cls_max_confs[ind] 178 | cls_max_id = cls_max_ids[ind] 179 | box = [bcx/w, bcy/h, bw/w, bh/h, det_conf, cls_max_conf, cls_max_id] 180 | boxes.append(box) 181 | all_boxes.append(boxes) 182 | t3 = time.time() 183 | out_boxes.append(all_boxes) 184 | 185 | 186 | if True: 187 | print('---------------------------------') 188 | print('matrix computation : %f' % (t1-t0)) 189 | print(' gpu to cpu : %f' % (t2-t1)) 190 | print(' boxes filter : %f' % (t3-t2)) 191 | # print(' boxes filter : %f' % (t3-t0)) 192 | print('---------------------------------') 193 | return out_boxes 194 | 195 | 196 | def process1(self, outputs, resolution_raw): 197 | """Take the YOLOv3 outputs generated from a TensorRT forward pass, post-process them 198 | and return a list of bounding boxes for detected object together with their category 199 | and their confidences in separate lists. 200 | 201 | Keyword arguments: 202 | outputs -- outputs from a TensorRT engine in NCHW format 203 | resolution_raw -- the original spatial resolution from the input PIL image in WH order 204 | """ 205 | outputs_reshaped = list() 206 | for output in outputs: 207 | outputs_reshaped.append(self._reshape_output(output)) 208 | 209 | start = time.time() 210 | boxes, categories, confidences = self._process_yolo_output(outputs_reshaped, resolution_raw) 211 | end = time.time() 212 | print("_process_yolo_output") 213 | print(end - start) 214 | 215 | return boxes, categories, confidences 216 | 217 | def _reshape_output(self, output): 218 | """Reshape a TensorRT output from NCHW to NHWC format (with expected C=255), 219 | and then return it in (height,width,3,85) dimensionality after further reshaping. 
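For example, the coarsest YOLOv3-608 output of shape (1, 255, 19, 19) becomes (19, 19, 3, 85), i.e. 3 anchors times (4 box coordinates + 1 objectness score + CATEGORY_NUM=80 class scores) per grid cell.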
220 | 221 | Keyword argument: 222 | output -- an output from a TensorRT engine after inference 223 | """ 224 | output = np.transpose(output, [0, 2, 3, 1]) 225 | _, height, width, _ = output.shape 226 | dim1, dim2 = height, width 227 | dim3 = 3 228 | # There are CATEGORY_NUM=80 object categories: 229 | dim4 = (4 + 1 + CATEGORY_NUM) 230 | return np.reshape(output, (dim1, dim2, dim3, dim4)) 231 | 232 | def _process_yolo_output(self, outputs_reshaped, resolution_raw): 233 | """Take in a list of three reshaped YOLO outputs in (height,width,3,85) shape and return 234 | return a list of bounding boxes for detected object together with their category and their 235 | confidences in separate lists. 236 | 237 | Keyword arguments: 238 | outputs_reshaped -- list of three reshaped YOLO outputs as NumPy arrays 239 | with shape (height,width,3,85) 240 | resolution_raw -- the original spatial resolution from the input PIL image in WH order 241 | """ 242 | 243 | # E.g. in YOLOv3-608, there are three output tensors, which we associate with their 244 | # respective masks. Then we iterate through all output-mask pairs and generate candidates 245 | # for bounding boxes, their corresponding category predictions and their confidences: 246 | boxes, categories, confidences = list(), list(), list() 247 | 248 | for output, mask in zip(outputs_reshaped, self.masks): 249 | start = time.time() 250 | box, category, confidence = self._process_feats(output, mask) 251 | end = time.time() 252 | print("_process_feats") 253 | print(end - start) 254 | box, category, confidence = self._filter_boxes(box, category, confidence) 255 | boxes.append(box) 256 | categories.append(category) 257 | confidences.append(confidence) 258 | 259 | 260 | boxes = np.concatenate(boxes) 261 | categories = np.concatenate(categories) 262 | confidences = np.concatenate(confidences) 263 | 264 | # Scale boxes back to original image shape: 265 | width, height = resolution_raw 266 | image_dims = [width, height, width, height] 267 | boxes = boxes * image_dims 268 | 269 | # Using the candidates from the previous (loop) step, we apply the non-max suppression 270 | # algorithm that clusters adjacent bounding boxes to a single bounding box: 271 | nms_boxes, nms_categories, nscores = list(), list(), list() 272 | for category in set(categories): 273 | idxs = np.where(categories == category) 274 | box = boxes[idxs] 275 | category = categories[idxs] 276 | confidence = confidences[idxs] 277 | 278 | keep = self._nms_boxes(box, confidence) 279 | 280 | nms_boxes.append(box[keep]) 281 | nms_categories.append(category[keep]) 282 | nscores.append(confidence[keep]) 283 | 284 | if not nms_categories and not nscores: 285 | return None, None, None 286 | 287 | boxes = np.concatenate(nms_boxes) 288 | categories = np.concatenate(nms_categories) 289 | confidences = np.concatenate(nscores) 290 | 291 | return boxes, categories, confidences 292 | 293 | def _process_feats(self, output_reshaped, mask): 294 | """Take in a reshaped YOLO output in height,width,3,85 format together with its 295 | corresponding YOLO mask and return the detected bounding boxes, the confidence, 296 | and the class probability in each cell/pixel. 
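The outputs are decoded as in the YOLOv3 paper: box centers are sigmoid(t_x), sigmoid(t_y) plus the grid-cell offsets (normalized by the grid size), and box sizes are the anchor dimensions scaled by exp(t_w), exp(t_h) (normalized by the network input resolution).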
297 | 298 | Keyword arguments: 299 | output_reshaped -- reshaped YOLO output as NumPy arrays with shape (height,width,3,85) 300 | mask -- 2-dimensional tuple with mask specification for this output 301 | """ 302 | 303 | # Two in-line functions required for calculating the bounding box 304 | # descriptors: 305 | def sigmoid(value): 306 | """Return the sigmoid of the input.""" 307 | return 1.0 / (1.0 + math.exp(-value)) 308 | 309 | def exponential(value): 310 | """Return the exponential of the input.""" 311 | return math.exp(value) 312 | 313 | # Vectorized calculation of above two functions: 314 | sigmoid_v = np.vectorize(sigmoid) 315 | exponential_v = np.vectorize(exponential) 316 | 317 | grid_h, grid_w, _, _ = output_reshaped.shape 318 | 319 | anchors = [self.anchors[i] for i in mask] 320 | 321 | # Reshape to N, height, width, num_anchors, box_params: 322 | anchors_tensor = np.reshape(anchors, [1, 1, len(anchors), 2]) 323 | box_xy = sigmoid_v(output_reshaped[..., :2]) 324 | box_wh = exponential_v(output_reshaped[..., 2:4]) * anchors_tensor 325 | box_confidence = sigmoid_v(output_reshaped[..., 4]) 326 | 327 | box_confidence = np.expand_dims(box_confidence, axis=-1) 328 | box_class_probs = sigmoid_v(output_reshaped[..., 5:]) 329 | 330 | col = np.tile(np.arange(0, grid_w), grid_w).reshape(-1, grid_w) 331 | row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_h) 332 | 333 | col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) 334 | row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) 335 | grid = np.concatenate((col, row), axis=-1) 336 | 337 | box_xy += grid 338 | box_xy /= (grid_w, grid_h) 339 | box_wh /= self.input_resolution_yolo 340 | box_xy -= (box_wh / 2.) 341 | boxes = np.concatenate((box_xy, box_wh), axis=-1) 342 | 343 | # boxes: centroids, box_confidence: confidence level, box_class_probs: 344 | # class confidence 345 | return boxes, box_confidence, box_class_probs 346 | 347 | def _filter_boxes(self, boxes, box_confidences, box_class_probs): 348 | """Take in the unfiltered bounding box descriptors and discard each cell 349 | whose score is lower than the object threshold set during class initialization. 350 | 351 | Keyword arguments: 352 | boxes -- bounding box coordinates with shape (height,width,3,4); 4 for 353 | x,y,height,width coordinates of the boxes 354 | box_confidences -- bounding box confidences with shape (height,width,3,1); 1 for as 355 | confidence scalar per element 356 | box_class_probs -- class probabilities with shape (height,width,3,CATEGORY_NUM) 357 | 358 | """ 359 | box_scores = box_confidences * box_class_probs 360 | box_classes = np.argmax(box_scores, axis=-1) 361 | box_class_scores = np.max(box_scores, axis=-1) 362 | pos = np.where(box_class_scores >= self.object_threshold) 363 | 364 | boxes = boxes[pos] 365 | classes = box_classes[pos] 366 | scores = box_class_scores[pos] 367 | 368 | return boxes, classes, scores 369 | 370 | def _nms_boxes(self, boxes, box_confidences): 371 | """Apply the Non-Maximum Suppression (NMS) algorithm on the bounding boxes with their 372 | confidence scores and return an array with the indexes of the bounding boxes we want to 373 | keep (and display later). 
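Boxes are processed greedily in descending order of confidence; any remaining box whose IoU with the currently kept box exceeds nms_threshold is discarded.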
374 | 375 | Keyword arguments: 376 | boxes -- a NumPy array containing N bounding-box coordinates that survived filtering, 377 | with shape (N,4); 4 for x,y,height,width coordinates of the boxes 378 | box_confidences -- a Numpy array containing the corresponding confidences with shape N 379 | """ 380 | x_coord = boxes[:, 0] 381 | y_coord = boxes[:, 1] 382 | width = boxes[:, 2] 383 | height = boxes[:, 3] 384 | 385 | areas = width * height 386 | ordered = box_confidences.argsort()[::-1] 387 | 388 | keep = list() 389 | while ordered.size > 0: 390 | # Index of the current element: 391 | i = ordered[0] 392 | keep.append(i) 393 | xx1 = np.maximum(x_coord[i], x_coord[ordered[1:]]) 394 | yy1 = np.maximum(y_coord[i], y_coord[ordered[1:]]) 395 | xx2 = np.minimum(x_coord[i] + width[i], x_coord[ordered[1:]] + width[ordered[1:]]) 396 | yy2 = np.minimum(y_coord[i] + height[i], y_coord[ordered[1:]] + height[ordered[1:]]) 397 | 398 | width1 = np.maximum(0.0, xx2 - xx1 + 1) 399 | height1 = np.maximum(0.0, yy2 - yy1 + 1) 400 | intersection = width1 * height1 401 | union = (areas[i] + areas[ordered[1:]] - intersection) 402 | 403 | # Compute the Intersection over Union (IoU) score: 404 | iou = intersection / union 405 | 406 | # The goal of the NMS algorithm is to reduce the number of adjacent bounding-box 407 | # candidates to a minimum. In this step, we keep only those elements whose overlap 408 | # with the current bounding box is lower than the threshold: 409 | indexes = np.where(iou <= self.nms_threshold)[0] 410 | ordered = ordered[indexes + 1] 411 | 412 | keep = np.array(keep) 413 | return keep 414 | -------------------------------------------------------------------------------- /images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cw-zero/TensorRT_yolo3/bf904fe07498ad0f75aa10e6ef3769dbb9b72b69/images/dog.jpg -------------------------------------------------------------------------------- /images/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cw-zero/TensorRT_yolo3/bf904fe07498ad0f75aa10e6ef3769dbb9b72b69/images/person.jpg -------------------------------------------------------------------------------- /onnx_to_tensorrt.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import torch 4 | import numpy as np 5 | import tensorrt as trt 6 | import pycuda.driver as cuda 7 | import pycuda.autoinit 8 | from PIL import ImageDraw 9 | import time 10 | from util import * 11 | 12 | from data_processing import PreprocessYOLO 13 | 14 | import sys, os 15 | sys.path.insert(1, os.path.join(sys.path[0], "..")) 16 | import common 17 | 18 | TRT_LOGGER = trt.Logger() 19 | 20 | def get_engine(onnx_file_path, engine_file_path=""): 21 | """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.""" 22 | def build_engine(): 23 | """Takes an ONNX file and creates a TensorRT engine to run inference with""" 24 | with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser: 25 | builder.max_workspace_size = 1 << 30 # 1GB 26 | builder.max_batch_size = 1 27 | # Parse model file 28 | if not os.path.exists(onnx_file_path): 29 | print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path)) 30 | exit(0) 31 | print('Loading ONNX file from path 
{}...'.format(onnx_file_path)) 32 | with open(onnx_file_path, 'rb') as model: 33 | print('Beginning ONNX file parsing') 34 | parser.parse(model.read()) 35 | print('Completed parsing of ONNX file') 36 | print('Building an engine from file {}; this may take a while...'.format(onnx_file_path)) 37 | engine = builder.build_cuda_engine(network) 38 | print("Completed creating Engine") 39 | with open(engine_file_path, "wb") as f: 40 | f.write(engine.serialize()) 41 | return engine 42 | 43 | if os.path.exists(engine_file_path): 44 | # If a serialized engine exists, use it instead of building an engine. 45 | print("Reading engine from file {}".format(engine_file_path)) 46 | with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 47 | return runtime.deserialize_cuda_engine(f.read()) 48 | else: 49 | return build_engine() 50 | 51 | def main(): 52 | 53 | """Create a TensorRT engine for ONNX-based YOLOv3-608 and run inference.""" 54 | 55 | # Try to load a previously generated YOLOv3-608 network graph in ONNX format: 56 | onnx_file_path = 'yolov3-608.onnx' 57 | engine_file_path = "yolov3-608.trt" 58 | input_image_path = "./images/b.jpg" 59 | 60 | # Two-dimensional tuple with the target network's (spatial) input resolution in HW ordered 61 | input_resolution_yolov3_HW = (608, 608) 62 | 63 | # Create a pre-processor object by specifying the required input resolution for YOLOv3 64 | preprocessor = PreprocessYOLO(input_resolution_yolov3_HW) 65 | 66 | # Load an image from the specified input path, and return it together with a pre-processed version 67 | image_raw, image = preprocessor.process(input_image_path) 68 | 69 | # Store the shape of the original input image in WH format, we will need it for later 70 | shape_orig_WH = image_raw.size 71 | 72 | # Output shapes expected by the post-processor 73 | output_shapes = [(1, 255, 19, 19), (1, 255, 38, 38), (1, 255, 76, 76)] 74 | # output_shapes = [(1, 255, 13, 13), (1, 255, 26, 26), (1, 255, 52, 52)] 75 | 76 | # Do inference with TensorRT 77 | trt_outputs = [] 78 | a = torch.cuda.FloatTensor() 79 | average_inference_time = 0 80 | average_yolo_time = 0 81 | counter = 10 82 | with get_engine(onnx_file_path, engine_file_path) as engine, engine.create_execution_context() as context: 83 | inputs, outputs, bindings, stream = common.allocate_buffers(engine) 84 | while counter: 85 | # Do inference 86 | print('Running inference on image {}...'.format(input_image_path)) 87 | # Set host input to the image. The common.do_inference function will copy the input to the GPU before executing. 
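# Note: each element of trt_outputs comes back as a flat array; it is reshaped later in the loop to the matching entry of output_shapes, where 255 channels = 3 anchors * (5 + 80 classes).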
88 | inference_start = time.time() 89 | inputs[0].host = image 90 | trt_outputs = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) 91 | inference_end = time.time() 92 | inference_time = inference_end-inference_start 93 | average_inference_time = average_inference_time + inference_time 94 | print('inference time : %f' % (inference_end-inference_start)) 95 | 96 | # Do yolo_layer with pytorch 97 | inp_dim = 608 98 | num_classes = 80 99 | CUDA = True 100 | yolo_anchors = [[(116, 90), (156, 198), (373, 326)], 101 | [(30, 61), (62, 45), (59, 119)], 102 | [(10, 13), (16, 30), (33, 23)]] 103 | write = 0 104 | yolo_start = time.time() 105 | for output, shape, anchors in zip(trt_outputs, output_shapes, yolo_anchors): 106 | output = output.reshape(shape) 107 | trt_output = torch.from_numpy(output).cuda() 108 | trt_output = trt_output.data 109 | trt_output = predict_transform(trt_output, inp_dim, anchors, num_classes, CUDA) 110 | 111 | if type(trt_output) == int: 112 | continue 113 | 114 | if not write: 115 | detections = trt_output 116 | write = 1 117 | 118 | else: 119 | detections = torch.cat((detections, trt_output), 1) 120 | dets = dynamic_write_results(detections, 0.5, num_classes, nms=True, nms_conf=0.45) #0.008 121 | yolo_end = time.time() 122 | yolo_time = yolo_end-yolo_start 123 | average_yolo_time = average_yolo_time + yolo_time 124 | print('yolo time : %f' % (yolo_end-yolo_start)) 125 | print('all time : %f' % (yolo_end-inference_start)) 126 | counter = counter -1 127 | 128 | average_yolo_time = average_yolo_time/10 129 | average_inference_time = average_inference_time/10 130 | print("--------------------------------------------------------") 131 | print('average yolo time : %f' % (average_yolo_time)) 132 | print('average inference time : %f' % (average_inference_time)) 133 | print("--------------------------------------------------------") 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | import cv2 10 | #import matplotlib.pyplot as plt 11 | try: 12 | from bbox import bbox_iou 13 | except ImportError: 14 | from yolo.bbox import bbox_iou 15 | 16 | 17 | def count_parameters(model): 18 | return sum(p.numel() for p in model.parameters()) 19 | 20 | def count_learnable_parameters(model): 21 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 22 | 23 | def convert2cpu(matrix): 24 | if matrix.is_cuda: 25 | return torch.FloatTensor(matrix.size()).copy_(matrix) 26 | else: 27 | return matrix 28 | 29 | def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True): 30 | batch_size = prediction.size(0) 31 | stride = inp_dim // prediction.size(2) 32 | grid_size = inp_dim // stride 33 | bbox_attrs = 5 + num_classes 34 | num_anchors = len(anchors) 35 | 36 | anchors = [(a[0]/stride, a[1]/stride) for a in anchors] 37 | 38 | 39 | 40 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 41 | prediction = prediction.transpose(1,2).contiguous() 42 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 43 | 44 | 45 | #Sigmoid the centre_X, centre_Y. 
and object confidencce 46 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 47 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 48 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 49 | 50 | 51 | 52 | #Add the center offsets 53 | grid_len = np.arange(grid_size) 54 | a,b = np.meshgrid(grid_len, grid_len) 55 | 56 | x_offset = torch.FloatTensor(a).view(-1,1) 57 | y_offset = torch.FloatTensor(b).view(-1,1) 58 | 59 | if CUDA: 60 | x_offset = x_offset.cuda() 61 | y_offset = y_offset.cuda() 62 | 63 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 64 | 65 | prediction[:,:,:2] += x_y_offset 66 | 67 | #log space transform height and the width 68 | anchors = torch.FloatTensor(anchors) 69 | 70 | if CUDA: 71 | anchors = anchors.cuda() 72 | 73 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 74 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 75 | 76 | #Softmax the class scores 77 | prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes])) 78 | 79 | prediction[:,:,:4] *= stride 80 | 81 | 82 | return prediction 83 | 84 | def load_classes(namesfile): 85 | fp = open(namesfile, "r") 86 | names = fp.read().split("\n")[:-1] 87 | return names 88 | 89 | def get_im_dim(im): 90 | im = cv2.imread(im) 91 | w,h = im.shape[1], im.shape[0] 92 | return w,h 93 | 94 | def unique(tensor): 95 | tensor_np = tensor.cpu().numpy() 96 | unique_np = np.unique(tensor_np) 97 | unique_tensor = torch.from_numpy(unique_np) 98 | 99 | tensor_res = tensor.new(unique_tensor.shape) 100 | tensor_res.copy_(unique_tensor) 101 | return tensor_res 102 | 103 | def dynamic_write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4): 104 | prediction_bak = prediction.clone() 105 | dets = write_results(prediction.clone(), confidence, num_classes, nms, nms_conf) 106 | if isinstance(dets, int): 107 | return dets 108 | 109 | if dets.shape[0] > 100: 110 | nms_conf -= 0.05 111 | dets = write_results(prediction_bak.clone(), confidence, num_classes, nms, nms_conf) 112 | 113 | return dets 114 | 115 | 116 | def write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4): 117 | conf_mask = (prediction[:, :, 4] > confidence).float().float().unsqueeze(2) 118 | prediction = prediction * conf_mask 119 | 120 | try: 121 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 122 | except: 123 | return 0 124 | 125 | box_a = prediction.new(prediction.shape) 126 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 127 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 128 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 129 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 130 | prediction[:,:,:4] = box_a[:,:,:4] 131 | 132 | batch_size = prediction.size(0) 133 | 134 | output = prediction.new(1, prediction.size(2) + 1) 135 | write = False 136 | num = 0 137 | for ind in range(batch_size): 138 | #select the image from the batch 139 | image_pred = prediction[ind] 140 | 141 | #Get the class having maximum score, and the index of that class 142 | #Get rid of num_classes softmax scores 143 | #Add the class index and the class score of class having maximum score 144 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 145 | max_conf = max_conf.float().unsqueeze(1) 146 | max_conf_score = max_conf_score.float().unsqueeze(1) 147 | seq = (image_pred[:,:5], max_conf, max_conf_score) 148 | image_pred = torch.cat(seq, 1) 149 | 150 | #Get rid of the zero 
entries 151 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 152 | 153 | image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7) 154 | 155 | #Get the various classes detected in the image 156 | try: 157 | img_classes = unique(image_pred_[:,-1]) 158 | except: 159 | continue 160 | 161 | #WE will do NMS classwise 162 | #print(img_classes) 163 | for cls in img_classes: 164 | # if cls != 0: #0 is the person 165 | # continue 166 | #get the detections with one particular class 167 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1) 168 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 169 | 170 | image_pred_class = image_pred_[class_mask_ind].view(-1,7) 171 | 172 | #sort the detections such that the entry with the maximum objectness 173 | #confidence is at the top 174 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 175 | image_pred_class = image_pred_class[conf_sort_index] 176 | idx = image_pred_class.size(0) 177 | 178 | #if nms has to be done 179 | if nms: 180 | #For each detection 181 | for i in range(idx): 182 | #Get the IOUs of all boxes that come after the one we are looking at 183 | #in the loop 184 | try: 185 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 186 | except ValueError: 187 | break 188 | 189 | except IndexError: 190 | break 191 | 192 | #Zero out all the detections that have IoU > treshhold 193 | iou_mask = (ious < nms_conf).float().unsqueeze(1) 194 | image_pred_class[i+1:] *= iou_mask 195 | 196 | #Remove the non-zero entries 197 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 198 | image_pred_class = image_pred_class[non_zero_ind].view(-1,7) 199 | 200 | #if nms has to be done 201 | # if nms: 202 | # # Perform non-maximum suppression 203 | # max_detections = [] 204 | # while image_pred_class.size(0): 205 | # # Get detection with highest confidence and save as max detection 206 | # max_detections.append(image_pred_class[0].unsqueeze(0)) 207 | # # Stop if we're at the last detection 208 | # if len(image_pred_class) == 1: 209 | # break 210 | # # Get the IOUs for all boxes with lower confidence 211 | # ious = bbox_iou(max_detections[-1], image_pred_class[1:]) 212 | # # Remove detections with IoU >= NMS threshold 213 | # image_pred_class = image_pred_class[1:][ious < nms_conf] 214 | 215 | # image_pred_class = torch.cat(max_detections).data 216 | 217 | 218 | #Concatenate the batch_id of the image to the detection 219 | #this helps us identify which image does the detection correspond to 220 | #We use a linear straucture to hold ALL the detections from the batch 221 | #the batch_dim is flattened 222 | #batch is identified by extra batch column 223 | 224 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 225 | seq = batch_ind, image_pred_class 226 | if not write: 227 | output = torch.cat(seq,1) 228 | write = True 229 | else: 230 | out = torch.cat(seq,1) 231 | output = torch.cat((output,out)) 232 | num += 1 233 | 234 | if not num: 235 | return 0 236 | 237 | return output 238 | 239 | #!/usr/bin/env python3 240 | # -*- coding: utf-8 -*- 241 | """ 242 | Created on Sat Mar 24 00:12:16 2018 243 | 244 | @author: ayooshmac 245 | """ 246 | 247 | def predict_transform_half(prediction, inp_dim, anchors, num_classes, CUDA = True): 248 | batch_size = prediction.size(0) 249 | stride = inp_dim // prediction.size(2) 250 | 251 | bbox_attrs = 5 + num_classes 252 | num_anchors = len(anchors) 253 | grid_size = inp_dim // stride 254 | 255 | 256 | prediction = 
prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 257 | prediction = prediction.transpose(1,2).contiguous() 258 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 259 | 260 | 261 | #Sigmoid the centre_X, centre_Y. and object confidencce 262 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 263 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 264 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 265 | 266 | 267 | #Add the center offsets 268 | grid_len = np.arange(grid_size) 269 | a,b = np.meshgrid(grid_len, grid_len) 270 | 271 | x_offset = torch.FloatTensor(a).view(-1,1) 272 | y_offset = torch.FloatTensor(b).view(-1,1) 273 | 274 | if CUDA: 275 | x_offset = x_offset.cuda().half() 276 | y_offset = y_offset.cuda().half() 277 | 278 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 279 | 280 | prediction[:,:,:2] += x_y_offset 281 | 282 | #log space transform height and the width 283 | anchors = torch.HalfTensor(anchors) 284 | 285 | if CUDA: 286 | anchors = anchors.cuda() 287 | 288 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 289 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 290 | 291 | #Softmax the class scores 292 | prediction[:,:,5: 5 + num_classes] = nn.Softmax(-1)(Variable(prediction[:,:, 5 : 5 + num_classes])).data 293 | 294 | prediction[:,:,:4] *= stride 295 | 296 | 297 | return prediction 298 | 299 | 300 | def write_results_half(prediction, confidence, num_classes, nms = True, nms_conf = 0.4): 301 | conf_mask = (prediction[:,:,4] > confidence).half().unsqueeze(2) 302 | prediction = prediction*conf_mask 303 | 304 | try: 305 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 306 | except: 307 | return 0 308 | 309 | 310 | 311 | box_a = prediction.new(prediction.shape) 312 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 313 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 314 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 315 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 316 | prediction[:,:,:4] = box_a[:,:,:4] 317 | 318 | 319 | 320 | batch_size = prediction.size(0) 321 | 322 | output = prediction.new(1, prediction.size(2) + 1) 323 | write = False 324 | 325 | for ind in range(batch_size): 326 | #select the image from the batch 327 | image_pred = prediction[ind] 328 | 329 | 330 | #Get the class having maximum score, and the index of that class 331 | #Get rid of num_classes softmax scores 332 | #Add the class index and the class score of class having maximum score 333 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 334 | max_conf = max_conf.half().unsqueeze(1) 335 | max_conf_score = max_conf_score.half().unsqueeze(1) 336 | seq = (image_pred[:,:5], max_conf, max_conf_score) 337 | image_pred = torch.cat(seq, 1) 338 | 339 | 340 | #Get rid of the zero entries 341 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 342 | try: 343 | image_pred_ = image_pred[non_zero_ind.squeeze(),:] 344 | except: 345 | continue 346 | 347 | #Get the various classes detected in the image 348 | img_classes = unique(image_pred_[:,-1].long()).half() 349 | 350 | 351 | 352 | 353 | #WE will do NMS classwise 354 | for cls in img_classes: 355 | #get the detections with one particular class 356 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).half().unsqueeze(1) 357 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 358 | 359 | 360 | image_pred_class = 
image_pred_[class_mask_ind] 361 | 362 | 363 | #sort the detections such that the entry with the maximum objectness 364 | #confidence is at the top 365 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 366 | image_pred_class = image_pred_class[conf_sort_index] 367 | idx = image_pred_class.size(0) 368 | 369 | #if nms has to be done 370 | if nms: 371 | #For each detection 372 | for i in range(idx): 373 | #Get the IOUs of all boxes that come after the one we are looking at 374 | #in the loop 375 | try: 376 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 377 | except ValueError: 378 | break 379 | 380 | except IndexError: 381 | break 382 | 383 | #Zero out all the detections that have IoU > treshhold 384 | iou_mask = (ious < nms_conf).half().unsqueeze(1) 385 | image_pred_class[i+1:] *= iou_mask 386 | 387 | #Remove the non-zero entries 388 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 389 | image_pred_class = image_pred_class[non_zero_ind] 390 | 391 | 392 | 393 | #Concatenate the batch_id of the image to the detection 394 | #this helps us identify which image does the detection correspond to 395 | #We use a linear straucture to hold ALL the detections from the batch 396 | #the batch_dim is flattened 397 | #batch is identified by extra batch column 398 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 399 | seq = batch_ind, image_pred_class 400 | 401 | if not write: 402 | output = torch.cat(seq,1) 403 | write = True 404 | else: 405 | out = torch.cat(seq,1) 406 | output = torch.cat((output,out)) 407 | 408 | return output 409 | -------------------------------------------------------------------------------- /yolov3-608.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | 
activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 
330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 
550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 
766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /yolov3_to_onnx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | from __future__ import print_function 4 | from collections import OrderedDict 5 | import hashlib 6 | import os.path 7 | 8 | import wget 9 | 10 | import onnx 11 | from onnx import helper 12 | from onnx import TensorProto 13 | import numpy as np 14 | 15 | import sys 16 | 17 | class DarkNetParser(object): 18 | """Definition of a parser for DarkNet-based YOLOv3-608 (only tested for this topology).""" 19 | 20 | def __init__(self, supported_layers): 21 | """Initializes a DarkNetParser object. 22 | 23 | Keyword argument: 24 | supported_layers -- a string list of supported layers in DarkNet naming convention, 25 | parameters are only added to the class dictionary if a parsed layer is included. 26 | """ 27 | 28 | # A list of YOLOv3 layers containing dictionaries with all layer 29 | # parameters: 30 | self.layer_configs = OrderedDict() 31 | self.supported_layers = supported_layers 32 | self.layer_counter = 0 33 | 34 | def parse_cfg_file(self, cfg_file_path): 35 | """Takes the yolov3.cfg file and parses it layer by layer, 36 | appending each layer's parameters as a dictionary to layer_configs. 37 | 38 | Keyword argument: 39 | cfg_file_path -- path to the yolov3.cfg file as string 40 | """ 41 | with open(cfg_file_path, 'rb') as cfg_file: 42 | remainder = cfg_file.read() 43 | while remainder is not None: 44 | layer_dict, layer_name, remainder = self._next_layer(remainder) 45 | if layer_dict is not None: 46 | self.layer_configs[layer_name] = layer_dict 47 | return self.layer_configs 48 | 49 | def _next_layer(self, remainder): 50 | """Takes in a string and segments it by looking for DarkNet delimiters. 51 | Returns the layer parameters and the remaining string after the last delimiter. 52 | Example for the first Conv layer in yolo.cfg ... 53 | 54 | [convolutional] 55 | batch_normalize=1 56 | filters=32 57 | size=3 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | ... becomes the following layer_dict return value: 63 | {'activation': 'leaky', 'stride': 1, 'pad': 1, 'filters': 32, 64 | 'batch_normalize': 1, 'type': 'convolutional', 'size': 3}. 65 | 66 | '001_convolutional' is returned as layer_name, and all lines that follow in yolo.cfg 67 | are returned as the next remainder. 
68 | 69 | Keyword argument: 70 | remainder -- a string with all raw text after the previously parsed layer 71 | """ 72 | remainder = remainder.split('[', 1) 73 | if len(remainder) == 2: 74 | remainder = remainder[1] 75 | else: 76 | return None, None, None 77 | remainder = remainder.split(']', 1) 78 | if len(remainder) == 2: 79 | layer_type, remainder = remainder 80 | else: 81 | return None, None, None 82 | if remainder.replace(' ', '')[0] == '#': 83 | remainder = remainder.split('\n', 1)[1] 84 | 85 | layer_param_block, remainder = remainder.split('\n\n', 1) 86 | layer_param_lines = layer_param_block.split('\n')[1:] 87 | layer_name = str(self.layer_counter).zfill(3) + '_' + layer_type 88 | layer_dict = dict(type=layer_type) 89 | if layer_type in self.supported_layers: 90 | for param_line in layer_param_lines: 91 | if param_line[0] == '#': 92 | continue 93 | param_type, param_value = self._parse_params(param_line) 94 | layer_dict[param_type] = param_value 95 | self.layer_counter += 1 96 | return layer_dict, layer_name, remainder 97 | 98 | def _parse_params(self, param_line): 99 | """Identifies the parameters contained in one of the cfg file and returns 100 | them in the required format for each parameter type, e.g. as a list, an int or a float. 101 | 102 | Keyword argument: 103 | param_line -- one parsed line within a layer block 104 | """ 105 | param_line = param_line.replace(' ', '') 106 | param_type, param_value_raw = param_line.split('=') 107 | param_value = None 108 | if param_type == 'layers': 109 | layer_indexes = list() 110 | for index in param_value_raw.split(','): 111 | layer_indexes.append(int(index)) 112 | param_value = layer_indexes 113 | elif isinstance(param_value_raw, str) and not param_value_raw.isalpha(): 114 | condition_param_value_positive = param_value_raw.isdigit() 115 | condition_param_value_negative = param_value_raw[0] == '-' and \ 116 | param_value_raw[1:].isdigit() 117 | if condition_param_value_positive or condition_param_value_negative: 118 | param_value = int(param_value_raw) 119 | else: 120 | param_value = float(param_value_raw) 121 | else: 122 | param_value = str(param_value_raw) 123 | return param_type, param_value 124 | 125 | 126 | class MajorNodeSpecs(object): 127 | """Helper class used to store the names of ONNX output names, 128 | corresponding to the output of a DarkNet layer and its output channels. 129 | Some DarkNet layers are not created and there is no corresponding ONNX node, 130 | but we still need to track them in order to set up skip connections. 131 | """ 132 | 133 | def __init__(self, name, channels): 134 | """ Initialize a MajorNodeSpecs object. 135 | 136 | Keyword arguments: 137 | name -- name of the ONNX node 138 | channels -- number of output channels of this node 139 | """ 140 | self.name = name 141 | self.channels = channels 142 | self.created_onnx_node = False 143 | if name is not None and isinstance(channels, int) and channels > 0: 144 | self.created_onnx_node = True 145 | 146 | 147 | class ConvParams(object): 148 | """Helper class to store the hyper parameters of a Conv layer, 149 | including its prefix name in the ONNX graph and the expected dimensions 150 | of weights for convolution, bias, and batch normalization. 151 | 152 | Additionally acts as a wrapper for generating safe names for all 153 | weights, checking on feasible combinations. 154 | """ 155 | 156 | def __init__(self, node_name, batch_normalize, conv_weight_dims): 157 | """Constructor based on the base node name (e.g. 
101_convolutional), the batch 158 | normalization setting, and the convolutional weights shape. 159 | 160 | Keyword arguments: 161 | node_name -- base name of this YOLO convolutional layer 162 | batch_normalize -- bool value if batch normalization is used 163 | conv_weight_dims -- the dimensions of this layer's convolutional weights 164 | """ 165 | self.node_name = node_name 166 | self.batch_normalize = batch_normalize 167 | assert len(conv_weight_dims) == 4 168 | self.conv_weight_dims = conv_weight_dims 169 | 170 | def generate_param_name(self, param_category, suffix): 171 | """Generates a name based on two string inputs, 172 | and checks if the combination is valid.""" 173 | assert suffix 174 | assert param_category in ['bn', 'conv'] 175 | assert(suffix in ['scale', 'mean', 'var', 'weights', 'bias']) 176 | if param_category == 'bn': 177 | assert self.batch_normalize 178 | assert suffix in ['scale', 'bias', 'mean', 'var'] 179 | elif param_category == 'conv': 180 | assert suffix in ['weights', 'bias'] 181 | if suffix == 'bias': 182 | assert not self.batch_normalize 183 | param_name = self.node_name + '_' + param_category + '_' + suffix 184 | return param_name 185 | 186 | 187 | class WeightLoader(object): 188 | """Helper class used for loading the serialized weights of a binary file stream 189 | and returning the initializers and the input tensors required for populating 190 | the ONNX graph with weights. 191 | """ 192 | 193 | def __init__(self, weights_file_path): 194 | """Initialized with a path to the YOLOv3 .weights file. 195 | 196 | Keyword argument: 197 | weights_file_path -- path to the weights file. 198 | """ 199 | self.weights_file = self._open_weights_file(weights_file_path) 200 | 201 | def load_conv_weights(self, conv_params): 202 | """Returns the initializers with weights from the weights file and 203 | the input tensors of a convolutional layer for all corresponding ONNX nodes. 204 | 205 | Keyword argument: 206 | conv_params -- a ConvParams object 207 | """ 208 | initializer = list() 209 | inputs = list() 210 | if conv_params.batch_normalize: 211 | bias_init, bias_input = self._create_param_tensors( 212 | conv_params, 'bn', 'bias') 213 | bn_scale_init, bn_scale_input = self._create_param_tensors( 214 | conv_params, 'bn', 'scale') 215 | bn_mean_init, bn_mean_input = self._create_param_tensors( 216 | conv_params, 'bn', 'mean') 217 | bn_var_init, bn_var_input = self._create_param_tensors( 218 | conv_params, 'bn', 'var') 219 | initializer.extend( 220 | [bn_scale_init, bias_init, bn_mean_init, bn_var_init]) 221 | inputs.extend([bn_scale_input, bias_input, 222 | bn_mean_input, bn_var_input]) 223 | else: 224 | bias_init, bias_input = self._create_param_tensors( 225 | conv_params, 'conv', 'bias') 226 | initializer.append(bias_init) 227 | inputs.append(bias_input) 228 | conv_init, conv_input = self._create_param_tensors( 229 | conv_params, 'conv', 'weights') 230 | initializer.append(conv_init) 231 | inputs.append(conv_input) 232 | return initializer, inputs 233 | 234 | def _open_weights_file(self, weights_file_path): 235 | """Opens a YOLOv3 DarkNet file stream and skips the header. 236 | 237 | Keyword argument: 238 | weights_file_path -- path to the weights file. 
239 | """ 240 | weights_file = open(weights_file_path, 'rb') 241 | length_header = 5 242 | np.ndarray( 243 | shape=(length_header, ), dtype='int32', buffer=weights_file.read( 244 | length_header * 4)) 245 | return weights_file 246 | 247 | def _create_param_tensors(self, conv_params, param_category, suffix): 248 | """Creates the initializers with weights from the weights file together with 249 | the input tensors. 250 | 251 | Keyword arguments: 252 | conv_params -- a ConvParams object 253 | param_category -- the category of parameters to be created ('bn' or 'conv') 254 | suffix -- a string determining the sub-type of above param_category (e.g., 255 | 'weights' or 'bias') 256 | """ 257 | param_name, param_data, param_data_shape = self._load_one_param_type( 258 | conv_params, param_category, suffix) 259 | 260 | initializer_tensor = helper.make_tensor( 261 | param_name, TensorProto.FLOAT, param_data_shape, param_data) 262 | input_tensor = helper.make_tensor_value_info( 263 | param_name, TensorProto.FLOAT, param_data_shape) 264 | return initializer_tensor, input_tensor 265 | 266 | def _load_one_param_type(self, conv_params, param_category, suffix): 267 | """Deserializes the weights from a file stream in the DarkNet order. 268 | 269 | Keyword arguments: 270 | conv_params -- a ConvParams object 271 | param_category -- the category of parameters to be created ('bn' or 'conv') 272 | suffix -- a string determining the sub-type of above param_category (e.g., 273 | 'weights' or 'bias') 274 | """ 275 | param_name = conv_params.generate_param_name(param_category, suffix) 276 | channels_out, channels_in, filter_h, filter_w = conv_params.conv_weight_dims 277 | if param_category == 'bn': 278 | param_shape = [channels_out] 279 | elif param_category == 'conv': 280 | if suffix == 'weights': 281 | param_shape = [channels_out, channels_in, filter_h, filter_w] 282 | elif suffix == 'bias': 283 | param_shape = [channels_out] 284 | param_size = np.product(np.array(param_shape)) 285 | param_data = np.ndarray( 286 | shape=param_shape, 287 | dtype='float32', 288 | buffer=self.weights_file.read(param_size * 4)) 289 | param_data = param_data.flatten().astype(float) 290 | return param_name, param_data, param_shape 291 | 292 | 293 | class GraphBuilderONNX(object): 294 | """Class for creating an ONNX graph from a previously generated list of layer dictionaries.""" 295 | 296 | def __init__(self, output_tensors): 297 | """Initialize with all DarkNet default parameters used creating YOLOv3, 298 | and specify the output tensors as an OrderedDict for their output dimensions 299 | with their names as keys. 300 | 301 | Keyword argument: 302 | output_tensors -- the output tensors as an OrderedDict containing the keys' 303 | output dimensions 304 | """ 305 | self.output_tensors = output_tensors 306 | self._nodes = list() 307 | self.graph_def = None 308 | self.input_tensor = None 309 | self.epsilon_bn = 1e-5 310 | self.momentum_bn = 0.99 311 | self.alpha_lrelu = 0.1 312 | self.param_dict = OrderedDict() 313 | self.major_node_specs = list() 314 | self.batch_size = 1 315 | 316 | def build_onnx_graph( 317 | self, 318 | layer_configs, 319 | weights_file_path, 320 | verbose=True): 321 | """Iterate over all layer configs (parsed from the DarkNet representation 322 | of YOLOv3-608), create an ONNX graph, populate it with weights from the weights 323 | file and return the graph definition. 
324 | 325 | Keyword arguments: 326 | layer_configs -- an OrderedDict object with all parsed layers' configurations 327 | weights_file_path -- location of the weights file 328 | verbose -- toggles if the graph is printed after creation (default: True) 329 | """ 330 | for layer_name in layer_configs.keys(): 331 | layer_dict = layer_configs[layer_name] 332 | major_node_specs = self._make_onnx_node(layer_name, layer_dict) 333 | if major_node_specs.name is not None: 334 | self.major_node_specs.append(major_node_specs) 335 | outputs = list() 336 | for tensor_name in self.output_tensors.keys(): 337 | output_dims = [self.batch_size, ] + \ 338 | self.output_tensors[tensor_name] 339 | output_tensor = helper.make_tensor_value_info( 340 | tensor_name, TensorProto.FLOAT, output_dims) 341 | outputs.append(output_tensor) 342 | inputs = [self.input_tensor] 343 | weight_loader = WeightLoader(weights_file_path) 344 | initializer = list() 345 | for layer_name in self.param_dict.keys(): 346 | _, layer_type = layer_name.split('_', 1) 347 | conv_params = self.param_dict[layer_name] 348 | assert layer_type == 'convolutional' 349 | initializer_layer, inputs_layer = weight_loader.load_conv_weights( 350 | conv_params) 351 | initializer.extend(initializer_layer) 352 | inputs.extend(inputs_layer) 353 | del weight_loader 354 | self.graph_def = helper.make_graph( 355 | nodes=self._nodes, 356 | name='YOLOv3-608', 357 | inputs=inputs, 358 | outputs=outputs, 359 | initializer=initializer 360 | ) 361 | if verbose: 362 | print(helper.printable_graph(self.graph_def)) 363 | model_def = helper.make_model(self.graph_def, 364 | producer_name='NVIDIA TensorRT sample') 365 | return model_def 366 | 367 | def _make_onnx_node(self, layer_name, layer_dict): 368 | """Take in a layer parameter dictionary, choose the correct function for 369 | creating an ONNX node and store the information important to graph creation 370 | as a MajorNodeSpec object. 371 | 372 | Keyword arguments: 373 | layer_name -- the layer's name (also the corresponding key in layer_configs) 374 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 375 | """ 376 | layer_type = layer_dict['type'] 377 | if self.input_tensor is None: 378 | if layer_type == 'net': 379 | major_node_output_name, major_node_output_channels = self._make_input_tensor( 380 | layer_name, layer_dict) 381 | major_node_specs = MajorNodeSpecs(major_node_output_name, 382 | major_node_output_channels) 383 | else: 384 | raise ValueError('The first node has to be of type "net".') 385 | else: 386 | node_creators = dict() 387 | node_creators['convolutional'] = self._make_conv_node 388 | node_creators['shortcut'] = self._make_shortcut_node 389 | node_creators['route'] = self._make_route_node 390 | node_creators['upsample'] = self._make_upsample_node 391 | 392 | if layer_type in node_creators.keys(): 393 | major_node_output_name, major_node_output_channels = \ 394 | node_creators[layer_type](layer_name, layer_dict) 395 | major_node_specs = MajorNodeSpecs(major_node_output_name, 396 | major_node_output_channels) 397 | else: 398 | print( 399 | 'Layer of type %s not supported, skipping ONNX node generation.' % 400 | layer_type) 401 | major_node_specs = MajorNodeSpecs(layer_name, 402 | None) 403 | return major_node_specs 404 | 405 | def _make_input_tensor(self, layer_name, layer_dict): 406 | """Create an ONNX input tensor from a 'net' layer and store the batch size. 
407 | 408 | Keyword arguments: 409 | layer_name -- the layer's name (also the corresponding key in layer_configs) 410 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 411 | """ 412 | batch_size = layer_dict['batch'] 413 | channels = layer_dict['channels'] 414 | height = layer_dict['height'] 415 | width = layer_dict['width'] 416 | self.batch_size = batch_size 417 | input_tensor = helper.make_tensor_value_info( 418 | str(layer_name), TensorProto.FLOAT, [ 419 | batch_size, channels, height, width]) 420 | self.input_tensor = input_tensor 421 | return layer_name, channels 422 | 423 | def _get_previous_node_specs(self, target_index=-1): 424 | """Get a previously generated ONNX node (skip those that were not generated). 425 | Target index can be passed for jumping to a specific index. 426 | 427 | Keyword arguments: 428 | target_index -- optional for jumping to a specific index (default: -1 for jumping 429 | to previous element) 430 | """ 431 | previous_node = None 432 | for node in self.major_node_specs[target_index::-1]: 433 | if node.created_onnx_node: 434 | previous_node = node 435 | break 436 | assert previous_node is not None 437 | return previous_node 438 | 439 | def _make_conv_node(self, layer_name, layer_dict): 440 | """Create an ONNX Conv node with optional batch normalization and 441 | activation nodes. 442 | 443 | Keyword arguments: 444 | layer_name -- the layer's name (also the corresponding key in layer_configs) 445 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 446 | """ 447 | previous_node_specs = self._get_previous_node_specs() 448 | inputs = [previous_node_specs.name] 449 | previous_channels = previous_node_specs.channels 450 | kernel_size = layer_dict['size'] 451 | stride = layer_dict['stride'] 452 | filters = layer_dict['filters'] 453 | batch_normalize = False 454 | if 'batch_normalize' in layer_dict.keys( 455 | ) and layer_dict['batch_normalize'] == 1: 456 | batch_normalize = True 457 | 458 | kernel_shape = [kernel_size, kernel_size] 459 | weights_shape = [filters, previous_channels] + kernel_shape 460 | conv_params = ConvParams(layer_name, batch_normalize, weights_shape) 461 | 462 | strides = [stride, stride] 463 | dilations = [1, 1] 464 | weights_name = conv_params.generate_param_name('conv', 'weights') 465 | inputs.append(weights_name) 466 | if not batch_normalize: 467 | bias_name = conv_params.generate_param_name('conv', 'bias') 468 | inputs.append(bias_name) 469 | 470 | conv_node = helper.make_node( 471 | 'Conv', 472 | inputs=inputs, 473 | outputs=[layer_name], 474 | kernel_shape=kernel_shape, 475 | strides=strides, 476 | auto_pad='SAME_LOWER', 477 | dilations=dilations, 478 | name=layer_name 479 | ) 480 | self._nodes.append(conv_node) 481 | inputs = [layer_name] 482 | layer_name_output = layer_name 483 | 484 | if batch_normalize: 485 | layer_name_bn = layer_name + '_bn' 486 | bn_param_suffixes = ['scale', 'bias', 'mean', 'var'] 487 | for suffix in bn_param_suffixes: 488 | bn_param_name = conv_params.generate_param_name('bn', suffix) 489 | inputs.append(bn_param_name) 490 | batchnorm_node = helper.make_node( 491 | 'BatchNormalization', 492 | inputs=inputs, 493 | outputs=[layer_name_bn], 494 | epsilon=self.epsilon_bn, 495 | momentum=self.momentum_bn, 496 | name=layer_name_bn 497 | ) 498 | self._nodes.append(batchnorm_node) 499 | inputs = [layer_name_bn] 500 | layer_name_output = layer_name_bn 501 | 502 | if layer_dict['activation'] == 'leaky': 503 | layer_name_lrelu = layer_name + '_lrelu' 504 | 505 | lrelu_node = 
helper.make_node( 506 | 'LeakyRelu', 507 | inputs=inputs, 508 | outputs=[layer_name_lrelu], 509 | name=layer_name_lrelu, 510 | alpha=self.alpha_lrelu 511 | ) 512 | self._nodes.append(lrelu_node) 513 | inputs = [layer_name_lrelu] 514 | layer_name_output = layer_name_lrelu 515 | elif layer_dict['activation'] == 'linear': 516 | pass 517 | else: 518 | print('Activation not supported.') 519 | 520 | self.param_dict[layer_name] = conv_params 521 | return layer_name_output, filters 522 | 523 | def _make_shortcut_node(self, layer_name, layer_dict): 524 | """Create an ONNX Add node with the shortcut properties from 525 | the DarkNet-based graph. 526 | 527 | Keyword arguments: 528 | layer_name -- the layer's name (also the corresponding key in layer_configs) 529 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 530 | """ 531 | shortcut_index = layer_dict['from'] 532 | activation = layer_dict['activation'] 533 | assert activation == 'linear' 534 | 535 | first_node_specs = self._get_previous_node_specs() 536 | second_node_specs = self._get_previous_node_specs( 537 | target_index=shortcut_index) 538 | assert first_node_specs.channels == second_node_specs.channels 539 | channels = first_node_specs.channels 540 | inputs = [first_node_specs.name, second_node_specs.name] 541 | shortcut_node = helper.make_node( 542 | 'Add', 543 | inputs=inputs, 544 | outputs=[layer_name], 545 | name=layer_name, 546 | ) 547 | self._nodes.append(shortcut_node) 548 | return layer_name, channels 549 | 550 | def _make_route_node(self, layer_name, layer_dict): 551 | """If the 'layers' parameter from the DarkNet configuration is only one index, continue 552 | node creation at the indicated (negative) index. Otherwise, create an ONNX Concat node 553 | with the route properties from the DarkNet-based graph. 554 | 555 | Keyword arguments: 556 | layer_name -- the layer's name (also the corresponding key in layer_configs) 557 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 558 | """ 559 | route_node_indexes = layer_dict['layers'] 560 | if len(route_node_indexes) == 1: 561 | split_index = route_node_indexes[0] 562 | assert split_index < 0 563 | # Increment by one because we skipped the YOLO layer: 564 | split_index += 1 565 | self.major_node_specs = self.major_node_specs[:split_index] 566 | layer_name = None 567 | channels = None 568 | else: 569 | inputs = list() 570 | channels = 0 571 | for index in route_node_indexes: 572 | if index > 0: 573 | # Increment by one because we count the input as a node (DarkNet 574 | # does not) 575 | index += 1 576 | route_node_specs = self._get_previous_node_specs( 577 | target_index=index) 578 | inputs.append(route_node_specs.name) 579 | channels += route_node_specs.channels 580 | assert inputs 581 | assert channels > 0 582 | 583 | route_node = helper.make_node( 584 | 'Concat', 585 | axis=1, 586 | inputs=inputs, 587 | outputs=[layer_name], 588 | name=layer_name, 589 | ) 590 | self._nodes.append(route_node) 591 | return layer_name, channels 592 | 593 | def _make_upsample_node(self, layer_name, layer_dict): 594 | """Create an ONNX Upsample node with the properties from 595 | the DarkNet-based graph. 
596 | 597 | Keyword arguments: 598 | layer_name -- the layer's name (also the corresponding key in layer_configs) 599 | layer_dict -- a layer parameter dictionary (one element of layer_configs) 600 | """ 601 | upsample_factor = float(layer_dict['stride']) 602 | previous_node_specs = self._get_previous_node_specs() 603 | inputs = [previous_node_specs.name] 604 | channels = previous_node_specs.channels 605 | assert channels > 0 606 | upsample_node = helper.make_node( 607 | 'Upsample', 608 | mode='nearest', 609 | # For ONNX versions <0.7.0, Upsample nodes accept different parameters than 'scales': 610 | scales=[1.0, 1.0, upsample_factor, upsample_factor], 611 | inputs=inputs, 612 | outputs=[layer_name], 613 | name=layer_name, 614 | ) 615 | self._nodes.append(upsample_node) 616 | return layer_name, channels 617 | 618 | def main(): 619 | """Run the DarkNet-to-ONNX conversion for YOLOv3-608.""" 620 | # Have to use python 2 due to hashlib compatibility 621 | if sys.version_info[0] > 2: 622 | raise Exception("This script is only compatible with python2, please re-run this script \ 623 | with python2. The rest of this sample can be run with either version of python") 624 | 625 | cfg_file_path = "yolov3-608.cfg" 626 | 627 | # These are the only layers DarkNetParser will extract parameters from. The three layers of 628 | # type 'yolo' are not parsed in detail because they are included in the post-processing later: 629 | supported_layers = ['net', 'convolutional', 'shortcut', 'route', 'upsample'] 630 | 631 | # Create a DarkNetParser object, and then use it to generate an OrderedDict with all 632 | # layers' configs from the cfg file: 633 | parser = DarkNetParser(supported_layers) 634 | layer_configs = parser.parse_cfg_file(cfg_file_path) 635 | # We do not need the parser anymore once we have layer_configs: 636 | del parser 637 | 638 | # The above layer_configs contain three output tensors whose shapes we need to 639 | # know (in CHW format): 640 | output_tensor_dims = OrderedDict() 641 | #yolo-v3(608*608) 642 | output_tensor_dims['082_convolutional'] = [255, 19, 19] 643 | output_tensor_dims['094_convolutional'] = [255, 38, 38] 644 | output_tensor_dims['106_convolutional'] = [255, 76, 76] 645 | #yolo-v3(416*416) 646 | # output_tensor_dims['082_convolutional'] = [255, 13, 13] 647 | # output_tensor_dims['094_convolutional'] = [255, 26, 26] 648 | # output_tensor_dims['106_convolutional'] = [255, 52, 52] 649 | 650 | # Create a GraphBuilderONNX object with the known output tensor dimensions: 651 | builder = GraphBuilderONNX(output_tensor_dims) 652 | 653 | weights_file_path = "yolov3-608.weights" 654 | 655 | # Now generate an ONNX graph with weights from the previously parsed layer configurations 656 | # and the weights file: 657 | yolov3_model_def = builder.build_onnx_graph( 658 | layer_configs=layer_configs, 659 | weights_file_path=weights_file_path, 660 | verbose=True) 661 | # Once we have the model definition, we do not need the builder anymore: 662 | del builder 663 | 664 | # Perform a sanity check on the ONNX model definition: 665 | onnx.checker.check_model(yolov3_model_def) 666 | 667 | # Serialize the generated ONNX graph to this file: 668 | output_file_path = 'yolov3-608.onnx' 669 | onnx.save(yolov3_model_def, output_file_path) 670 | 671 | if __name__ == '__main__': 672 | main() --------------------------------------------------------------------------------
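A quick way to sanity-check the exported model before running `onnx_to_tensorrt.py` is to reload `yolov3-608.onnx` and print its input and output shapes. The sketch below is not part of the original sample; it assumes the default file name written by `main()` above and relies only on the `onnx` package that the converter already imports. The three printed outputs should match `output_tensor_dims` from `main()` plus the batch dimension: `[1, 255, 19, 19]`, `[1, 255, 38, 38]`, and `[1, 255, 76, 76]`.

```python
# inspect_onnx.py -- minimal sketch (not part of the original sample), assuming
# yolov3-608.onnx was written to the current directory by yolov3_to_onnx.py.
from __future__ import print_function

import onnx

model = onnx.load('yolov3-608.onnx')   # parse the serialized ModelProto
onnx.checker.check_model(model)        # same sanity check main() performs

def dims(value_info):
    # Static dimensions of a graph input/output tensor (NCHW for this model).
    return [d.dim_value for d in value_info.type.tensor_type.shape.dim]

# GraphBuilderONNX lists every weight tensor as a graph input, so skip the
# initializers to find the single image input ('000_net').
initializer_names = {init.name for init in model.graph.initializer}
for inp in model.graph.input:
    if inp.name not in initializer_names:
        print('input :', inp.name, dims(inp))

for out in model.graph.output:
    # Expect the three YOLO heads: 082/094/106_convolutional.
    print('output:', out.name, dims(out))
```

If these shapes look right, the model is ready to be handed to `onnx_to_tensorrt.py` for engine building and detection.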