├── 000000020247.jpg ├── 000000046804.jpg ├── 000000079565.jpg ├── 000000081988.jpg ├── example_01.jpg ├── example_02.jpg ├── README.md ├── convert-onnx ├── convert_onnx.py ├── backbone.py ├── yolact.py └── config.py ├── main_yolact.py └── main.cpp /000000020247.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/000000020247.jpg -------------------------------------------------------------------------------- /000000046804.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/000000046804.jpg -------------------------------------------------------------------------------- /000000079565.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/000000079565.jpg -------------------------------------------------------------------------------- /000000081988.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/000000081988.jpg -------------------------------------------------------------------------------- /example_01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/example_01.jpg -------------------------------------------------------------------------------- /example_02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/example_02.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # yolact-opencv-dnn-cpp-python 2 | Deploy YOLACT instance segmentation with OpenCV; both C++ and Python versions of the program are included. 3 | 4 | Download the onnx file from Baidu Netdisk, 5 | link: https://pan.baidu.com/s/1509Cn70a4iPS4UuCC4sduw extraction code: 8tlj 6 | 7 | On May 1, the program that converts the model and generates the onnx file was committed; it is in the convert-onnx directory. The original .pth 8 | file can be downloaded from Baidu Netdisk, link: https://pan.baidu.com/s/1AVgOAPCChcQ0a46U7F7QlA 9 | extraction code: 8hsp 10 | -------------------------------------------------------------------------------- /convert-onnx/convert_onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import cv2 4 | from yolact import Yolact 5 | 6 | if __name__=='__main__': 7 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 8 | trained_model = 'yolact_base_54_800000.pth' 9 | net = Yolact() 10 | net.load_weights(trained_model) 11 | net.eval() 12 | net.to(device) 13 | 14 | output_onnx = os.path.splitext(trained_model)[0] + '.onnx' 15 | inputs = torch.randn(1, 3, 550, 550).to(device) 16 | print('convert',output_onnx,'begin') 17 | torch.onnx.export(net, inputs, output_onnx, verbose=False, opset_version=12, input_names=['image'], 18 | output_names=['loc', 'conf', 'mask', 'proto']) 19 | print('convert', output_onnx, 'to onnx finish!!!') 20 | 21 | try: 22 | dnnnet = cv2.dnn.readNet(output_onnx) 23 | print('read success') 24 | except: 25 | print('read failed') 26 | dnnnet = cv2.dnn.readNet(output_onnx) -------------------------------------------------------------------------------- /main_yolact.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import 
argparse 4 | 5 | COCO_CLASSES = ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 6 | 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 7 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 8 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 9 | 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 10 | 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 11 | 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 12 | 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 13 | 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 14 | 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 15 | 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 16 | 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 17 | 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 18 | 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 19 | 20 | colors = [ 21 | [56, 0, 255], 22 | [226, 255, 0], 23 | [0, 94, 255], 24 | [0, 37, 255], 25 | [0, 255, 94], 26 | [255, 226, 0], 27 | [0, 18, 255], 28 | [255, 151, 0], 29 | [170, 0, 255], 30 | [0, 255, 56], 31 | [255, 0, 75], 32 | [0, 75, 255], 33 | [0, 255, 169], 34 | [255, 0, 207], 35 | [75, 255, 0], 36 | [207, 0, 255], 37 | [37, 0, 255], 38 | [0, 207, 255], 39 | [94, 0, 255], 40 | [0, 255, 113], 41 | [255, 18, 0], 42 | [255, 0, 56], 43 | [18, 0, 255], 44 | [0, 255, 226], 45 | [170, 255, 0], 46 | [255, 0, 245], 47 | [151, 255, 0], 48 | [132, 255, 0], 49 | [75, 0, 255], 50 | [151, 0, 255], 51 | [0, 151, 255], 52 | [132, 0, 255], 53 | [0, 255, 245], 54 | [255, 132, 0], 55 | [226, 0, 255], 56 | [255, 37, 0], 57 | [207, 255, 0], 58 | [0, 255, 207], 59 | [94, 255, 0], 60 | [0, 226, 255], 61 | [56, 255, 0], 62 | [255, 94, 0], 63 | [255, 113, 0], 64 | [0, 132, 255], 65 | [255, 0, 132], 66 | [255, 170, 0], 67 | [255, 0, 188], 68 | [113, 255, 0], 69 | [245, 0, 255], 70 | [113, 0, 255], 71 | [255, 188, 0], 72 | [0, 113, 255], 73 | [255, 0, 0], 74 | [0, 56, 255], 75 | [255, 0, 113], 76 | [0, 255, 188], 77 | [255, 0, 94], 78 | [255, 0, 18], 79 | [18, 255, 0], 80 | [0, 255, 132], 81 | [0, 188, 255], 82 | [0, 245, 255], 83 | [0, 169, 255], 84 | [37, 255, 0], 85 | [255, 0, 151], 86 | [188, 0, 255], 87 | [0, 255, 37], 88 | [0, 255, 0], 89 | [255, 0, 170], 90 | [255, 0, 37], 91 | [255, 75, 0], 92 | [0, 0, 255], 93 | [255, 207, 0], 94 | [255, 0, 226], 95 | [255, 245, 0], 96 | [188, 255, 0], 97 | [0, 255, 18], 98 | [0, 255, 75], 99 | [0, 255, 151], 100 | [255, 56, 0], 101 | [245, 255, 0], 102 | ] 103 | 104 | class yolact(): 105 | def __init__(self, confThreshold=0.5, nmsThreshold=0.5, keep_top_k=200): 106 | self.target_size = 550 107 | self.MEANS = np.array([103.94, 116.78, 123.68], dtype=np.float32).reshape(1, 1, 3) 108 | self.STD = np.array([57.38, 57.12, 58.40], dtype=np.float32).reshape(1, 1, 3) 109 | self.net = cv2.dnn.readNet('yolact_base_54_800000.onnx') 110 | self.confidence_threshold = confThreshold 111 | self.nms_threshold = nmsThreshold 112 | self.keep_top_k = keep_top_k 113 | self.conv_ws = [69, 35, 18, 9, 5] 114 | self.conv_hs = [69, 35, 18, 9, 5] 115 | self.aspect_ratios = [1, 0.5, 2] 116 | self.scales = [24, 48, 96, 192, 384] 117 | self.variances = [0.1, 0.2] 118 | self.last_img_size = None 119 | self.priors = self.make_priors() 120 | 121 | def make_priors(self): 122 | """ Note that priors are [x,y,width,height] where (x,y) is the center of the box. 
""" 123 | if self.last_img_size != (self.target_size, self.target_size): 124 | prior_data = [] 125 | 126 | for conv_w, conv_h, scale in zip(self.conv_ws, self.conv_hs, self.scales): 127 | for i in range(conv_h): 128 | for j in range(conv_w): 129 | # +0.5 because priors are in center-size notation 130 | cx = (j + 0.5) / conv_w 131 | cy = (i + 0.5) / conv_h 132 | 133 | for ar in self.aspect_ratios: 134 | ar = np.sqrt(ar) 135 | 136 | w = scale * ar / self.target_size 137 | h = scale / ar / self.target_size 138 | 139 | # This is for backward compatability with a bug where I made everything square by accident 140 | h = w 141 | 142 | prior_data += [cx, cy, w, h] 143 | 144 | self.priors = np.array(prior_data).reshape(-1, 4) 145 | self.last_img_size = (self.target_size, self.target_size) 146 | return self.priors 147 | 148 | def decode(self, loc, priors, img_w, img_h): 149 | boxes = np.concatenate( 150 | ( 151 | priors[:, :2] + loc[:, :2] * self.variances[0] * priors[:, 2:], 152 | priors[:, 2:] * np.exp(loc[:, 2:] * self.variances[1]), 153 | ), 154 | 1, 155 | ) 156 | boxes[:, :2] -= boxes[:, 2:] / 2 157 | # boxes[:, 2:] += boxes[:, :2] 158 | 159 | # crop 160 | np.where(boxes[:, 0] < 0, 0, boxes[:, 0]) 161 | np.where(boxes[:, 1] < 0, 0, boxes[:, 1]) 162 | np.where(boxes[:, 2] > 1, 1, boxes[:, 2]) 163 | np.where(boxes[:, 3] > 1, 1, boxes[:, 3]) 164 | 165 | # decode to img size 166 | boxes[:, 0] *= img_w 167 | boxes[:, 1] *= img_h 168 | boxes[:, 2] = boxes[:, 2] * img_w + 1 169 | boxes[:, 3] = boxes[:, 3] * img_h + 1 170 | return boxes 171 | 172 | def detect(self, srcimg): 173 | img_h, img_w = srcimg.shape[:2] 174 | img = cv2.resize(srcimg, (self.target_size, self.target_size), interpolation=cv2.INTER_LINEAR).astype(np.float32) 175 | img = (img - self.MEANS) / self.STD 176 | 177 | blob = cv2.dnn.blobFromImage(img, swapRB=True) 178 | # Sets the input to the network 179 | self.net.setInput(blob) 180 | # Runs the forward pass to get output of the output layers 181 | loc_data, conf_preds, mask_data, proto_data = self.net.forward(self.net.getUnconnectedOutLayersNames()) 182 | 183 | cur_scores = conf_preds[:, 1:] 184 | num_class = cur_scores.shape[1] 185 | classid = np.argmax(cur_scores, axis=1) 186 | # conf_scores = np.max(cur_scores, axis=1) 187 | conf_scores = cur_scores[range(cur_scores.shape[0]), classid] 188 | 189 | # filte by confidence_threshold 190 | keep = conf_scores > self.confidence_threshold 191 | conf_scores = conf_scores[keep] 192 | classid = classid[keep] 193 | loc_data = loc_data[keep, :] 194 | prior_data = self.priors[keep, :] 195 | masks = mask_data[keep, :] 196 | boxes = self.decode(loc_data, prior_data, img_w, img_h) 197 | indices = cv2.dnn.NMSBoxes(boxes.tolist(), conf_scores.tolist(), self.confidence_threshold, self.nms_threshold , top_k=self.keep_top_k) 198 | for i in indices: 199 | idx = i[0] 200 | left, top, width, height = boxes[idx, :].astype(np.int32).tolist() 201 | cv2.rectangle(srcimg, (left, top), (left+width, top+height), (0, 0, 255), thickness=1) 202 | cv2.putText(srcimg, COCO_CLASSES[classid[idx]+1]+':'+str(round(conf_scores[idx], 2)), (left, top-5), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), thickness=2) 203 | 204 | # generate mask 205 | mask = proto_data @ masks[idx, :].reshape(-1,1) 206 | mask = 1 / (1 + np.exp(-mask)) ###sigmoid 207 | 208 | # Scale masks up to the full image 209 | mask = cv2.resize(mask.squeeze(), (img_w, img_h), interpolation=cv2.INTER_LINEAR) 210 | mask = mask > 0.5 211 | srcimg[mask] = srcimg[mask] * 0.5 + np.array(colors[classid[idx]+1]) * 0.5 212 
| return srcimg 213 | 214 | if __name__=='__main__': 215 | parser = argparse.ArgumentParser(description='YOLACT COCO Evaluation') 216 | parser.add_argument('--imgpath', default='000000046804.jpg', type=str, help='A path to an image to use for display.') 217 | parser.add_argument('--confThreshold', default=0.5, type=float, help='class confidence') 218 | parser.add_argument('--nmsThreshold', default=0.5, type=float, help='nms iou thresh') 219 | args = parser.parse_args() 220 | 221 | myyolact = yolact() 222 | srcimg = cv2.imread(args.imgpath) 223 | srcimg = myyolact.detect(srcimg) 224 | 225 | cv2.namedWindow('yolact', cv2.WINDOW_NORMAL) 226 | cv2.imshow('yolact', srcimg) 227 | cv2.waitKey(0) 228 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace cv; 9 | using namespace dnn; 10 | using namespace std; 11 | 12 | extern const char* class_names[] = { "background", 13 | "person", "bicycle", "car", "motorcycle", "airplane", "bus", 14 | "train", "truck", "boat", "traffic light", "fire hydrant", 15 | "stop sign", "parking meter", "bench", "bird", "cat", "dog", 16 | "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", 17 | "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", 18 | "skis", "snowboard", "sports ball", "kite", "baseball bat", 19 | "baseball glove", "skateboard", "surfboard", "tennis racket", 20 | "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", 21 | "banana", "apple", "sandwich", "orange", "broccoli", "carrot", 22 | "hot dog", "pizza", "donut", "cake", "chair", "couch", 23 | "potted plant", "bed", "dining table", "toilet", "tv", "laptop", 24 | "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", 25 | "toaster", "sink", "refrigerator", "book", "clock", "vase", 26 | "scissors", "teddy bear", "hair drier", "toothbrush" 27 | }; 28 | 29 | extern const unsigned char colors[81][3] = { 30 | {56, 0, 255}, 31 | {226, 255, 0}, 32 | {0, 94, 255}, 33 | {0, 37, 255}, 34 | {0, 255, 94}, 35 | {255, 226, 0}, 36 | {0, 18, 255}, 37 | {255, 151, 0}, 38 | {170, 0, 255}, 39 | {0, 255, 56}, 40 | {255, 0, 75}, 41 | {0, 75, 255}, 42 | {0, 255, 169}, 43 | {255, 0, 207}, 44 | {75, 255, 0}, 45 | {207, 0, 255}, 46 | {37, 0, 255}, 47 | {0, 207, 255}, 48 | {94, 0, 255}, 49 | {0, 255, 113}, 50 | {255, 18, 0}, 51 | {255, 0, 56}, 52 | {18, 0, 255}, 53 | {0, 255, 226}, 54 | {170, 255, 0}, 55 | {255, 0, 245}, 56 | {151, 255, 0}, 57 | {132, 255, 0}, 58 | {75, 0, 255}, 59 | {151, 0, 255}, 60 | {0, 151, 255}, 61 | {132, 0, 255}, 62 | {0, 255, 245}, 63 | {255, 132, 0}, 64 | {226, 0, 255}, 65 | {255, 37, 0}, 66 | {207, 255, 0}, 67 | {0, 255, 207}, 68 | {94, 255, 0}, 69 | {0, 226, 255}, 70 | {56, 255, 0}, 71 | {255, 94, 0}, 72 | {255, 113, 0}, 73 | {0, 132, 255}, 74 | {255, 0, 132}, 75 | {255, 170, 0}, 76 | {255, 0, 188}, 77 | {113, 255, 0}, 78 | {245, 0, 255}, 79 | {113, 0, 255}, 80 | {255, 188, 0}, 81 | {0, 113, 255}, 82 | {255, 0, 0}, 83 | {0, 56, 255}, 84 | {255, 0, 113}, 85 | {0, 255, 188}, 86 | {255, 0, 94}, 87 | {255, 0, 18}, 88 | {18, 255, 0}, 89 | {0, 255, 132}, 90 | {0, 188, 255}, 91 | {0, 245, 255}, 92 | {0, 169, 255}, 93 | {37, 255, 0}, 94 | {255, 0, 151}, 95 | {188, 0, 255}, 96 | {0, 255, 37}, 97 | {0, 255, 0}, 98 | {255, 0, 170}, 99 | {255, 0, 37}, 100 | {255, 75, 0}, 101 | {0, 0, 255}, 102 | 
{255, 207, 0}, 103 | {255, 0, 226}, 104 | {255, 245, 0}, 105 | {188, 255, 0}, 106 | {0, 255, 18}, 107 | {0, 255, 75}, 108 | {0, 255, 151}, 109 | {255, 56, 0}, 110 | {245, 255, 0} 111 | }; 112 | 113 | class yolact 114 | { 115 | public: 116 | yolact(float confThreshold, float nmsThreshold, const int keep_top_k = 200); 117 | void detect(Mat& srcimg); 118 | private: 119 | const int target_size = 550; 120 | const float MEANS[3] = { 123.68, 116.78, 103.94 }; 121 | const float STD[3] = { 58.40, 57.12, 57.38 }; 122 | float confidence_threshold; 123 | float nms_threshold; 124 | int keep_top_k; 125 | const int conv_ws[5] = { 69, 35, 18, 9, 5 }; 126 | const int conv_hs[5] = { 69, 35, 18, 9, 5 }; 127 | const float aspect_ratios[3] = { 1.f, 0.5f, 2.f }; 128 | const float scales[5] = { 24.f, 48.f, 96.f, 192.f, 384.f }; 129 | const float var[4] = { 0.1f, 0.1f, 0.2f, 0.2f }; 130 | const int mask_h = 138; 131 | const int mask_w = 138; 132 | int num_priors; 133 | float* priorbox; 134 | Net net; 135 | void normalize(Mat& img); 136 | void sigmoid(Mat& out, int length); 137 | }; 138 | 139 | yolact::yolact(float confThreshold, float nmsThreshold, const int keep_top_k) 140 | { 141 | this->confidence_threshold = confThreshold; 142 | this->nms_threshold = nmsThreshold; 143 | this->keep_top_k = keep_top_k; 144 | this->net = readNet("yolact_base_54_800000.onnx"); 145 | this->num_priors = 0; 146 | int p = 0; 147 | for (p = 0; p < 5; p++) 148 | { 149 | this->num_priors += this->conv_ws[p] * this->conv_hs[p] * 3; 150 | } 151 | this->priorbox = new float[4 * this->num_priors]; 152 | ////generate priorbox 153 | float* pb = priorbox; 154 | for (p = 0; p < 5; p++) 155 | { 156 | int conv_w = this->conv_ws[p]; 157 | int conv_h = this->conv_hs[p]; 158 | 159 | float scale = this->scales[p]; 160 | 161 | for (int i = 0; i < conv_h; i++) 162 | { 163 | for (int j = 0; j < conv_w; j++) 164 | { 165 | // +0.5 because priors are in center-size notation 166 | float cx = (j + 0.5f) / conv_w; 167 | float cy = (i + 0.5f) / conv_h; 168 | 169 | for (int k = 0; k < 3; k++) 170 | { 171 | float ar = aspect_ratios[k]; 172 | 173 | ar = sqrt(ar); 174 | 175 | float w = scale * ar / this->target_size; 176 | float h = scale / ar / this->target_size; 177 | 178 | // This is for backward compatability with a bug where I made everything square by accident 179 | // cfg.backbone.use_square_anchors: 180 | h = w; 181 | pb[0] = cx; 182 | pb[1] = cy; 183 | pb[2] = w; 184 | pb[3] = h; 185 | pb += 4; 186 | } 187 | } 188 | } 189 | } 190 | } 191 | 192 | void yolact::normalize(Mat& img) 193 | { 194 | img.convertTo(img, CV_32F); 195 | int i = 0, j = 0; 196 | for (i = 0; i < img.rows; i++) 197 | { 198 | float* pdata = (float*)(img.data + i * img.step); 199 | for (j = 0; j < img.cols; j++) 200 | { 201 | pdata[0] = (pdata[0] - this->MEANS[0]) / this->STD[0]; 202 | pdata[1] = (pdata[1] - this->MEANS[1]) / this->STD[1]; 203 | pdata[2] = (pdata[2] - this->MEANS[2]) / this->STD[2]; 204 | pdata += 3; 205 | } 206 | } 207 | } 208 | 209 | void yolact::sigmoid(Mat& out, int length) 210 | { 211 | float* pdata = (float*)(out.data); 212 | int i = 0; 213 | for (i = 0; i < length; i++) 214 | { 215 | pdata[i] = 1.0 / (1 + expf(-pdata[i])); 216 | } 217 | } 218 | 219 | void yolact::detect(Mat& srcimg) 220 | { 221 | int img_w = srcimg.cols; 222 | int img_h = srcimg.rows; 223 | Mat img; 224 | resize(srcimg, img, Size(this->target_size, this->target_size), INTER_LINEAR); 225 | cvtColor(img, img, COLOR_BGR2RGB); 226 | this->normalize(img); 227 | Mat blob = blobFromImage(img); 228 | 
this->net.setInput(blob); 229 | vector outs; 230 | this->net.forward(outs, this->net.getUnconnectedOutLayersNames()); 231 | 232 | /////generate proposals 233 | vector classIds; 234 | vector confidences; 235 | vector boxes; 236 | vector maskIds; 237 | const int num_class = outs[1].cols; 238 | for (int i = 0; i < this->num_priors; i++) 239 | { 240 | Mat scores = outs[1].row(i).colRange(1, num_class); 241 | Point classIdPoint; 242 | double score; 243 | // Get the value and location of the maximum score 244 | minMaxLoc(scores, 0, &score, 0, &classIdPoint); 245 | if (score > this->confidence_threshold) 246 | { 247 | const float* loc = (float*)outs[0].data + i * 4; 248 | const float* pb = this->priorbox + i * 4; 249 | float pb_cx = pb[0]; 250 | float pb_cy = pb[1]; 251 | float pb_w = pb[2]; 252 | float pb_h = pb[3]; 253 | 254 | float bbox_cx = var[0] * loc[0] * pb_w + pb_cx; 255 | float bbox_cy = var[1] * loc[1] * pb_h + pb_cy; 256 | float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w); 257 | float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h); 258 | float obj_x1 = bbox_cx - bbox_w * 0.5f; 259 | float obj_y1 = bbox_cy - bbox_h * 0.5f; 260 | float obj_x2 = bbox_cx + bbox_w * 0.5f; 261 | float obj_y2 = bbox_cy + bbox_h * 0.5f; 262 | 263 | // clip 264 | obj_x1 = max(min(obj_x1 * img_w, (float)(img_w - 1)), 0.f); 265 | obj_y1 = max(min(obj_y1 * img_h, (float)(img_h - 1)), 0.f); 266 | obj_x2 = max(min(obj_x2 * img_w, (float)(img_w - 1)), 0.f); 267 | obj_y2 = max(min(obj_y2 * img_h, (float)(img_h - 1)), 0.f); 268 | classIds.push_back(classIdPoint.x); 269 | confidences.push_back(score); 270 | boxes.push_back(Rect((int)obj_x1, (int)obj_y1, (int)(obj_x2 - obj_x1 + 1), (int)(obj_y2 - obj_y1 + 1))); 271 | maskIds.push_back(i); 272 | } 273 | } 274 | 275 | // Perform non maximum suppression to eliminate redundant overlapping boxes with 276 | // lower confidences 277 | vector indices; 278 | NMSBoxes(boxes, confidences, this->confidence_threshold, this->nms_threshold, indices, 1.f, this->keep_top_k); 279 | for (size_t i = 0; i < indices.size(); ++i) 280 | { 281 | int idx = indices[i]; 282 | Rect box = boxes[idx]; 283 | int xmax = box.x + box.width; 284 | int ymax = box.y + box.height; 285 | rectangle(srcimg, Point(box.x, box.y), Point(xmax, ymax), Scalar(0, 0, 255), 3); 286 | //Get the label for the class name and its confidence 287 | char text[256]; 288 | sprintf(text, "%s: %.2f", class_names[classIds[idx] + 1], confidences[idx]); 289 | 290 | 291 | //Display the label at the top of the bounding box 292 | int baseLine; 293 | Size labelSize = getTextSize(text, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); 294 | int ymin = max(box.y, labelSize.height); 295 | //rectangle(frame, Point(left, top - int(1.5 * labelSize.height)), Point(left + int(1.5 * labelSize.width), top + baseLine), Scalar(0, 255, 0), FILLED); 296 | putText(srcimg, text, Point(box.x, ymin), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1); 297 | 298 | Mat mask(this->mask_h, this->mask_w, CV_32FC1); 299 | mask = cv::Scalar(0.f); 300 | int channel = outs[2].cols; 301 | int area = this->mask_h * this->mask_w; 302 | float* coeff = (float*)outs[2].data + maskIds[idx] * channel; 303 | float* pm = (float*)mask.data; 304 | const float* pmaskmap = (float*)outs[3].data; 305 | for (int j = 0; j < area; j++) 306 | { 307 | for (int p = 0; p < channel; p++) 308 | { 309 | pm[j] += pmaskmap[p] * coeff[p]; 310 | } 311 | pmaskmap += channel; 312 | } 313 | 314 | this->sigmoid(mask, area); 315 | Mat mask2; 316 | resize(mask, mask2, Size(img_w, img_h)); 317 | // draw mask 
318 | for (int y = 0; y < img_h; y++) 319 | { 320 | const float* pmask = (float*)mask2.data + y * img_w; 321 | uchar* p = srcimg.data + y * img_w * 3; 322 | for (int x = 0; x < img_w; x++) 323 | { 324 | if (pmask[x] > 0.5) 325 | { 326 | p[0] = (uchar)(p[0] * 0.5 + colors[classIds[idx] + 1][0] * 0.5); 327 | p[1] = (uchar)(p[1] * 0.5 + colors[classIds[idx] + 1][1] * 0.5); 328 | p[2] = (uchar)(p[2] * 0.5 + colors[classIds[idx] + 1][2] * 0.5); 329 | } 330 | p += 3; 331 | } 332 | } 333 | } 334 | } 335 | 336 | int main() 337 | { 338 | yolact yolactnet(0.5, 0.5); 339 | 340 | string imgpath = "000000046804.jpg"; 341 | Mat srcimg = imread(imgpath); 342 | yolactnet.detect(srcimg); 343 | 344 | static const string kWinName = "Deep learning object detection in OpenCV"; 345 | namedWindow(kWinName, WINDOW_NORMAL); 346 | imshow(kWinName, srcimg); 347 | waitKey(0); 348 | destroyAllWindows(); 349 | } -------------------------------------------------------------------------------- /convert-onnx/backbone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import pickle 4 | from collections import OrderedDict 5 | 6 | class Bottleneck(nn.Module): 7 | """ Adapted from torchvision.models.resnet """ 8 | expansion = 4 9 | 10 | def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d, dilation=1, 11 | use_dcn=False): 12 | super(Bottleneck, self).__init__() 13 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, dilation=dilation) 14 | self.bn1 = norm_layer(planes) 15 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 16 | padding=dilation, bias=False, dilation=dilation) 17 | self.bn2 = norm_layer(planes) 18 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False, dilation=dilation) 19 | self.bn3 = norm_layer(planes * 4) 20 | self.relu = nn.ReLU(inplace=True) 21 | self.downsample = downsample 22 | self.stride = stride 23 | 24 | def forward(self, x): 25 | residual = x 26 | 27 | out = self.conv1(x) 28 | out = self.bn1(out) 29 | out = self.relu(out) 30 | 31 | out = self.conv2(out) 32 | out = self.bn2(out) 33 | out = self.relu(out) 34 | 35 | out = self.conv3(out) 36 | out = self.bn3(out) 37 | 38 | if self.downsample is not None: 39 | residual = self.downsample(x) 40 | 41 | out += residual 42 | out = self.relu(out) 43 | 44 | return out 45 | 46 | 47 | class ResNetBackbone(nn.Module): 48 | """ Adapted from torchvision.models.resnet """ 49 | 50 | def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layers=[], block=Bottleneck, 51 | norm_layer=nn.BatchNorm2d): 52 | super().__init__() 53 | 54 | # These will be populated by _make_layer 55 | self.num_base_layers = len(layers) 56 | self.layers = nn.ModuleList() 57 | self.channels = [] 58 | self.norm_layer = norm_layer 59 | self.dilation = 1 60 | self.atrous_layers = atrous_layers 61 | 62 | # From torchvision.models.resnet.Resnet 63 | self.inplanes = 64 64 | 65 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 66 | self.bn1 = norm_layer(64) 67 | self.relu = nn.ReLU(inplace=True) 68 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 69 | 70 | self._make_layer(block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval) 71 | self._make_layer(block, 128, layers[1], stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval) 72 | self._make_layer(block, 256, layers[2], stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval) 73 | 
self._make_layer(block, 512, layers[3], stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval) 74 | 75 | # This contains every module that should be initialized by loading in pretrained weights. 76 | # Any extra layers added onto this that won't be initialized by init_backbone will not be 77 | # in this list. That way, Yolact::init_weights knows which backbone weights to initialize 78 | # with xavier, and which ones to leave alone. 79 | self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] 80 | 81 | def _make_layer(self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1): 82 | """ Here one layer means a string of n Bottleneck blocks. """ 83 | downsample = None 84 | 85 | # This is actually just to create the connection between layers, and not necessarily to 86 | # downsample. Even if the second condition is met, it only downsamples when stride != 1 87 | if stride != 1 or self.inplanes != planes * block.expansion: 88 | if len(self.layers) in self.atrous_layers: 89 | self.dilation += 1 90 | stride = 1 91 | 92 | downsample = nn.Sequential( 93 | nn.Conv2d(self.inplanes, planes * block.expansion, 94 | kernel_size=1, stride=stride, bias=False, 95 | dilation=self.dilation), 96 | self.norm_layer(planes * block.expansion), 97 | ) 98 | 99 | layers = [] 100 | use_dcn = (dcn_layers >= blocks) 101 | layers.append(block(self.inplanes, planes, stride, downsample, self.norm_layer, self.dilation, use_dcn=use_dcn)) 102 | self.inplanes = planes * block.expansion 103 | for i in range(1, blocks): 104 | use_dcn = ((i + dcn_layers) >= blocks) and (i % dcn_interval == 0) 105 | layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn)) 106 | layer = nn.Sequential(*layers) 107 | 108 | self.channels.append(planes * block.expansion) 109 | self.layers.append(layer) 110 | 111 | return layer 112 | 113 | def forward(self, x): 114 | """ Returns a list of convouts for each layer. """ 115 | 116 | x = self.conv1(x) 117 | x = self.bn1(x) 118 | x = self.relu(x) 119 | x = self.maxpool(x) 120 | 121 | outs = [] 122 | for layer in self.layers: 123 | x = layer(x) 124 | outs.append(x) 125 | 126 | return tuple(outs) 127 | 128 | def init_backbone(self, path): 129 | """ Initializes the backbone weights for training. """ 130 | state_dict = torch.load(path) 131 | 132 | # Replace layer1 -> layers.0 etc. 133 | keys = list(state_dict) 134 | for key in keys: 135 | if key.startswith('layer'): 136 | idx = int(key[5]) 137 | new_key = 'layers.' + str(idx - 1) + key[6:] 138 | state_dict[new_key] = state_dict.pop(key) 139 | 140 | # Note: Using strict=False is berry scary. Triple check this. 141 | self.load_state_dict(state_dict, strict=False) 142 | 143 | def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck): 144 | """ Add a downsample layer to the backbone as per what SSD does. """ 145 | self._make_layer(block, conv_channels // block.expansion, blocks=depth, stride=downsample) 146 | 147 | 148 | class ResNetBackboneGN(ResNetBackbone): 149 | 150 | def __init__(self, layers, num_groups=32): 151 | super().__init__(layers, norm_layer=lambda x: nn.GroupNorm(num_groups, x)) 152 | 153 | def init_backbone(self, path): 154 | """ The path here comes from detectron. So we load it differently. 
""" 155 | with open(path, 'rb') as f: 156 | state_dict = pickle.load(f, encoding='latin1') # From the detectron source 157 | state_dict = state_dict['blobs'] 158 | 159 | our_state_dict_keys = list(self.state_dict().keys()) 160 | new_state_dict = {} 161 | 162 | gn_trans = lambda x: ('gn_s' if x == 'weight' else 'gn_b') 163 | layeridx2res = lambda x: 'res' + str(int(x) + 2) 164 | block2branch = lambda x: 'branch2' + ('a', 'b', 'c')[int(x[-1:]) - 1] 165 | 166 | # Transcribe each Detectron weights name to a Yolact weights name 167 | for key in our_state_dict_keys: 168 | parts = key.split('.') 169 | transcribed_key = '' 170 | 171 | if (parts[0] == 'conv1'): 172 | transcribed_key = 'conv1_w' 173 | elif (parts[0] == 'bn1'): 174 | transcribed_key = 'conv1_' + gn_trans(parts[1]) 175 | elif (parts[0] == 'layers'): 176 | if int(parts[1]) >= self.num_base_layers: continue 177 | 178 | transcribed_key = layeridx2res(parts[1]) 179 | transcribed_key += '_' + parts[2] + '_' 180 | 181 | if parts[3] == 'downsample': 182 | transcribed_key += 'branch1_' 183 | 184 | if parts[4] == '0': 185 | transcribed_key += 'w' 186 | else: 187 | transcribed_key += gn_trans(parts[5]) 188 | else: 189 | transcribed_key += block2branch(parts[3]) + '_' 190 | 191 | if 'conv' in parts[3]: 192 | transcribed_key += 'w' 193 | else: 194 | transcribed_key += gn_trans(parts[4]) 195 | 196 | new_state_dict[key] = torch.Tensor(state_dict[transcribed_key]) 197 | 198 | # strict=False because we may have extra unitialized layers at this point 199 | self.load_state_dict(new_state_dict, strict=False) 200 | 201 | 202 | def darknetconvlayer(in_channels, out_channels, *args, **kwdargs): 203 | """ 204 | Implements a conv, activation, then batch norm. 205 | Arguments are passed into the conv layer. 206 | """ 207 | return nn.Sequential( 208 | nn.Conv2d(in_channels, out_channels, *args, **kwdargs, bias=False), 209 | nn.BatchNorm2d(out_channels), 210 | # Darknet uses 0.1 here. 211 | # See https://github.com/pjreddie/darknet/blob/680d3bde1924c8ee2d1c1dea54d3e56a05ca9a26/src/activations.h#L39 212 | nn.LeakyReLU(0.1, inplace=True) 213 | ) 214 | 215 | 216 | class DarkNetBlock(nn.Module): 217 | """ Note: channels is the lesser of the two. The output will be expansion * channels. """ 218 | 219 | expansion = 2 220 | 221 | def __init__(self, in_channels, channels): 222 | super().__init__() 223 | 224 | self.conv1 = darknetconvlayer(in_channels, channels, kernel_size=1) 225 | self.conv2 = darknetconvlayer(channels, channels * self.expansion, kernel_size=3, padding=1) 226 | 227 | def forward(self, x): 228 | return self.conv2(self.conv1(x)) + x 229 | 230 | 231 | class DarkNetBackbone(nn.Module): 232 | """ 233 | An implementation of YOLOv3's Darnet53 in 234 | https://pjreddie.com/media/files/papers/YOLOv3.pdf 235 | 236 | This is based off of the implementation of Resnet above. 
237 | """ 238 | 239 | def __init__(self, layers=[1, 2, 8, 8, 4], block=DarkNetBlock): 240 | super().__init__() 241 | 242 | # These will be populated by _make_layer 243 | self.num_base_layers = len(layers) 244 | self.layers = nn.ModuleList() 245 | self.channels = [] 246 | 247 | self._preconv = darknetconvlayer(3, 32, kernel_size=3, padding=1) 248 | self.in_channels = 32 249 | 250 | self._make_layer(block, 32, layers[0]) 251 | self._make_layer(block, 64, layers[1]) 252 | self._make_layer(block, 128, layers[2]) 253 | self._make_layer(block, 256, layers[3]) 254 | self._make_layer(block, 512, layers[4]) 255 | 256 | # This contains every module that should be initialized by loading in pretrained weights. 257 | # Any extra layers added onto this that won't be initialized by init_backbone will not be 258 | # in this list. That way, Yolact::init_weights knows which backbone weights to initialize 259 | # with xavier, and which ones to leave alone. 260 | self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] 261 | 262 | def _make_layer(self, block, channels, num_blocks, stride=2): 263 | """ Here one layer means a string of n blocks. """ 264 | layer_list = [] 265 | 266 | # The downsample layer 267 | layer_list.append( 268 | darknetconvlayer(self.in_channels, channels * block.expansion, 269 | kernel_size=3, padding=1, stride=stride)) 270 | 271 | # Each block inputs channels and outputs channels * expansion 272 | self.in_channels = channels * block.expansion 273 | layer_list += [block(self.in_channels, channels) for _ in range(num_blocks)] 274 | 275 | self.channels.append(self.in_channels) 276 | self.layers.append(nn.Sequential(*layer_list)) 277 | 278 | def forward(self, x): 279 | """ Returns a list of convouts for each layer. """ 280 | 281 | x = self._preconv(x) 282 | 283 | outs = [] 284 | for layer in self.layers: 285 | x = layer(x) 286 | outs.append(x) 287 | 288 | return tuple(outs) 289 | 290 | def add_layer(self, conv_channels=1024, stride=2, depth=1, block=DarkNetBlock): 291 | """ Add a downsample layer to the backbone as per what SSD does. """ 292 | self._make_layer(block, conv_channels // block.expansion, num_blocks=depth, stride=stride) 293 | 294 | def init_backbone(self, path): 295 | """ Initializes the backbone weights for training. """ 296 | # Note: Using strict=False is berry scary. Triple check this. 297 | self.load_state_dict(torch.load(path), strict=False) 298 | 299 | 300 | class VGGBackbone(nn.Module): 301 | """ 302 | Args: 303 | - cfg: A list of layers given as lists. Layers can be either 'M' signifying 304 | a max pooling layer, a number signifying that many feature maps in 305 | a conv layer, or a tuple of 'M' or a number and a kwdargs dict to pass 306 | into the function that creates the layer (e.g. nn.MaxPool2d for 'M'). 307 | - extra_args: A list of lists of arguments to pass into add_layer. 308 | - norm_layers: Layers indices that need to pass through an l2norm layer. 309 | """ 310 | 311 | def __init__(self, cfg, extra_args=[], norm_layers=[]): 312 | super().__init__() 313 | 314 | self.channels = [] 315 | self.layers = nn.ModuleList() 316 | self.in_channels = 3 317 | self.extra_args = list(reversed(extra_args)) # So I can use it as a stack 318 | 319 | # Keeps track of what the corresponding key will be in the state dict of the 320 | # pretrained model. For instance, layers.0.2 for us is 2 for the pretrained 321 | # model but layers.1.1 is 5. 
322 | self.total_layer_count = 0 323 | self.state_dict_lookup = {} 324 | 325 | for idx, layer_cfg in enumerate(cfg): 326 | self._make_layer(layer_cfg) 327 | 328 | self.norms = nn.ModuleList([nn.BatchNorm2d(self.channels[l]) for l in norm_layers]) 329 | self.norm_lookup = {l: idx for idx, l in enumerate(norm_layers)} 330 | 331 | # These modules will be initialized by init_backbone, 332 | # so don't overwrite their initialization later. 333 | self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] 334 | 335 | def _make_layer(self, cfg): 336 | """ 337 | Each layer is a sequence of conv layers usually preceded by a max pooling. 338 | Adapted from torchvision.models.vgg.make_layers. 339 | """ 340 | 341 | layers = [] 342 | 343 | for v in cfg: 344 | # VGG in SSD requires some special layers, so allow layers to be tuples of 345 | # (, kwdargs dict) 346 | args = None 347 | if isinstance(v, tuple): 348 | args = v[1] 349 | v = v[0] 350 | 351 | # v should be either M or a number 352 | if v == 'M': 353 | # Set default arguments 354 | if args is None: 355 | args = {'kernel_size': 2, 'stride': 2} 356 | 357 | layers.append(nn.MaxPool2d(**args)) 358 | else: 359 | # See the comment in __init__ for an explanation of this 360 | cur_layer_idx = self.total_layer_count + len(layers) 361 | self.state_dict_lookup[cur_layer_idx] = '%d.%d' % (len(self.layers), len(layers)) 362 | 363 | # Set default arguments 364 | if args is None: 365 | args = {'kernel_size': 3, 'padding': 1} 366 | 367 | # Add the layers 368 | layers.append(nn.Conv2d(self.in_channels, v, **args)) 369 | layers.append(nn.ReLU(inplace=True)) 370 | self.in_channels = v 371 | 372 | self.total_layer_count += len(layers) 373 | self.channels.append(self.in_channels) 374 | self.layers.append(nn.Sequential(*layers)) 375 | 376 | def forward(self, x): 377 | """ Returns a list of convouts for each layer. """ 378 | outs = [] 379 | 380 | for idx, layer in enumerate(self.layers): 381 | x = layer(x) 382 | 383 | # Apply an l2norm module to the selected layers 384 | # Note that this differs from the original implemenetation 385 | if idx in self.norm_lookup: 386 | x = self.norms[self.norm_lookup[idx]](x) 387 | outs.append(x) 388 | 389 | return tuple(outs) 390 | 391 | def transform_key(self, k): 392 | """ Transform e.g. features.24.bias to layers.4.1.bias """ 393 | vals = k.split('.') 394 | layerIdx = self.state_dict_lookup[int(vals[0])] 395 | return 'layers.%s.%s' % (layerIdx, vals[1]) 396 | 397 | def init_backbone(self, path): 398 | """ Initializes the backbone weights for training. """ 399 | state_dict = torch.load(path) 400 | state_dict = OrderedDict([(self.transform_key(k), v) for k, v in state_dict.items()]) 401 | 402 | self.load_state_dict(state_dict, strict=False) 403 | 404 | def add_layer(self, conv_channels=128, downsample=2): 405 | """ Add a downsample layer to the backbone as per what SSD does. 
""" 406 | if len(self.extra_args) > 0: 407 | conv_channels, downsample = self.extra_args.pop() 408 | 409 | padding = 1 if downsample > 1 else 0 410 | 411 | layer = nn.Sequential( 412 | nn.Conv2d(self.in_channels, conv_channels, kernel_size=1), 413 | nn.ReLU(inplace=True), 414 | nn.Conv2d(conv_channels, conv_channels * 2, kernel_size=3, stride=downsample, padding=padding), 415 | nn.ReLU(inplace=True) 416 | ) 417 | 418 | self.in_channels = conv_channels * 2 419 | self.channels.append(self.in_channels) 420 | self.layers.append(layer) 421 | 422 | 423 | def construct_backbone(cfg): 424 | """ Constructs a backbone given a backbone config object (see config.py). """ 425 | backbone = cfg.type(*cfg.args) 426 | 427 | # Add downsampling layers until we reach the number we need 428 | num_layers = max(cfg.selected_layers) + 1 429 | 430 | while len(backbone.layers) < num_layers: 431 | backbone.add_layer() 432 | 433 | return backbone 434 | -------------------------------------------------------------------------------- /convert-onnx/yolact.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from itertools import product 6 | from math import sqrt 7 | from typing import List 8 | from collections import defaultdict 9 | from config import cfg, mask_type 10 | from backbone import construct_backbone 11 | 12 | # This is required for Pytorch 1.0.1 on Windows to initialize Cuda on some driver versions. 13 | # See the bug report here: https://github.com/pytorch/pytorch/issues/17108 14 | device = 'cpu' 15 | if torch.cuda.is_available(): 16 | torch.cuda.current_device() 17 | device = 'cuda' 18 | 19 | # As of March 10, 2019, Pytorch DataParallel still doesn't support JIT Script Modules 20 | # use_jit = torch.cuda.device_count() <= 1 21 | # if not use_jit: 22 | # print('Multiple GPUs detected! Turning off JIT.') 23 | # 24 | # ScriptModuleWrapper = torch.jit.ScriptModule if use_jit else nn.Module 25 | # script_method_wrapper = torch.jit.script_method if use_jit else lambda fn, _rcn=None: fn 26 | 27 | class Concat(nn.Module): 28 | def __init__(self, nets, extra_params): 29 | super().__init__() 30 | 31 | self.nets = nn.ModuleList(nets) 32 | self.extra_params = extra_params 33 | 34 | def forward(self, x): 35 | # Concat each along the channel dimension 36 | return torch.cat([net(x) for net in self.nets], dim=1, **self.extra_params) 37 | 38 | class InterpolateModule(nn.Module): 39 | """ 40 | This is a module version of F.interpolate (rip nn.Upsampling). 41 | Any arguments you give it just get passed along for the ride. 42 | """ 43 | 44 | def __init__(self, *args, **kwdargs): 45 | super().__init__() 46 | 47 | self.args = args 48 | self.kwdargs = kwdargs 49 | 50 | def forward(self, x): 51 | # return F.interpolate(x, *self.args, **self.kwdargs) 52 | return F.interpolate(x, size=(int(x.shape[2] * self.kwdargs['scale_factor']), int(x.shape[3] * self.kwdargs['scale_factor'])), mode=self.kwdargs['mode'], align_corners=self.kwdargs['align_corners']) 53 | 54 | def make_net(in_channels, conf, include_last_relu=True): 55 | def make_layer(layer_cfg): 56 | nonlocal in_channels 57 | 58 | # Possible patterns: 59 | # ( 256, 3, {}) -> conv 60 | # ( 256,-2, {}) -> deconv 61 | # (None,-2, {}) -> bilinear interpolate 62 | # ('cat',[],{}) -> concat the subnetworks in the list 63 | # 64 | # You know it would have probably been simpler just to adopt a 'c' 'd' 'u' naming scheme. 
65 | # Whatever, it's too late now. 66 | if isinstance(layer_cfg[0], str): 67 | layer_name = layer_cfg[0] 68 | 69 | if layer_name == 'cat': 70 | nets = [make_net(in_channels, x) for x in layer_cfg[1]] 71 | layer = Concat([net[0] for net in nets], layer_cfg[2]) 72 | num_channels = sum([net[1] for net in nets]) 73 | else: 74 | num_channels = layer_cfg[0] 75 | kernel_size = layer_cfg[1] 76 | 77 | if kernel_size > 0: 78 | layer = nn.Conv2d(in_channels, num_channels, kernel_size, **layer_cfg[2]) 79 | else: 80 | if num_channels is None: 81 | layer = InterpolateModule(scale_factor=-kernel_size, mode='bilinear', align_corners=False, **layer_cfg[2]) 82 | # layer = nn.Upsample(scale_factor=-kernel_size, mode='bilinear', align_corners=False) 83 | else: 84 | layer = nn.ConvTranspose2d(in_channels, num_channels, -kernel_size, **layer_cfg[2]) 85 | 86 | in_channels = num_channels if num_channels is not None else in_channels 87 | 88 | # Don't return a ReLU layer if we're doing an upsample. This probably doesn't affect anything 89 | # output-wise, but there's no need to go through a ReLU here. 90 | # Commented out for backwards compatibility with previous models 91 | # if num_channels is None: 92 | # return [layer] 93 | # else: 94 | return [layer, nn.ReLU(inplace=True)] 95 | 96 | # Use sum to concat together all the component layer lists 97 | net = sum([make_layer(x) for x in conf], []) 98 | if not include_last_relu: 99 | net = net[:-1] 100 | 101 | return nn.Sequential(*(net)), in_channels 102 | 103 | prior_cache = defaultdict(lambda: None) 104 | 105 | class PredictionModule(nn.Module): 106 | def __init__(self, in_channels, out_channels=1024, aspect_ratios=[[1]], scales=[1], parent=None, index=0): 107 | super().__init__() 108 | 109 | self.num_classes = cfg.num_classes 110 | self.mask_dim = cfg.mask_dim # Defined by Yolact 111 | self.num_priors = sum(len(x)*len(scales) for x in aspect_ratios) 112 | self.parent = [parent] # Don't include this in the state dict 113 | self.index = index 114 | self.num_heads = cfg.num_heads # Defined by Yolact 115 | 116 | if parent is None: 117 | self.upfeature, out_channels = make_net(in_channels, cfg.extra_head_net) 118 | 119 | self.bbox_layer = nn.Conv2d(out_channels, self.num_priors * 4, **cfg.head_layer_params) 120 | self.conf_layer = nn.Conv2d(out_channels, self.num_priors * self.num_classes, **cfg.head_layer_params) 121 | self.mask_layer = nn.Conv2d(out_channels, self.num_priors * self.mask_dim, **cfg.head_layer_params) 122 | 123 | # What is this ugly lambda doing in the middle of all this clean prediction module code? 124 | def make_extra(num_layers): 125 | if num_layers == 0: 126 | return lambda x: x 127 | else: 128 | # Looks more complicated than it is. 
This just creates an array of num_layers alternating conv-relu 129 | return nn.Sequential(*sum([[ 130 | nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), 131 | nn.ReLU(inplace=True) 132 | ] for _ in range(num_layers)], [])) 133 | 134 | self.bbox_extra, self.conf_extra, self.mask_extra = [make_extra(x) for x in cfg.extra_layers] 135 | 136 | self.aspect_ratios = aspect_ratios 137 | self.scales = scales 138 | 139 | self.priors = None 140 | self.last_conv_size = None 141 | self.last_img_size = None 142 | 143 | def forward(self, x): 144 | src = self if self.parent[0] is None else self.parent[0] 145 | x = src.upfeature(x) 146 | 147 | bbox_x = src.bbox_extra(x) 148 | conf_x = src.conf_extra(x) 149 | mask_x = src.mask_extra(x) 150 | 151 | bbox = src.bbox_layer(bbox_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, 4) 152 | conf = src.conf_layer(conf_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.num_classes) 153 | 154 | mask = src.mask_layer(mask_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.mask_dim) 155 | 156 | mask = torch.tanh(mask) 157 | 158 | # conv_h = x.size(2) 159 | # conv_w = x.size(3) 160 | # priors = self.make_priors(conv_h, conv_w, x.device) 161 | # preds = { 'loc': bbox, 'conf': conf, 'mask': mask, 'priors': priors } 162 | # return preds 163 | return bbox, conf, mask 164 | 165 | def make_priors(self, conv_h, conv_w, device): 166 | """ Note that priors are [x,y,width,height] where (x,y) is the center of the box. """ 167 | global prior_cache 168 | size = (conv_h, conv_w) 169 | 170 | if self.last_img_size != (cfg._tmp_img_w, cfg._tmp_img_h): 171 | prior_data = [] 172 | 173 | # Iteration order is important (it has to sync up with the convout) 174 | for j, i in product(range(conv_h), range(conv_w)): 175 | # +0.5 because priors are in center-size notation 176 | x = (i + 0.5) / conv_w 177 | y = (j + 0.5) / conv_h 178 | 179 | for ars in self.aspect_ratios: 180 | for scale in self.scales: 181 | for ar in ars: 182 | if not cfg.backbone.preapply_sqrt: 183 | ar = sqrt(ar) 184 | 185 | if cfg.backbone.use_pixel_scales: 186 | w = scale * ar / cfg.max_size 187 | h = scale / ar / cfg.max_size 188 | else: 189 | w = scale * ar / conv_w 190 | h = scale / ar / conv_h 191 | 192 | # This is for backward compatability with a bug where I made everything square by accident 193 | if cfg.backbone.use_square_anchors: 194 | h = w 195 | 196 | prior_data += [x, y, w, h] 197 | 198 | self.priors = torch.Tensor(prior_data).view(-1, 4).detach().to(device) 199 | # self.priors = torch.Tensor(prior_data).view(-1, 4).detach() 200 | self.priors.requires_grad = False 201 | self.last_img_size = (cfg._tmp_img_w, cfg._tmp_img_h) 202 | self.last_conv_size = (conv_w, conv_h) 203 | prior_cache[size] = None 204 | elif self.priors.device != device: 205 | # This whole weird situation is so that DataParalell doesn't copy the priors each iteration 206 | if prior_cache[size] is None: 207 | prior_cache[size] = {} 208 | 209 | if device not in prior_cache[size]: 210 | prior_cache[size][device] = self.priors.to(device) 211 | 212 | self.priors = prior_cache[size][device] 213 | 214 | return self.priors 215 | 216 | class FPN(nn.Module): 217 | """ 218 | Implements a general version of the FPN introduced in 219 | https://arxiv.org/pdf/1612.03144.pdf 220 | 221 | Parameters (in cfg.fpn): 222 | - num_features (int): The number of output features in the fpn layers. 223 | - interpolation_mode (str): The mode to pass to F.interpolate. 
224 | - num_downsample (int): The number of downsampled layers to add onto the selected layers. 225 | These extra layers are downsampled from the last selected layer. 226 | 227 | Args: 228 | - in_channels (list): For each conv layer you supply in the forward pass, 229 | how many features will it have? 230 | """ 231 | __constants__ = ['interpolation_mode', 'num_downsample', 'use_conv_downsample', 'relu_pred_layers', 232 | 'lat_layers', 'pred_layers', 'downsample_layers', 'relu_downsample_layers'] 233 | 234 | def __init__(self, in_channels): 235 | super().__init__() 236 | 237 | self.lat_layers = nn.ModuleList([ 238 | nn.Conv2d(x, cfg.fpn.num_features, kernel_size=1) 239 | for x in reversed(in_channels) 240 | ]) 241 | 242 | # This is here for backwards compatability 243 | padding = 1 if cfg.fpn.pad else 0 244 | self.pred_layers = nn.ModuleList([ 245 | nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=padding) 246 | for _ in in_channels 247 | ]) 248 | 249 | if cfg.fpn.use_conv_downsample: 250 | self.downsample_layers = nn.ModuleList([ 251 | nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=1, stride=2) 252 | for _ in range(cfg.fpn.num_downsample) 253 | ]) 254 | 255 | self.interpolation_mode = cfg.fpn.interpolation_mode 256 | self.num_downsample = cfg.fpn.num_downsample 257 | self.use_conv_downsample = cfg.fpn.use_conv_downsample 258 | self.relu_downsample_layers = cfg.fpn.relu_downsample_layers 259 | self.relu_pred_layers = cfg.fpn.relu_pred_layers 260 | 261 | # @script_method_wrapper 262 | def forward(self, convouts:List[torch.Tensor]): 263 | """ 264 | Args: 265 | - convouts (list): A list of convouts for the corresponding layers in in_channels. 266 | Returns: 267 | - A list of FPN convouts in the same order as x with extra downsample layers if requested. 268 | """ 269 | 270 | out = [] 271 | x = torch.zeros(1, device=convouts[0].device) 272 | for i in range(len(convouts)): 273 | out.append(x) 274 | 275 | # For backward compatability, the conv layers are stored in reverse but the input and output is 276 | # given in the correct order. Thus, use j=-i-1 for the input and output and i for the conv layers. 277 | j = len(convouts) 278 | for lat_layer in self.lat_layers: 279 | j -= 1 280 | 281 | if j < len(convouts) - 1: 282 | x = F.interpolate(x, size=(int(convouts[j].shape[2]), int(convouts[j].shape[3])), mode=self.interpolation_mode, align_corners=False) 283 | 284 | x = x + lat_layer(convouts[j]) 285 | out[j] = x 286 | 287 | # This janky second loop is here because TorchScript. 288 | j = len(convouts) 289 | for pred_layer in self.pred_layers: 290 | j -= 1 291 | out[j] = pred_layer(out[j]) 292 | 293 | if self.relu_pred_layers: 294 | F.relu(out[j], inplace=True) 295 | 296 | cur_idx = len(out) 297 | 298 | # In the original paper, this takes care of P6 299 | if self.use_conv_downsample: 300 | for downsample_layer in self.downsample_layers: 301 | out.append(downsample_layer(out[-1])) 302 | else: 303 | for idx in range(self.num_downsample): 304 | # Note: this is an untested alternative to out.append(out[-1][:, :, ::2, ::2]). Thanks TorchScript. 
305 | out.append(nn.functional.max_pool2d(out[-1], 1, stride=2)) 306 | 307 | if self.relu_downsample_layers: 308 | for idx in range(len(out) - cur_idx): 309 | out[idx] = F.relu(out[idx + cur_idx], inplace=False) 310 | 311 | return out 312 | 313 | class FastMaskIoUNet(nn.Module): 314 | 315 | def __init__(self): 316 | super().__init__() 317 | input_channels = 1 318 | last_layer = [(cfg.num_classes-1, 1, {})] 319 | self.maskiou_net, _ = make_net(input_channels, cfg.maskiou_net + last_layer, include_last_relu=True) 320 | 321 | def forward(self, x): 322 | x = self.maskiou_net(x) 323 | maskiou_p = F.max_pool2d(x, kernel_size=x.size()[2:]).squeeze(-1).squeeze(-1) 324 | 325 | return maskiou_p 326 | 327 | 328 | 329 | class Yolact(nn.Module): 330 | """ 331 | 332 | 333 | ██╗ ██╗ ██████╗ ██╗ █████╗ ██████╗████████╗ 334 | ╚██╗ ██╔╝██╔═══██╗██║ ██╔══██╗██╔════╝╚══██╔══╝ 335 | ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ 336 | ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ 337 | ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ 338 | ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ 339 | 340 | 341 | You can set the arguments by changing them in the backbone config object in config.py. 342 | 343 | Parameters (in cfg.backbone): 344 | - selected_layers: The indices of the conv layers to use for prediction. 345 | - pred_scales: A list with len(selected_layers) containing tuples of scales (see PredictionModule) 346 | - pred_aspect_ratios: A list of lists of aspect ratios with len(selected_layers) (see PredictionModule) 347 | """ 348 | 349 | def __init__(self): 350 | super().__init__() 351 | 352 | self.backbone = construct_backbone(cfg.backbone) 353 | # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early! 354 | if cfg.mask_type == mask_type.direct: 355 | cfg.mask_dim = cfg.mask_size**2 356 | elif cfg.mask_type == mask_type.lincomb: 357 | if cfg.mask_proto_use_grid: 358 | self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file)) 359 | self.num_grids = self.grid.size(0) 360 | else: 361 | self.num_grids = 0 362 | 363 | self.proto_src = cfg.mask_proto_src 364 | 365 | if self.proto_src is None: in_channels = 3 366 | elif cfg.fpn is not None: in_channels = cfg.fpn.num_features 367 | else: in_channels = self.backbone.channels[self.proto_src] 368 | in_channels += self.num_grids 369 | 370 | # The include_last_relu=false here is because we might want to change it to another function 371 | self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False) 372 | 373 | if cfg.mask_proto_bias: 374 | cfg.mask_dim += 1 375 | 376 | 377 | self.selected_layers = cfg.backbone.selected_layers 378 | src_channels = self.backbone.channels 379 | 380 | if cfg.use_maskiou: 381 | self.maskiou_net = FastMaskIoUNet() 382 | 383 | # if cfg.fpn is not None: 384 | # # Some hacky rewiring to accomodate the FPN 385 | # self.fpn = FPN([src_channels[i] for i in self.selected_layers]) 386 | # self.selected_layers = list(range(len(self.selected_layers) + cfg.fpn.num_downsample)) 387 | # src_channels = [cfg.fpn.num_features] * len(self.selected_layers) 388 | self.fpn = FPN([src_channels[i] for i in self.selected_layers]) 389 | self.selected_layers = list(range(len(self.selected_layers) + cfg.fpn.num_downsample)) 390 | src_channels = [cfg.fpn.num_features] * len(self.selected_layers) 391 | 392 | self.prediction_layers = nn.ModuleList() 393 | cfg.num_heads = len(self.selected_layers) 394 | 395 | for idx, layer_idx in enumerate(self.selected_layers): 396 | # If we're sharing prediction module weights, have every 
module's parent be the first one 397 | parent = None 398 | if cfg.share_prediction_module and idx > 0: 399 | parent = self.prediction_layers[0] 400 | 401 | pred = PredictionModule(src_channels[layer_idx], src_channels[layer_idx], 402 | aspect_ratios = cfg.backbone.pred_aspect_ratios[idx], 403 | scales = cfg.backbone.pred_scales[idx], 404 | parent = parent, 405 | index = idx) 406 | self.prediction_layers.append(pred) 407 | 408 | # Extra parameters for the extra losses 409 | if cfg.use_class_existence_loss: 410 | # This comes from the smallest layer selected 411 | # Also note that cfg.num_classes includes background 412 | self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1) 413 | 414 | if cfg.use_semantic_segmentation_loss: 415 | self.semantic_seg_conv = nn.Conv2d(src_channels[0], cfg.num_classes-1, kernel_size=1) 416 | 417 | # # For use in evaluation 418 | # self.detect = Detect(cfg.num_classes, bkg_label=0, top_k=cfg.nms_top_k, 419 | # conf_thresh=cfg.nms_conf_thresh, nms_thresh=cfg.nms_thresh) 420 | 421 | def load_weights(self, path): 422 | """ Loads weights from a compressed save file. """ 423 | state_dict = torch.load(path, map_location=device) 424 | 425 | # For backward compatability, remove these (the new variable is called layers) 426 | for key in list(state_dict.keys()): 427 | if key.startswith('backbone.layer') and not key.startswith('backbone.layers'): 428 | del state_dict[key] 429 | 430 | # Also for backward compatibility with v1.0 weights, do this check 431 | if key.startswith('fpn.downsample_layers.'): 432 | if cfg.fpn is not None and int(key.split('.')[2]) >= cfg.fpn.num_downsample: 433 | del state_dict[key] 434 | self.load_state_dict(state_dict) 435 | 436 | def forward(self, x): 437 | """ The input should be of size [batch_size, 3, img_h, img_w] """ 438 | _, _, img_h, img_w = x.size() 439 | cfg._tmp_img_h = img_h 440 | cfg._tmp_img_w = img_w 441 | 442 | outs = self.backbone(x) 443 | 444 | outs = [outs[i] for i in cfg.backbone.selected_layers] 445 | outs = self.fpn(outs) 446 | 447 | proto_x = x if self.proto_src is None else outs[self.proto_src] 448 | 449 | if self.num_grids > 0: 450 | grids = self.grid.repeat(proto_x.size(0), 1, 1, 1) 451 | proto_x = torch.cat([proto_x, grids], dim=1) 452 | 453 | proto_out = self.proto_net(proto_x) 454 | proto_out = F.relu(proto_out) 455 | 456 | # Move the features last so the multiplication is easy 457 | proto_out = proto_out.permute(0, 2, 3, 1).contiguous() 458 | loc, conf, mask = [], [], [] 459 | for idx, pred_layer in zip(self.selected_layers, self.prediction_layers): 460 | pred_x = outs[idx] 461 | # A hack for the way dataparallel works 462 | if cfg.share_prediction_module and pred_layer is not self.prediction_layers[0]: 463 | pred_layer.parent = [self.prediction_layers[0]] 464 | 465 | p = pred_layer(pred_x) ###loc, conf, mask 466 | loc.append(p[0]) 467 | conf.append(p[1]) 468 | mask.append(p[2]) 469 | loc = torch.cat(loc, -2) 470 | conf = torch.cat(conf, -2) 471 | mask = torch.cat(mask, -2) 472 | conf = F.softmax(conf, -1) 473 | 474 | _, *s = loc.shape 475 | loc = loc.view(s) 476 | _, *s = conf.shape 477 | conf = conf.view(s) 478 | _, *s = mask.shape 479 | mask = mask.view(s) 480 | _, *s = proto_out.shape 481 | proto_out = proto_out.view(s) 482 | return loc, conf, mask, proto_out 483 | -------------------------------------------------------------------------------- /convert-onnx/config.py: -------------------------------------------------------------------------------- 1 | from backbone import ResNetBackbone, 
VGGBackbone, ResNetBackboneGN, DarkNetBackbone 2 | from math import sqrt 3 | import torch 4 | 5 | # for making bounding boxes pretty 6 | COLORS = ((244, 67, 54), 7 | (233, 30, 99), 8 | (156, 39, 176), 9 | (103, 58, 183), 10 | (63, 81, 181), 11 | (33, 150, 243), 12 | (3, 169, 244), 13 | (0, 188, 212), 14 | (0, 150, 136), 15 | (76, 175, 80), 16 | (139, 195, 74), 17 | (205, 220, 57), 18 | (255, 235, 59), 19 | (255, 193, 7), 20 | (255, 152, 0), 21 | (255, 87, 34), 22 | (121, 85, 72), 23 | (158, 158, 158), 24 | (96, 125, 139)) 25 | 26 | # These are in BGR and are for ImageNet 27 | MEANS = (103.94, 116.78, 123.68) 28 | STD = (57.38, 57.12, 58.40) 29 | 30 | COCO_CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 31 | 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 32 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 33 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 34 | 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 35 | 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 36 | 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 37 | 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 38 | 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 39 | 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 40 | 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 41 | 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 42 | 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 43 | 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 44 | 45 | COCO_LABEL_MAP = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 46 | 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 47 | 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 48 | 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, 49 | 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 50 | 46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, 51 | 54: 49, 55: 50, 56: 51, 57: 52, 58: 53, 59: 54, 60: 55, 61: 56, 52 | 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, 53 | 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, 81: 72, 54 | 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80} 55 | 56 | 57 | # ----------------------- CONFIG CLASS ----------------------- # 58 | 59 | class Config(object): 60 | """ 61 | Holds the configuration for anything you want it to. 62 | To get the currently active config, call get_cfg(). 63 | 64 | To use, just do cfg.x instead of cfg['x']. 65 | I made this because doing cfg['x'] all the time is dumb. 66 | """ 67 | 68 | def __init__(self, config_dict): 69 | for key, val in config_dict.items(): 70 | self.__setattr__(key, val) 71 | 72 | def copy(self, new_config_dict={}): 73 | """ 74 | Copies this config into a new config object, making 75 | the changes given by new_config_dict. 76 | """ 77 | 78 | ret = Config(vars(self)) 79 | 80 | for key, val in new_config_dict.items(): 81 | ret.__setattr__(key, val) 82 | 83 | return ret 84 | 85 | def replace(self, new_config_dict): 86 | """ 87 | Copies new_config_dict into this config object. 88 | Note: new_config_dict can also be a config object. 
89 | """ 90 | if isinstance(new_config_dict, Config): 91 | new_config_dict = vars(new_config_dict) 92 | 93 | for key, val in new_config_dict.items(): 94 | self.__setattr__(key, val) 95 | 96 | def print(self): 97 | for k, v in vars(self).items(): 98 | print(k, ' = ', v) 99 | 100 | 101 | # ----------------------- DATASETS ----------------------- # 102 | 103 | dataset_base = Config({ 104 | 'name': 'Base Dataset', 105 | 106 | # Training images and annotations 107 | 'train_images': './data/coco/images/', 108 | 'train_info': 'path_to_annotation_file', 109 | 110 | # Validation images and annotations. 111 | 'valid_images': './data/coco/images/', 112 | 'valid_info': 'path_to_annotation_file', 113 | 114 | # Whether or not to load GT. If this is False, eval.py quantitative evaluation won't work. 115 | 'has_gt': True, 116 | 117 | # A list of names for each of you classes. 118 | 'class_names': COCO_CLASSES, 119 | 120 | # COCO class ids aren't sequential, so this is a bandage fix. If your ids aren't sequential, 121 | # provide a map from category_id -> index in class_names + 1 (the +1 is there because it's 1-indexed). 122 | # If not specified, this just assumes category ids start at 1 and increase sequentially. 123 | 'label_map': None 124 | }) 125 | 126 | coco2014_dataset = dataset_base.copy({ 127 | 'name': 'COCO 2014', 128 | 129 | 'train_info': './data/coco/annotations/instances_train2014.json', 130 | 'valid_info': './data/coco/annotations/instances_val2014.json', 131 | 132 | 'label_map': COCO_LABEL_MAP 133 | }) 134 | 135 | coco2017_dataset = dataset_base.copy({ 136 | 'name': 'COCO 2017', 137 | 138 | 'train_info': './data/coco/annotations/instances_train2017.json', 139 | 'valid_info': './data/coco/annotations/instances_val2017.json', 140 | 141 | 'label_map': COCO_LABEL_MAP 142 | }) 143 | 144 | coco2017_testdev_dataset = dataset_base.copy({ 145 | 'name': 'COCO 2017 Test-Dev', 146 | 147 | 'valid_info': './data/coco/annotations/image_info_test-dev2017.json', 148 | 'has_gt': False, 149 | 150 | 'label_map': COCO_LABEL_MAP 151 | }) 152 | 153 | PASCAL_CLASSES = ("aeroplane", "bicycle", "bird", "boat", "bottle", 154 | "bus", "car", "cat", "chair", "cow", "diningtable", 155 | "dog", "horse", "motorbike", "person", "pottedplant", 156 | "sheep", "sofa", "train", "tvmonitor") 157 | 158 | pascal_sbd_dataset = dataset_base.copy({ 159 | 'name': 'Pascal SBD 2012', 160 | 161 | 'train_images': './data/sbd/img', 162 | 'valid_images': './data/sbd/img', 163 | 164 | 'train_info': './data/sbd/pascal_sbd_train.json', 165 | 'valid_info': './data/sbd/pascal_sbd_val.json', 166 | 167 | 'class_names': PASCAL_CLASSES, 168 | }) 169 | 170 | # ----------------------- TRANSFORMS ----------------------- # 171 | 172 | resnet_transform = Config({ 173 | 'channel_order': 'RGB', 174 | 'normalize': True, 175 | 'subtract_means': False, 176 | 'to_float': False, 177 | }) 178 | 179 | vgg_transform = Config({ 180 | # Note that though vgg is traditionally BGR, 181 | # the channel order of vgg_reducedfc.pth is RGB. 
182 | 'channel_order': 'RGB', 183 | 'normalize': False, 184 | 'subtract_means': True, 185 | 'to_float': False, 186 | }) 187 | 188 | darknet_transform = Config({ 189 | 'channel_order': 'RGB', 190 | 'normalize': False, 191 | 'subtract_means': False, 192 | 'to_float': True, 193 | }) 194 | 195 | # ----------------------- BACKBONES ----------------------- # 196 | 197 | backbone_base = Config({ 198 | 'name': 'Base Backbone', 199 | 'path': 'path/to/pretrained/weights', 200 | 'type': object, 201 | 'args': tuple(), 202 | 'transform': resnet_transform, 203 | 204 | 'selected_layers': list(), 205 | 'pred_scales': list(), 206 | 'pred_aspect_ratios': list(), 207 | 208 | 'use_pixel_scales': False, 209 | 'preapply_sqrt': True, 210 | 'use_square_anchors': False, 211 | }) 212 | 213 | resnet101_backbone = backbone_base.copy({ 214 | 'name': 'ResNet101', 215 | 'path': 'resnet101_reducedfc.pth', 216 | 'type': ResNetBackbone, 217 | 'args': ([3, 4, 23, 3],), 218 | 'transform': resnet_transform, 219 | 220 | 'selected_layers': list(range(2, 8)), 221 | 'pred_scales': [[1]] * 6, 222 | 'pred_aspect_ratios': [[[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]]] * 6, 223 | }) 224 | 225 | resnet101_gn_backbone = backbone_base.copy({ 226 | 'name': 'ResNet101_GN', 227 | 'path': 'R-101-GN.pkl', 228 | 'type': ResNetBackboneGN, 229 | 'args': ([3, 4, 23, 3],), 230 | 'transform': resnet_transform, 231 | 232 | 'selected_layers': list(range(2, 8)), 233 | 'pred_scales': [[1]] * 6, 234 | 'pred_aspect_ratios': [[[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]]] * 6, 235 | }) 236 | 237 | resnet101_dcn_inter3_backbone = resnet101_backbone.copy({ 238 | 'name': 'ResNet101_DCN_Interval3', 239 | 'args': ([3, 4, 23, 3], [0, 4, 23, 3], 3), 240 | }) 241 | 242 | resnet50_backbone = resnet101_backbone.copy({ 243 | 'name': 'ResNet50', 244 | 'path': 'resnet50-19c8e357.pth', 245 | 'type': ResNetBackbone, 246 | 'args': ([3, 4, 6, 3],), 247 | 'transform': resnet_transform, 248 | }) 249 | 250 | resnet50_dcnv2_backbone = resnet50_backbone.copy({ 251 | 'name': 'ResNet50_DCNv2', 252 | 'args': ([3, 4, 6, 3], [0, 4, 6, 3]), 253 | }) 254 | 255 | darknet53_backbone = backbone_base.copy({ 256 | 'name': 'DarkNet53', 257 | 'path': 'darknet53.pth', 258 | 'type': DarkNetBackbone, 259 | 'args': ([1, 2, 8, 8, 4],), 260 | 'transform': darknet_transform, 261 | 262 | 'selected_layers': list(range(3, 9)), 263 | 'pred_scales': [[3.5, 4.95], [3.6, 4.90], [3.3, 4.02], [2.7, 3.10], [2.1, 2.37], [1.8, 1.92]], 264 | 'pred_aspect_ratios': [[[1, sqrt(2), 1 / sqrt(2), sqrt(3), 1 / sqrt(3)][:n], [1]] for n in [3, 5, 5, 5, 3, 3]], 265 | }) 266 | 267 | vgg16_arch = [[64, 64], 268 | ['M', 128, 128], 269 | ['M', 256, 256, 256], 270 | [('M', {'kernel_size': 2, 'stride': 2, 'ceil_mode': True}), 512, 512, 512], 271 | ['M', 512, 512, 512], 272 | [('M', {'kernel_size': 3, 'stride': 1, 'padding': 1}), 273 | (1024, {'kernel_size': 3, 'padding': 6, 'dilation': 6}), 274 | (1024, {'kernel_size': 1})]] 275 | 276 | vgg16_backbone = backbone_base.copy({ 277 | 'name': 'VGG16', 278 | 'path': 'vgg16_reducedfc.pth', 279 | 'type': VGGBackbone, 280 | 'args': (vgg16_arch, [(256, 2), (128, 2), (128, 1), (128, 1)], [3]), 281 | 'transform': vgg_transform, 282 | 283 | 'selected_layers': [3] + list(range(5, 10)), 284 | 'pred_scales': [[5, 4]] * 6, 285 | 'pred_aspect_ratios': [[[1], [1, sqrt(2), 1 / sqrt(2), sqrt(3), 1 / sqrt(3)][:n]] for n in [3, 5, 5, 5, 3, 3]], 286 | }) 287 | 288 | # ----------------------- MASK BRANCH TYPES ----------------------- # 289 | 290 | mask_type = 
Config({ 291 | # Direct produces masks directly as the output of each pred module. 292 | # This is denoted as fc-mask in the paper. 293 | # Parameters: mask_size, use_gt_bboxes 294 | 'direct': 0, 295 | 296 | # Lincomb produces coefficients as the output of each pred module then uses those coefficients 297 | # to linearly combine features from a prototype network to create image-sized masks. 298 | # Parameters: 299 | # - masks_to_train (int): Since we're producing (near) full image masks, it'd take too much 300 | # vram to backprop on every single mask. Thus we select only a subset. 301 | # - mask_proto_src (int): The input layer to the mask prototype generation network. This is an 302 | # index in backbone.layers. Use None to use the image itself instead. 303 | # - mask_proto_net (list): A list of layers in the mask proto network with the last one 304 | # being where the masks are taken from. Each conv layer is in 305 | # the form (num_features, kernel_size, **kwargs). An empty 306 | # list means to use the source for prototype masks. If the 307 | # kernel_size is negative, this creates a deconv layer instead. 308 | # If the kernel_size is negative and the num_features is None, 309 | # this creates a simple bilinear interpolation layer instead. 310 | # - mask_proto_bias (bool): Whether to include an extra coefficient that corresponds to a proto 311 | # mask of all ones. 312 | # - mask_proto_prototype_activation (func): The activation to apply to each prototype mask. 313 | # - mask_proto_mask_activation (func): After summing the prototype masks with the predicted 314 | # coeffs, what activation to apply to the final mask. 315 | # - mask_proto_coeff_activation (func): The activation to apply to the mask coefficients. 316 | # - mask_proto_crop (bool): If True, crop the mask with the predicted bbox during training. 317 | # - mask_proto_crop_expand (float): If cropping, the percent to expand the cropping bbox by 318 | # in each direction. This is to make the model less reliant 319 | # on perfect bbox predictions. 320 | # - mask_proto_loss (str [l1|disj]): If not None, apply an l1 or disjunctive regularization 321 | # loss directly to the prototype masks. 322 | # - mask_proto_binarize_downsampled_gt (bool): Binarize GT after downsampling during training? 323 | # - mask_proto_normalize_mask_loss_by_sqrt_area (bool): Whether to normalize mask loss by sqrt(sum(gt)) 324 | # - mask_proto_reweight_mask_loss (bool): Reweight mask loss such that background is divided by 325 | # #background and foreground is divided by #foreground. 326 | # - mask_proto_grid_file (str): The path to the grid file to use with the next option. 327 | # This should be a numpy.dump file with shape [numgrids, h, w] 328 | # where h and w are w.r.t. the mask_proto_src convout. 329 | # - mask_proto_use_grid (bool): Whether to add extra grid features to the proto_net input. 330 | # - mask_proto_coeff_gate (bool): Add an extra set of sigmoided coefficients that is multiplied 331 | # into the predicted coefficients in order to "gate" them. 332 | # - mask_proto_prototypes_as_features (bool): For each prediction module, downsample the prototypes 333 | # to the convout size of that module and supply the prototypes as input 334 | # in addition to the already supplied backbone features. 335 | # - mask_proto_prototypes_as_features_no_grad (bool): If the above is set, don't backprop gradients 336 | # to the prototypes from the network head.
337 | # - mask_proto_remove_empty_masks (bool): Remove masks that are downsampled to 0 during loss calculations. 338 | # - mask_proto_reweight_coeff (float): The coefficient to multiply the foreground pixels with if reweighting. 339 | # - mask_proto_coeff_diversity_loss (bool): Apply coefficient diversity loss on the coefficients so that the same 340 | # instance has similar coefficients. 341 | # - mask_proto_coeff_diversity_alpha (float): The weight to use for the coefficient diversity loss. 342 | # - mask_proto_normalize_emulate_roi_pooling (bool): Normalize the mask loss to emulate roi pooling's effect on loss. 343 | # - mask_proto_double_loss (bool): Whether to use the old loss in addition to any special new losses. 344 | # - mask_proto_double_loss_alpha (float): The alpha to weight the above loss. 345 | # - mask_proto_split_prototypes_by_head (bool): If true, this will give each prediction head its own prototypes. 346 | # - mask_proto_crop_with_pred_box (bool): Whether to crop with the predicted box or the gt box. 347 | 'lincomb': 1, 348 | }) 349 | 350 | # ----------------------- ACTIVATION FUNCTIONS ----------------------- # 351 | 352 | activation_func = Config({ 353 | 'tanh': torch.tanh, 354 | 'sigmoid': torch.sigmoid, 355 | 'softmax': lambda x: torch.nn.functional.softmax(x, dim=-1), 356 | 'relu': lambda x: torch.nn.functional.relu(x, inplace=True), 357 | 'none': lambda x: x, 358 | }) 359 | 360 | # ----------------------- FPN DEFAULTS ----------------------- # 361 | 362 | fpn_base = Config({ 363 | # The number of features to have in each FPN layer 364 | 'num_features': 256, 365 | 366 | # The upsampling mode used 367 | 'interpolation_mode': 'bilinear', 368 | 369 | # The number of extra layers to be produced by downsampling starting at P5 370 | 'num_downsample': 1, 371 | 372 | # Whether to down sample with a 3x3 stride 2 conv layer instead of just a stride 2 selection 373 | 'use_conv_downsample': False, 374 | 375 | # Whether to pad the pred layers with 1 on each side (I forgot to add this at the start) 376 | # This is just here for backwards compatibility 377 | 'pad': True, 378 | 379 | # Whether to add relu to the downsampled layers. 380 | 'relu_downsample_layers': False, 381 | 382 | # Whether to add relu to the regular layers 383 | 'relu_pred_layers': True, 384 | }) 385 | 386 | # ----------------------- CONFIG DEFAULTS ----------------------- # 387 | 388 | coco_base_config = Config({ 389 | 'dataset': coco2014_dataset, 390 | 'num_classes': 81, # This should include the background class 391 | 392 | 'max_iter': 400000, 393 | 394 | # The maximum number of detections for evaluation 395 | 'max_num_detections': 100, 396 | 397 | # dw' = momentum * dw - lr * (grad + decay * w) 398 | 'lr': 1e-3, 399 | 'momentum': 0.9, 400 | 'decay': 5e-4, 401 | 402 | # For each lr step, what to multiply the lr with 403 | 'gamma': 0.1, 404 | 'lr_steps': (280000, 360000, 400000), 405 | 406 | # Initial learning rate to linearly warmup from (if until > 0) 407 | 'lr_warmup_init': 1e-4, 408 | 409 | # If > 0 then increase the lr linearly from warmup_init to lr each iter for until iters 410 | 'lr_warmup_until': 500, 411 | 412 | # The terms to scale the respective loss by 413 | 'conf_alpha': 1, 414 | 'bbox_alpha': 1.5, 415 | 'mask_alpha': 0.4 / 256 * 140 * 140, # Some funky equation. Don't worry about it.
416 | 417 | # Eval.py sets this if you just want to run YOLACT as a detector 418 | 'eval_mask_branch': True, 419 | 420 | # Top_k examples to consider for NMS 421 | 'nms_top_k': 200, 422 | # Examples with confidence less than this are not considered by NMS 423 | 'nms_conf_thresh': 0.05, 424 | # Boxes with IoU overlap greater than this threshold will be culled during NMS 425 | 'nms_thresh': 0.5, 426 | 427 | # See mask_type for details. 428 | 'mask_type': mask_type.direct, 429 | 'mask_size': 16, 430 | 'masks_to_train': 100, 431 | 'mask_proto_src': None, 432 | 'mask_proto_net': [(256, 3, {}), (256, 3, {})], 433 | 'mask_proto_bias': False, 434 | 'mask_proto_prototype_activation': activation_func.relu, 435 | 'mask_proto_mask_activation': activation_func.sigmoid, 436 | 'mask_proto_coeff_activation': activation_func.tanh, 437 | 'mask_proto_crop': True, 438 | 'mask_proto_crop_expand': 0, 439 | 'mask_proto_loss': None, 440 | 'mask_proto_binarize_downsampled_gt': True, 441 | 'mask_proto_normalize_mask_loss_by_sqrt_area': False, 442 | 'mask_proto_reweight_mask_loss': False, 443 | 'mask_proto_grid_file': 'data/grid.npy', 444 | 'mask_proto_use_grid': False, 445 | 'mask_proto_coeff_gate': False, 446 | 'mask_proto_prototypes_as_features': False, 447 | 'mask_proto_prototypes_as_features_no_grad': False, 448 | 'mask_proto_remove_empty_masks': False, 449 | 'mask_proto_reweight_coeff': 1, 450 | 'mask_proto_coeff_diversity_loss': False, 451 | 'mask_proto_coeff_diversity_alpha': 1, 452 | 'mask_proto_normalize_emulate_roi_pooling': False, 453 | 'mask_proto_double_loss': False, 454 | 'mask_proto_double_loss_alpha': 1, 455 | 'mask_proto_split_prototypes_by_head': False, 456 | 'mask_proto_crop_with_pred_box': False, 457 | 458 | # SSD data augmentation parameters 459 | # Randomize hue, vibrance, etc. 460 | 'augment_photometric_distort': True, 461 | # Have a chance to scale down the image and pad (to emulate smaller detections) 462 | 'augment_expand': True, 463 | # Potentially sample a random crop from the image and put it in a random place 464 | 'augment_random_sample_crop': True, 465 | # Mirror the image with a probability of 1/2 466 | 'augment_random_mirror': True, 467 | # Flip the image vertically with a probability of 1/2 468 | 'augment_random_flip': False, 469 | # With uniform probability, rotate the image [0,90,180,270] degrees 470 | 'augment_random_rot90': False, 471 | 472 | # Discard detections with width and height smaller than this (in absolute width and height) 473 | 'discard_box_width': 4 / 550, 474 | 'discard_box_height': 4 / 550, 475 | 476 | # If using batchnorm anywhere in the backbone, freeze the batchnorm layer during training. 477 | # Note: any additional batch norm layers after the backbone will not be frozen. 478 | 'freeze_bn': False, 479 | 480 | # Set this to a config object if you want an FPN (inherit from fpn_base). See fpn_base for details. 481 | 'fpn': None, 482 | 483 | # Use the same weights for each network head 484 | 'share_prediction_module': False, 485 | 486 | # For hard negative mining, instead of using the negatives that are least confidently background, 487 | # use negatives that are most confidently not background.
488 | 'ohem_use_most_confident': False, 489 | 490 | # Use focal loss as described in https://arxiv.org/pdf/1708.02002.pdf instead of OHEM 491 | 'use_focal_loss': False, 492 | 'focal_loss_alpha': 0.25, 493 | 'focal_loss_gamma': 2, 494 | 495 | # The initial bias toward foreground objects, as specified in the focal loss paper 496 | 'focal_loss_init_pi': 0.01, 497 | 498 | # Keeps track of the average number of examples for each class, and weights the loss for that class accordingly. 499 | 'use_class_balanced_conf': False, 500 | 501 | # Whether to use sigmoid focal loss instead of softmax, all else being the same. 502 | 'use_sigmoid_focal_loss': False, 503 | 504 | # Use class[0] to be the objectness score and class[1:] to be the softmax predicted class. 505 | # Note: at the moment this is only implemented if use_focal_loss is on. 506 | 'use_objectness_score': False, 507 | 508 | # Adds a global pool + fc layer to the smallest selected layer that predicts the existence of each of the 80 classes. 509 | # This branch is only evaluated during training time and is just there for multitask learning. 510 | 'use_class_existence_loss': False, 511 | 'class_existence_alpha': 1, 512 | 513 | # Adds a 1x1 convolution directly to the biggest selected layer that predicts a semantic segmentation for each of the 80 classes. 514 | # This branch is only evaluated during training time and is just there for multitask learning. 515 | 'use_semantic_segmentation_loss': False, 516 | 'semantic_segmentation_alpha': 1, 517 | 518 | # Adds another branch to the network to predict Mask IoU. 519 | 'use_mask_scoring': False, 520 | 'mask_scoring_alpha': 1, 521 | 522 | # Match gt boxes using the Box2Pix change metric instead of the standard IoU metric. 523 | # Note that the threshold you set for iou_threshold should be negative with this setting on. 524 | 'use_change_matching': False, 525 | 526 | # Uses the same network format as mask_proto_net, except this time it's for adding extra head layers before the final 527 | # prediction in prediction modules. If this is None, no extra layers will be added. 528 | 'extra_head_net': None, 529 | 530 | # What params should the final head layers have (the ones that predict box, confidence, and mask coeffs) 531 | 'head_layer_params': {'kernel_size': 3, 'padding': 1}, 532 | 533 | # Add extra layers between the backbone and the network heads 534 | # The order is (bbox, conf, mask) 535 | 'extra_layers': (0, 0, 0), 536 | 537 | # During training, to match detections with gt, first compute the maximum gt IoU for each prior. 538 | # Then, any of those priors whose maximum overlap is over the positive threshold, mark as positive. 539 | # For any priors whose maximum is less than the negative iou threshold, mark them as negative. 540 | # The rest are neutral and not used in calculating the loss. 541 | 'positive_iou_threshold': 0.5, 542 | 'negative_iou_threshold': 0.5, 543 | 544 | # When using ohem, the ratio between positives and negatives (3 means 3 negatives to 1 positive) 545 | 'ohem_negpos_ratio': 3, 546 | 547 | # If less than 1, anchors treated as a negative that have a crowd iou over this threshold with 548 | # the crowd boxes will be treated as a neutral. 549 | 'crowd_iou_threshold': 1, 550 | 551 | # This is filled in at runtime by Yolact's __init__, so don't touch it 552 | 'mask_dim': None, 553 | 554 | # Input image size.
555 | 'max_size': 300, 556 | 557 | # Whether or not to do post processing on the cpu at test time 558 | 'force_cpu_nms': True, 559 | 560 | # Whether to use mask coefficient cosine similarity nms instead of bbox iou nms 561 | 'use_coeff_nms': False, 562 | 563 | # Whether or not to have a separate branch whose sole purpose is to act as the coefficients for coeff_diversity_loss 564 | # Remember to turn on coeff_diversity_loss, or these extra coefficients won't do anything! 565 | # To see their effect, also remember to turn on use_coeff_nms. 566 | 'use_instance_coeff': False, 567 | 'num_instance_coeffs': 64, 568 | 569 | # Whether or not to tie the mask loss / box loss to 0 570 | 'train_masks': True, 571 | 'train_boxes': True, 572 | # If enabled, the gt masks will be cropped using the gt bboxes instead of the predicted ones. 573 | # This speeds up training time considerably but results in much worse mAP at test time. 574 | 'use_gt_bboxes': False, 575 | 576 | # Whether or not to preserve aspect ratio when resizing the image. 577 | # If True, this will resize all images to be max_size^2 pixels in area while keeping aspect ratio. 578 | # If False, all images are resized to max_size x max_size 579 | 'preserve_aspect_ratio': False, 580 | 581 | # Whether or not to use the prediction module (c) from DSSD 582 | 'use_prediction_module': False, 583 | 584 | # Whether or not to use the predicted coordinate scheme from Yolo v2 585 | 'use_yolo_regressors': False, 586 | 587 | # For training, bboxes are considered "positive" if their anchors have a 0.5 IoU overlap 588 | # or greater with a ground truth box. If this is true, instead of using the anchor boxes 589 | # for this IoU computation, the matching function will use the predicted bbox coordinates. 590 | # Don't turn this on if you're not using yolo regressors! 591 | 'use_prediction_matching': False, 592 | 593 | # A list of settings to apply after the specified iteration. Each element of the list should look like 594 | # (iteration, config_dict) where config_dict is a dictionary you'd pass into a config object's init. 595 | 'delayed_settings': [], 596 | 597 | # Use command-line arguments to set this. 598 | 'no_jit': False, 599 | 600 | 'backbone': None, 601 | 'name': 'base_config', 602 | 603 | # Fast Mask Re-scoring Network 604 | # Inspired by Mask Scoring R-CNN (https://arxiv.org/abs/1903.00241) 605 | # Do not crop out the mask with bbox but slide a convnet on the image-size mask, 606 | # then use global pooling to get the final mask score 607 | 'use_maskiou': False, 608 | 609 | # Architecture for the mask iou network. A (num_classes-1, 1, {}) layer is appended to the end.
610 | 'maskiou_net': [], 611 | 612 | # Discard predicted masks whose area is less than this 613 | 'discard_mask_area': -1, 614 | 615 | 'maskiou_alpha': 1.0, 616 | 'rescore_mask': False, 617 | 'rescore_bbox': False, 618 | 'maskious_to_train': -1, 619 | }) 620 | 621 | # ----------------------- YOLACT v1.0 CONFIGS ----------------------- # 622 | 623 | yolact_base_config = coco_base_config.copy({ 624 | 'name': 'yolact_base', 625 | 626 | # Dataset stuff 627 | 'dataset': coco2017_dataset, 628 | 'num_classes': len(coco2017_dataset.class_names) + 1, 629 | 630 | # Image Size 631 | 'max_size': 550, 632 | 633 | # Training params 634 | 'lr_steps': (280000, 600000, 700000, 750000), 635 | 'max_iter': 800000, 636 | 637 | # Backbone Settings 638 | 'backbone': resnet101_backbone.copy({ 639 | 'selected_layers': list(range(1, 4)), 640 | 'use_pixel_scales': True, 641 | 'preapply_sqrt': False, 642 | 'use_square_anchors': True, # This is for backward compatibility with a bug 643 | 644 | 'pred_aspect_ratios': [[[1, 1 / 2, 2]]] * 5, 645 | 'pred_scales': [[24], [48], [96], [192], [384]], 646 | }), 647 | 648 | # FPN Settings 649 | 'fpn': fpn_base.copy({ 650 | 'use_conv_downsample': True, 651 | 'num_downsample': 2, 652 | }), 653 | 654 | # Mask Settings 655 | 'mask_type': mask_type.lincomb, 656 | 'mask_alpha': 6.125, 657 | 'mask_proto_src': 0, 658 | 'mask_proto_net': [(256, 3, {'padding': 1})] * 3 + [(None, -2, {}), (256, 3, {'padding': 1})] + [(32, 1, {})], 659 | 'mask_proto_normalize_emulate_roi_pooling': True, 660 | 661 | # Other stuff 662 | 'share_prediction_module': True, 663 | 'extra_head_net': [(256, 3, {'padding': 1})], 664 | 665 | 'positive_iou_threshold': 0.5, 666 | 'negative_iou_threshold': 0.4, 667 | 668 | 'crowd_iou_threshold': 0.7, 669 | 670 | 'use_semantic_segmentation_loss': True, 671 | }) 672 | 673 | yolact_im400_config = yolact_base_config.copy({ 674 | 'name': 'yolact_im400', 675 | 676 | 'max_size': 400, 677 | 'backbone': yolact_base_config.backbone.copy({ 678 | 'pred_scales': [[int(x[0] / yolact_base_config.max_size * 400)] for x in 679 | yolact_base_config.backbone.pred_scales], 680 | }), 681 | }) 682 | 683 | yolact_im700_config = yolact_base_config.copy({ 684 | 'name': 'yolact_im700', 685 | 686 | 'masks_to_train': 300, 687 | 'max_size': 700, 688 | 'backbone': yolact_base_config.backbone.copy({ 689 | 'pred_scales': [[int(x[0] / yolact_base_config.max_size * 700)] for x in 690 | yolact_base_config.backbone.pred_scales], 691 | }), 692 | }) 693 | 694 | yolact_darknet53_config = yolact_base_config.copy({ 695 | 'name': 'yolact_darknet53', 696 | 697 | 'backbone': darknet53_backbone.copy({ 698 | 'selected_layers': list(range(2, 5)), 699 | 700 | 'pred_scales': yolact_base_config.backbone.pred_scales, 701 | 'pred_aspect_ratios': yolact_base_config.backbone.pred_aspect_ratios, 702 | 'use_pixel_scales': True, 703 | 'preapply_sqrt': False, 704 | 'use_square_anchors': True, # This is for backward compatibility with a bug 705 | }), 706 | }) 707 | 708 | yolact_resnet50_config = yolact_base_config.copy({ 709 | 'name': 'yolact_resnet50', 710 | 711 | 'backbone': resnet50_backbone.copy({ 712 | 'selected_layers': list(range(1, 4)), 713 | 714 | 'pred_scales': yolact_base_config.backbone.pred_scales, 715 | 'pred_aspect_ratios': yolact_base_config.backbone.pred_aspect_ratios, 716 | 'use_pixel_scales': True, 717 | 'preapply_sqrt': False, 718 | 'use_square_anchors': True, # This is for backward compatibility with a bug 719 | }), 720 | }) 721 | 722 | yolact_resnet50_pascal_config = yolact_resnet50_config.copy({ 723 |
'name': None, # Will default to yolact_resnet50_pascal 724 | 725 | # Dataset stuff 726 | 'dataset': pascal_sbd_dataset, 727 | 'num_classes': len(pascal_sbd_dataset.class_names) + 1, 728 | 729 | 'max_iter': 120000, 730 | 'lr_steps': (60000, 100000), 731 | 732 | 'backbone': yolact_resnet50_config.backbone.copy({ 733 | 'pred_scales': [[32], [64], [128], [256], [512]], 734 | 'use_square_anchors': False, 735 | }) 736 | }) 737 | 738 | # ----------------------- YOLACT++ CONFIGS ----------------------- # 739 | 740 | yolact_plus_base_config = yolact_base_config.copy({ 741 | 'name': 'yolact_plus_base', 742 | 743 | 'backbone': resnet101_dcn_inter3_backbone.copy({ 744 | 'selected_layers': list(range(1, 4)), 745 | 746 | 'pred_aspect_ratios': [[[1, 1 / 2, 2]]] * 5, 747 | 'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]], 748 | 'use_pixel_scales': True, 749 | 'preapply_sqrt': False, 750 | 'use_square_anchors': False, 751 | }), 752 | 753 | 'use_maskiou': True, 754 | 'maskiou_net': [(8, 3, {'stride': 2}), (16, 3, {'stride': 2}), (32, 3, {'stride': 2}), (64, 3, {'stride': 2}), 755 | (128, 3, {'stride': 2})], 756 | 'maskiou_alpha': 25, 757 | 'rescore_bbox': False, 758 | 'rescore_mask': True, 759 | 760 | 'discard_mask_area': 5 * 5, 761 | }) 762 | 763 | yolact_plus_resnet50_config = yolact_plus_base_config.copy({ 764 | 'name': 'yolact_plus_resnet50', 765 | 766 | 'backbone': resnet50_dcnv2_backbone.copy({ 767 | 'selected_layers': list(range(1, 4)), 768 | 769 | 'pred_aspect_ratios': [[[1, 1 / 2, 2]]] * 5, 770 | 'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]], 771 | 'use_pixel_scales': True, 772 | 'preapply_sqrt': False, 773 | 'use_square_anchors': False, 774 | }), 775 | }) 776 | 777 | # Default config 778 | cfg = yolact_base_config.copy() 779 | 780 | 781 | def set_cfg(config_name: str): 782 | """ Sets the active config. Works even if cfg is already imported! """ 783 | global cfg 784 | 785 | # Note this is not just an eval because I'm lazy, but also because it can 786 | # be used like ssd300_config.copy({'max_size': 400}) for extreme fine-tuning 787 | cfg.replace(eval(config_name)) 788 | 789 | if cfg.name is None: 790 | cfg.name = config_name.split('_config')[0] 791 | 792 | 793 | def set_dataset(dataset_name: str): 794 | """ Sets the dataset of the current config. """ 795 | cfg.dataset = eval(dataset_name) 796 | 797 | --------------------------------------------------------------------------------
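
A minimal usage sketch of the Config mechanism defined in convert-onnx/config.py above. It assumes the convert-onnx directory is on the Python path so the file imports as `config`; the values in the comments simply restate the defaults shown in the listing and are not part of the repository.

from config import cfg, set_cfg, yolact_base_config

# cfg starts as a copy of yolact_base_config: 550x550 input, ResNet-101 backbone.
print(cfg.name, cfg.max_size)        # yolact_base 550

# Switch the active config by name at runtime (e.g. for the ResNet-50 variant).
set_cfg('yolact_resnet50_config')
print(cfg.backbone.name)             # ResNet50

# Derive a one-off variant without touching the global cfg,
# the same way yolact_im400_config is built from yolact_base_config.
im400 = yolact_base_config.copy({'max_size': 400})
print(im400.max_size)                # 400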