├── 000000020247.jpg ├── 000000046804.jpg ├── 000000079565.jpg ├── 000000081988.jpg ├── example_01.jpg ├── example_02.jpg ├── README.md ├── convert-onnx ├── convert_onnx.py ├── backbone.py ├── yolact.py └── config.py ├── main_yolact.py └── main.cpp /000000020247.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/000000020247.jpg -------------------------------------------------------------------------------- /000000046804.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/000000046804.jpg -------------------------------------------------------------------------------- /000000079565.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/000000079565.jpg -------------------------------------------------------------------------------- /000000081988.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/000000081988.jpg -------------------------------------------------------------------------------- /example_01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/example_01.jpg -------------------------------------------------------------------------------- /example_02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolact-opencv-dnn-cpp-python/HEAD/example_02.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # yolact-opencv-dnn-cpp-python 2 | Deploy YOLACT instance segmentation with OpenCV; both C++ and Python versions of the program are included. 3 | 4 | Download the onnx file from Baidu Netdisk, 5 | link: https://pan.baidu.com/s/1509Cn70a4iPS4UuCC4sduw extraction code: 8tlj 6 | 7 | On May 1, the program that converts the model and generates the onnx file was committed; it is in the convert-onnx directory. The original .pth 8 | file can be downloaded from Baidu Netdisk, link: https://pan.baidu.com/s/1AVgOAPCChcQ0a46U7F7QlA 9 | extraction code: 8hsp 10 | -------------------------------------------------------------------------------- /convert-onnx/convert_onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import cv2 4 | from yolact import Yolact 5 | 6 | if __name__=='__main__': 7 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 8 | trained_model = 'yolact_base_54_800000.pth' 9 | net = Yolact() 10 | net.load_weights(trained_model) 11 | net.eval() 12 | net.to(device) 13 | 14 | output_onnx = os.path.splitext(trained_model)[0] + '.onnx' 15 | inputs = torch.randn(1, 3, 550, 550).to(device) 16 | print('convert',output_onnx,'begin') 17 | torch.onnx.export(net, inputs, output_onnx, verbose=False, opset_version=12, input_names=['image'], 18 | output_names=['loc', 'conf', 'mask', 'proto']) 19 | print('convert', output_onnx, 'to onnx finish!!!') 20 | 21 | try: 22 | dnnnet = cv2.dnn.readNet(output_onnx) 23 | print('read success') 24 | except: 25 | print('read failed') 26 | dnnnet = cv2.dnn.readNet(output_onnx) -------------------------------------------------------------------------------- /main_yolact.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import 
argparse 4 | 5 | COCO_CLASSES = ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 6 | 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 7 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 8 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 9 | 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 10 | 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 11 | 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 12 | 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 13 | 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 14 | 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 15 | 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 16 | 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 17 | 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 18 | 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 19 | 20 | colors = [ 21 | [56, 0, 255], 22 | [226, 255, 0], 23 | [0, 94, 255], 24 | [0, 37, 255], 25 | [0, 255, 94], 26 | [255, 226, 0], 27 | [0, 18, 255], 28 | [255, 151, 0], 29 | [170, 0, 255], 30 | [0, 255, 56], 31 | [255, 0, 75], 32 | [0, 75, 255], 33 | [0, 255, 169], 34 | [255, 0, 207], 35 | [75, 255, 0], 36 | [207, 0, 255], 37 | [37, 0, 255], 38 | [0, 207, 255], 39 | [94, 0, 255], 40 | [0, 255, 113], 41 | [255, 18, 0], 42 | [255, 0, 56], 43 | [18, 0, 255], 44 | [0, 255, 226], 45 | [170, 255, 0], 46 | [255, 0, 245], 47 | [151, 255, 0], 48 | [132, 255, 0], 49 | [75, 0, 255], 50 | [151, 0, 255], 51 | [0, 151, 255], 52 | [132, 0, 255], 53 | [0, 255, 245], 54 | [255, 132, 0], 55 | [226, 0, 255], 56 | [255, 37, 0], 57 | [207, 255, 0], 58 | [0, 255, 207], 59 | [94, 255, 0], 60 | [0, 226, 255], 61 | [56, 255, 0], 62 | [255, 94, 0], 63 | [255, 113, 0], 64 | [0, 132, 255], 65 | [255, 0, 132], 66 | [255, 170, 0], 67 | [255, 0, 188], 68 | [113, 255, 0], 69 | [245, 0, 255], 70 | [113, 0, 255], 71 | [255, 188, 0], 72 | [0, 113, 255], 73 | [255, 0, 0], 74 | [0, 56, 255], 75 | [255, 0, 113], 76 | [0, 255, 188], 77 | [255, 0, 94], 78 | [255, 0, 18], 79 | [18, 255, 0], 80 | [0, 255, 132], 81 | [0, 188, 255], 82 | [0, 245, 255], 83 | [0, 169, 255], 84 | [37, 255, 0], 85 | [255, 0, 151], 86 | [188, 0, 255], 87 | [0, 255, 37], 88 | [0, 255, 0], 89 | [255, 0, 170], 90 | [255, 0, 37], 91 | [255, 75, 0], 92 | [0, 0, 255], 93 | [255, 207, 0], 94 | [255, 0, 226], 95 | [255, 245, 0], 96 | [188, 255, 0], 97 | [0, 255, 18], 98 | [0, 255, 75], 99 | [0, 255, 151], 100 | [255, 56, 0], 101 | [245, 255, 0], 102 | ] 103 | 104 | class yolact(): 105 | def __init__(self, confThreshold=0.5, nmsThreshold=0.5, keep_top_k=200): 106 | self.target_size = 550 107 | self.MEANS = np.array([103.94, 116.78, 123.68], dtype=np.float32).reshape(1, 1, 3) 108 | self.STD = np.array([57.38, 57.12, 58.40], dtype=np.float32).reshape(1, 1, 3) 109 | self.net = cv2.dnn.readNet('yolact_base_54_800000.onnx') 110 | self.confidence_threshold = confThreshold 111 | self.nms_threshold = nmsThreshold 112 | self.keep_top_k = keep_top_k 113 | self.conv_ws = [69, 35, 18, 9, 5] 114 | self.conv_hs = [69, 35, 18, 9, 5] 115 | self.aspect_ratios = [1, 0.5, 2] 116 | self.scales = [24, 48, 96, 192, 384] 117 | self.variances = [0.1, 0.2] 118 | self.last_img_size = None 119 | self.priors = self.make_priors() 120 | 121 | def make_priors(self): 122 | """ Note that priors are [x,y,width,height] where (x,y) is the center of the box. 
""" 123 | if self.last_img_size != (self.target_size, self.target_size): 124 | prior_data = [] 125 | 126 | for conv_w, conv_h, scale in zip(self.conv_ws, self.conv_hs, self.scales): 127 | for i in range(conv_h): 128 | for j in range(conv_w): 129 | # +0.5 because priors are in center-size notation 130 | cx = (j + 0.5) / conv_w 131 | cy = (i + 0.5) / conv_h 132 | 133 | for ar in self.aspect_ratios: 134 | ar = np.sqrt(ar) 135 | 136 | w = scale * ar / self.target_size 137 | h = scale / ar / self.target_size 138 | 139 | # This is for backward compatability with a bug where I made everything square by accident 140 | h = w 141 | 142 | prior_data += [cx, cy, w, h] 143 | 144 | self.priors = np.array(prior_data).reshape(-1, 4) 145 | self.last_img_size = (self.target_size, self.target_size) 146 | return self.priors 147 | 148 | def decode(self, loc, priors, img_w, img_h): 149 | boxes = np.concatenate( 150 | ( 151 | priors[:, :2] + loc[:, :2] * self.variances[0] * priors[:, 2:], 152 | priors[:, 2:] * np.exp(loc[:, 2:] * self.variances[1]), 153 | ), 154 | 1, 155 | ) 156 | boxes[:, :2] -= boxes[:, 2:] / 2 157 | # boxes[:, 2:] += boxes[:, :2] 158 | 159 | # crop 160 | np.where(boxes[:, 0] < 0, 0, boxes[:, 0]) 161 | np.where(boxes[:, 1] < 0, 0, boxes[:, 1]) 162 | np.where(boxes[:, 2] > 1, 1, boxes[:, 2]) 163 | np.where(boxes[:, 3] > 1, 1, boxes[:, 3]) 164 | 165 | # decode to img size 166 | boxes[:, 0] *= img_w 167 | boxes[:, 1] *= img_h 168 | boxes[:, 2] = boxes[:, 2] * img_w + 1 169 | boxes[:, 3] = boxes[:, 3] * img_h + 1 170 | return boxes 171 | 172 | def detect(self, srcimg): 173 | img_h, img_w = srcimg.shape[:2] 174 | img = cv2.resize(srcimg, (self.target_size, self.target_size), interpolation=cv2.INTER_LINEAR).astype(np.float32) 175 | img = (img - self.MEANS) / self.STD 176 | 177 | blob = cv2.dnn.blobFromImage(img, swapRB=True) 178 | # Sets the input to the network 179 | self.net.setInput(blob) 180 | # Runs the forward pass to get output of the output layers 181 | loc_data, conf_preds, mask_data, proto_data = self.net.forward(self.net.getUnconnectedOutLayersNames()) 182 | 183 | cur_scores = conf_preds[:, 1:] 184 | num_class = cur_scores.shape[1] 185 | classid = np.argmax(cur_scores, axis=1) 186 | # conf_scores = np.max(cur_scores, axis=1) 187 | conf_scores = cur_scores[range(cur_scores.shape[0]), classid] 188 | 189 | # filte by confidence_threshold 190 | keep = conf_scores > self.confidence_threshold 191 | conf_scores = conf_scores[keep] 192 | classid = classid[keep] 193 | loc_data = loc_data[keep, :] 194 | prior_data = self.priors[keep, :] 195 | masks = mask_data[keep, :] 196 | boxes = self.decode(loc_data, prior_data, img_w, img_h) 197 | indices = cv2.dnn.NMSBoxes(boxes.tolist(), conf_scores.tolist(), self.confidence_threshold, self.nms_threshold , top_k=self.keep_top_k) 198 | for i in indices: 199 | idx = i[0] 200 | left, top, width, height = boxes[idx, :].astype(np.int32).tolist() 201 | cv2.rectangle(srcimg, (left, top), (left+width, top+height), (0, 0, 255), thickness=1) 202 | cv2.putText(srcimg, COCO_CLASSES[classid[idx]+1]+':'+str(round(conf_scores[idx], 2)), (left, top-5), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), thickness=2) 203 | 204 | # generate mask 205 | mask = proto_data @ masks[idx, :].reshape(-1,1) 206 | mask = 1 / (1 + np.exp(-mask)) ###sigmoid 207 | 208 | # Scale masks up to the full image 209 | mask = cv2.resize(mask.squeeze(), (img_w, img_h), interpolation=cv2.INTER_LINEAR) 210 | mask = mask > 0.5 211 | srcimg[mask] = srcimg[mask] * 0.5 + np.array(colors[classid[idx]+1]) * 0.5 212 
| return srcimg 213 | 214 | if __name__=='__main__': 215 | parser = argparse.ArgumentParser(description='YOLACT COCO Evaluation') 216 | parser.add_argument('--imgpath', default='000000046804.jpg', type=str, help='A path to an image to use for display.') 217 | parser.add_argument('--confThreshold', default=0.5, type=float, help='class confidence') 218 | parser.add_argument('--nmsThreshold', default=0.5, type=float, help='nms iou thresh') 219 | args = parser.parse_args() 220 | 221 | myyolact = yolact() 222 | srcimg = cv2.imread(args.imgpath) 223 | srcimg = myyolact.detect(srcimg) 224 | 225 | cv2.namedWindow('yolact', cv2.WINDOW_NORMAL) 226 | cv2.imshow('yolact', srcimg) 227 | cv2.waitKey(0) 228 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace cv; 9 | using namespace dnn; 10 | using namespace std; 11 | 12 | extern const char* class_names[] = { "background", 13 | "person", "bicycle", "car", "motorcycle", "airplane", "bus", 14 | "train", "truck", "boat", "traffic light", "fire hydrant", 15 | "stop sign", "parking meter", "bench", "bird", "cat", "dog", 16 | "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", 17 | "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", 18 | "skis", "snowboard", "sports ball", "kite", "baseball bat", 19 | "baseball glove", "skateboard", "surfboard", "tennis racket", 20 | "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", 21 | "banana", "apple", "sandwich", "orange", "broccoli", "carrot", 22 | "hot dog", "pizza", "donut", "cake", "chair", "couch", 23 | "potted plant", "bed", "dining table", "toilet", "tv", "laptop", 24 | "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", 25 | "toaster", "sink", "refrigerator", "book", "clock", "vase", 26 | "scissors", "teddy bear", "hair drier", "toothbrush" 27 | }; 28 | 29 | extern const unsigned char colors[81][3] = { 30 | {56, 0, 255}, 31 | {226, 255, 0}, 32 | {0, 94, 255}, 33 | {0, 37, 255}, 34 | {0, 255, 94}, 35 | {255, 226, 0}, 36 | {0, 18, 255}, 37 | {255, 151, 0}, 38 | {170, 0, 255}, 39 | {0, 255, 56}, 40 | {255, 0, 75}, 41 | {0, 75, 255}, 42 | {0, 255, 169}, 43 | {255, 0, 207}, 44 | {75, 255, 0}, 45 | {207, 0, 255}, 46 | {37, 0, 255}, 47 | {0, 207, 255}, 48 | {94, 0, 255}, 49 | {0, 255, 113}, 50 | {255, 18, 0}, 51 | {255, 0, 56}, 52 | {18, 0, 255}, 53 | {0, 255, 226}, 54 | {170, 255, 0}, 55 | {255, 0, 245}, 56 | {151, 255, 0}, 57 | {132, 255, 0}, 58 | {75, 0, 255}, 59 | {151, 0, 255}, 60 | {0, 151, 255}, 61 | {132, 0, 255}, 62 | {0, 255, 245}, 63 | {255, 132, 0}, 64 | {226, 0, 255}, 65 | {255, 37, 0}, 66 | {207, 255, 0}, 67 | {0, 255, 207}, 68 | {94, 255, 0}, 69 | {0, 226, 255}, 70 | {56, 255, 0}, 71 | {255, 94, 0}, 72 | {255, 113, 0}, 73 | {0, 132, 255}, 74 | {255, 0, 132}, 75 | {255, 170, 0}, 76 | {255, 0, 188}, 77 | {113, 255, 0}, 78 | {245, 0, 255}, 79 | {113, 0, 255}, 80 | {255, 188, 0}, 81 | {0, 113, 255}, 82 | {255, 0, 0}, 83 | {0, 56, 255}, 84 | {255, 0, 113}, 85 | {0, 255, 188}, 86 | {255, 0, 94}, 87 | {255, 0, 18}, 88 | {18, 255, 0}, 89 | {0, 255, 132}, 90 | {0, 188, 255}, 91 | {0, 245, 255}, 92 | {0, 169, 255}, 93 | {37, 255, 0}, 94 | {255, 0, 151}, 95 | {188, 0, 255}, 96 | {0, 255, 37}, 97 | {0, 255, 0}, 98 | {255, 0, 170}, 99 | {255, 0, 37}, 100 | {255, 75, 0}, 101 | {0, 0, 255}, 102 | 
{255, 207, 0}, 103 | {255, 0, 226}, 104 | {255, 245, 0}, 105 | {188, 255, 0}, 106 | {0, 255, 18}, 107 | {0, 255, 75}, 108 | {0, 255, 151}, 109 | {255, 56, 0}, 110 | {245, 255, 0} 111 | }; 112 | 113 | class yolact 114 | { 115 | public: 116 | yolact(float confThreshold, float nmsThreshold, const int keep_top_k = 200); 117 | void detect(Mat& srcimg); 118 | private: 119 | const int target_size = 550; 120 | const float MEANS[3] = { 123.68, 116.78, 103.94 }; 121 | const float STD[3] = { 58.40, 57.12, 57.38 }; 122 | float confidence_threshold; 123 | float nms_threshold; 124 | int keep_top_k; 125 | const int conv_ws[5] = { 69, 35, 18, 9, 5 }; 126 | const int conv_hs[5] = { 69, 35, 18, 9, 5 }; 127 | const float aspect_ratios[3] = { 1.f, 0.5f, 2.f }; 128 | const float scales[5] = { 24.f, 48.f, 96.f, 192.f, 384.f }; 129 | const float var[4] = { 0.1f, 0.1f, 0.2f, 0.2f }; 130 | const int mask_h = 138; 131 | const int mask_w = 138; 132 | int num_priors; 133 | float* priorbox; 134 | Net net; 135 | void normalize(Mat& img); 136 | void sigmoid(Mat& out, int length); 137 | }; 138 | 139 | yolact::yolact(float confThreshold, float nmsThreshold, const int keep_top_k) 140 | { 141 | this->confidence_threshold = confThreshold; 142 | this->nms_threshold = nmsThreshold; 143 | this->keep_top_k = keep_top_k; 144 | this->net = readNet("yolact_base_54_800000.onnx"); 145 | this->num_priors = 0; 146 | int p = 0; 147 | for (p = 0; p < 5; p++) 148 | { 149 | this->num_priors += this->conv_ws[p] * this->conv_hs[p] * 3; 150 | } 151 | this->priorbox = new float[4 * this->num_priors]; 152 | ////generate priorbox 153 | float* pb = priorbox; 154 | for (p = 0; p < 5; p++) 155 | { 156 | int conv_w = this->conv_ws[p]; 157 | int conv_h = this->conv_hs[p]; 158 | 159 | float scale = this->scales[p]; 160 | 161 | for (int i = 0; i < conv_h; i++) 162 | { 163 | for (int j = 0; j < conv_w; j++) 164 | { 165 | // +0.5 because priors are in center-size notation 166 | float cx = (j + 0.5f) / conv_w; 167 | float cy = (i + 0.5f) / conv_h; 168 | 169 | for (int k = 0; k < 3; k++) 170 | { 171 | float ar = aspect_ratios[k]; 172 | 173 | ar = sqrt(ar); 174 | 175 | float w = scale * ar / this->target_size; 176 | float h = scale / ar / this->target_size; 177 | 178 | // This is for backward compatability with a bug where I made everything square by accident 179 | // cfg.backbone.use_square_anchors: 180 | h = w; 181 | pb[0] = cx; 182 | pb[1] = cy; 183 | pb[2] = w; 184 | pb[3] = h; 185 | pb += 4; 186 | } 187 | } 188 | } 189 | } 190 | } 191 | 192 | void yolact::normalize(Mat& img) 193 | { 194 | img.convertTo(img, CV_32F); 195 | int i = 0, j = 0; 196 | for (i = 0; i < img.rows; i++) 197 | { 198 | float* pdata = (float*)(img.data + i * img.step); 199 | for (j = 0; j < img.cols; j++) 200 | { 201 | pdata[0] = (pdata[0] - this->MEANS[0]) / this->STD[0]; 202 | pdata[1] = (pdata[1] - this->MEANS[1]) / this->STD[1]; 203 | pdata[2] = (pdata[2] - this->MEANS[2]) / this->STD[2]; 204 | pdata += 3; 205 | } 206 | } 207 | } 208 | 209 | void yolact::sigmoid(Mat& out, int length) 210 | { 211 | float* pdata = (float*)(out.data); 212 | int i = 0; 213 | for (i = 0; i < length; i++) 214 | { 215 | pdata[i] = 1.0 / (1 + expf(-pdata[i])); 216 | } 217 | } 218 | 219 | void yolact::detect(Mat& srcimg) 220 | { 221 | int img_w = srcimg.cols; 222 | int img_h = srcimg.rows; 223 | Mat img; 224 | resize(srcimg, img, Size(this->target_size, this->target_size), INTER_LINEAR); 225 | cvtColor(img, img, COLOR_BGR2RGB); 226 | this->normalize(img); 227 | Mat blob = blobFromImage(img); 228 | 
this->net.setInput(blob); 229 | vector outs; 230 | this->net.forward(outs, this->net.getUnconnectedOutLayersNames()); 231 | 232 | /////generate proposals 233 | vector classIds; 234 | vector confidences; 235 | vector boxes; 236 | vector maskIds; 237 | const int num_class = outs[1].cols; 238 | for (int i = 0; i < this->num_priors; i++) 239 | { 240 | Mat scores = outs[1].row(i).colRange(1, num_class); 241 | Point classIdPoint; 242 | double score; 243 | // Get the value and location of the maximum score 244 | minMaxLoc(scores, 0, &score, 0, &classIdPoint); 245 | if (score > this->confidence_threshold) 246 | { 247 | const float* loc = (float*)outs[0].data + i * 4; 248 | const float* pb = this->priorbox + i * 4; 249 | float pb_cx = pb[0]; 250 | float pb_cy = pb[1]; 251 | float pb_w = pb[2]; 252 | float pb_h = pb[3]; 253 | 254 | float bbox_cx = var[0] * loc[0] * pb_w + pb_cx; 255 | float bbox_cy = var[1] * loc[1] * pb_h + pb_cy; 256 | float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w); 257 | float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h); 258 | float obj_x1 = bbox_cx - bbox_w * 0.5f; 259 | float obj_y1 = bbox_cy - bbox_h * 0.5f; 260 | float obj_x2 = bbox_cx + bbox_w * 0.5f; 261 | float obj_y2 = bbox_cy + bbox_h * 0.5f; 262 | 263 | // clip 264 | obj_x1 = max(min(obj_x1 * img_w, (float)(img_w - 1)), 0.f); 265 | obj_y1 = max(min(obj_y1 * img_h, (float)(img_h - 1)), 0.f); 266 | obj_x2 = max(min(obj_x2 * img_w, (float)(img_w - 1)), 0.f); 267 | obj_y2 = max(min(obj_y2 * img_h, (float)(img_h - 1)), 0.f); 268 | classIds.push_back(classIdPoint.x); 269 | confidences.push_back(score); 270 | boxes.push_back(Rect((int)obj_x1, (int)obj_y1, (int)(obj_x2 - obj_x1 + 1), (int)(obj_y2 - obj_y1 + 1))); 271 | maskIds.push_back(i); 272 | } 273 | } 274 | 275 | // Perform non maximum suppression to eliminate redundant overlapping boxes with 276 | // lower confidences 277 | vector indices; 278 | NMSBoxes(boxes, confidences, this->confidence_threshold, this->nms_threshold, indices, 1.f, this->keep_top_k); 279 | for (size_t i = 0; i < indices.size(); ++i) 280 | { 281 | int idx = indices[i]; 282 | Rect box = boxes[idx]; 283 | int xmax = box.x + box.width; 284 | int ymax = box.y + box.height; 285 | rectangle(srcimg, Point(box.x, box.y), Point(xmax, ymax), Scalar(0, 0, 255), 3); 286 | //Get the label for the class name and its confidence 287 | char text[256]; 288 | sprintf(text, "%s: %.2f", class_names[classIds[idx] + 1], confidences[idx]); 289 | 290 | 291 | //Display the label at the top of the bounding box 292 | int baseLine; 293 | Size labelSize = getTextSize(text, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); 294 | int ymin = max(box.y, labelSize.height); 295 | //rectangle(frame, Point(left, top - int(1.5 * labelSize.height)), Point(left + int(1.5 * labelSize.width), top + baseLine), Scalar(0, 255, 0), FILLED); 296 | putText(srcimg, text, Point(box.x, ymin), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1); 297 | 298 | Mat mask(this->mask_h, this->mask_w, CV_32FC1); 299 | mask = cv::Scalar(0.f); 300 | int channel = outs[2].cols; 301 | int area = this->mask_h * this->mask_w; 302 | float* coeff = (float*)outs[2].data + maskIds[idx] * channel; 303 | float* pm = (float*)mask.data; 304 | const float* pmaskmap = (float*)outs[3].data; 305 | for (int j = 0; j < area; j++) 306 | { 307 | for (int p = 0; p < channel; p++) 308 | { 309 | pm[j] += pmaskmap[p] * coeff[p]; 310 | } 311 | pmaskmap += channel; 312 | } 313 | 314 | this->sigmoid(mask, area); 315 | Mat mask2; 316 | resize(mask, mask2, Size(img_w, img_h)); 317 | // draw mask 
318 | for (int y = 0; y < img_h; y++) 319 | { 320 | const float* pmask = (float*)mask2.data + y * img_w; 321 | uchar* p = srcimg.data + y * img_w * 3; 322 | for (int x = 0; x < img_w; x++) 323 | { 324 | if (pmask[x] > 0.5) 325 | { 326 | p[0] = (uchar)(p[0] * 0.5 + colors[classIds[idx] + 1][0] * 0.5); 327 | p[1] = (uchar)(p[1] * 0.5 + colors[classIds[idx] + 1][1] * 0.5); 328 | p[2] = (uchar)(p[2] * 0.5 + colors[classIds[idx] + 1][2] * 0.5); 329 | } 330 | p += 3; 331 | } 332 | } 333 | } 334 | } 335 | 336 | int main() 337 | { 338 | yolact yolactnet(0.5, 0.5); 339 | 340 | string imgpath = "000000046804.jpg"; 341 | Mat srcimg = imread(imgpath); 342 | yolactnet.detect(srcimg); 343 | 344 | static const string kWinName = "Deep learning object detection in OpenCV"; 345 | namedWindow(kWinName, WINDOW_NORMAL); 346 | imshow(kWinName, srcimg); 347 | waitKey(0); 348 | destroyAllWindows(); 349 | } -------------------------------------------------------------------------------- /convert-onnx/backbone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import pickle 4 | from collections import OrderedDict 5 | 6 | class Bottleneck(nn.Module): 7 | """ Adapted from torchvision.models.resnet """ 8 | expansion = 4 9 | 10 | def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d, dilation=1, 11 | use_dcn=False): 12 | super(Bottleneck, self).__init__() 13 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, dilation=dilation) 14 | self.bn1 = norm_layer(planes) 15 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 16 | padding=dilation, bias=False, dilation=dilation) 17 | self.bn2 = norm_layer(planes) 18 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False, dilation=dilation) 19 | self.bn3 = norm_layer(planes * 4) 20 | self.relu = nn.ReLU(inplace=True) 21 | self.downsample = downsample 22 | self.stride = stride 23 | 24 | def forward(self, x): 25 | residual = x 26 | 27 | out = self.conv1(x) 28 | out = self.bn1(out) 29 | out = self.relu(out) 30 | 31 | out = self.conv2(out) 32 | out = self.bn2(out) 33 | out = self.relu(out) 34 | 35 | out = self.conv3(out) 36 | out = self.bn3(out) 37 | 38 | if self.downsample is not None: 39 | residual = self.downsample(x) 40 | 41 | out += residual 42 | out = self.relu(out) 43 | 44 | return out 45 | 46 | 47 | class ResNetBackbone(nn.Module): 48 | """ Adapted from torchvision.models.resnet """ 49 | 50 | def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layers=[], block=Bottleneck, 51 | norm_layer=nn.BatchNorm2d): 52 | super().__init__() 53 | 54 | # These will be populated by _make_layer 55 | self.num_base_layers = len(layers) 56 | self.layers = nn.ModuleList() 57 | self.channels = [] 58 | self.norm_layer = norm_layer 59 | self.dilation = 1 60 | self.atrous_layers = atrous_layers 61 | 62 | # From torchvision.models.resnet.Resnet 63 | self.inplanes = 64 64 | 65 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 66 | self.bn1 = norm_layer(64) 67 | self.relu = nn.ReLU(inplace=True) 68 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 69 | 70 | self._make_layer(block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval) 71 | self._make_layer(block, 128, layers[1], stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval) 72 | self._make_layer(block, 256, layers[2], stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval) 73 | 
self._make_layer(block, 512, layers[3], stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval) 74 | 75 | # This contains every module that should be initialized by loading in pretrained weights. 76 | # Any extra layers added onto this that won't be initialized by init_backbone will not be 77 | # in this list. That way, Yolact::init_weights knows which backbone weights to initialize 78 | # with xavier, and which ones to leave alone. 79 | self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] 80 | 81 | def _make_layer(self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1): 82 | """ Here one layer means a string of n Bottleneck blocks. """ 83 | downsample = None 84 | 85 | # This is actually just to create the connection between layers, and not necessarily to 86 | # downsample. Even if the second condition is met, it only downsamples when stride != 1 87 | if stride != 1 or self.inplanes != planes * block.expansion: 88 | if len(self.layers) in self.atrous_layers: 89 | self.dilation += 1 90 | stride = 1 91 | 92 | downsample = nn.Sequential( 93 | nn.Conv2d(self.inplanes, planes * block.expansion, 94 | kernel_size=1, stride=stride, bias=False, 95 | dilation=self.dilation), 96 | self.norm_layer(planes * block.expansion), 97 | ) 98 | 99 | layers = [] 100 | use_dcn = (dcn_layers >= blocks) 101 | layers.append(block(self.inplanes, planes, stride, downsample, self.norm_layer, self.dilation, use_dcn=use_dcn)) 102 | self.inplanes = planes * block.expansion 103 | for i in range(1, blocks): 104 | use_dcn = ((i + dcn_layers) >= blocks) and (i % dcn_interval == 0) 105 | layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn)) 106 | layer = nn.Sequential(*layers) 107 | 108 | self.channels.append(planes * block.expansion) 109 | self.layers.append(layer) 110 | 111 | return layer 112 | 113 | def forward(self, x): 114 | """ Returns a list of convouts for each layer. """ 115 | 116 | x = self.conv1(x) 117 | x = self.bn1(x) 118 | x = self.relu(x) 119 | x = self.maxpool(x) 120 | 121 | outs = [] 122 | for layer in self.layers: 123 | x = layer(x) 124 | outs.append(x) 125 | 126 | return tuple(outs) 127 | 128 | def init_backbone(self, path): 129 | """ Initializes the backbone weights for training. """ 130 | state_dict = torch.load(path) 131 | 132 | # Replace layer1 -> layers.0 etc. 133 | keys = list(state_dict) 134 | for key in keys: 135 | if key.startswith('layer'): 136 | idx = int(key[5]) 137 | new_key = 'layers.' + str(idx - 1) + key[6:] 138 | state_dict[new_key] = state_dict.pop(key) 139 | 140 | # Note: Using strict=False is berry scary. Triple check this. 141 | self.load_state_dict(state_dict, strict=False) 142 | 143 | def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck): 144 | """ Add a downsample layer to the backbone as per what SSD does. """ 145 | self._make_layer(block, conv_channels // block.expansion, blocks=depth, stride=downsample) 146 | 147 | 148 | class ResNetBackboneGN(ResNetBackbone): 149 | 150 | def __init__(self, layers, num_groups=32): 151 | super().__init__(layers, norm_layer=lambda x: nn.GroupNorm(num_groups, x)) 152 | 153 | def init_backbone(self, path): 154 | """ The path here comes from detectron. So we load it differently. 
""" 155 | with open(path, 'rb') as f: 156 | state_dict = pickle.load(f, encoding='latin1') # From the detectron source 157 | state_dict = state_dict['blobs'] 158 | 159 | our_state_dict_keys = list(self.state_dict().keys()) 160 | new_state_dict = {} 161 | 162 | gn_trans = lambda x: ('gn_s' if x == 'weight' else 'gn_b') 163 | layeridx2res = lambda x: 'res' + str(int(x) + 2) 164 | block2branch = lambda x: 'branch2' + ('a', 'b', 'c')[int(x[-1:]) - 1] 165 | 166 | # Transcribe each Detectron weights name to a Yolact weights name 167 | for key in our_state_dict_keys: 168 | parts = key.split('.') 169 | transcribed_key = '' 170 | 171 | if (parts[0] == 'conv1'): 172 | transcribed_key = 'conv1_w' 173 | elif (parts[0] == 'bn1'): 174 | transcribed_key = 'conv1_' + gn_trans(parts[1]) 175 | elif (parts[0] == 'layers'): 176 | if int(parts[1]) >= self.num_base_layers: continue 177 | 178 | transcribed_key = layeridx2res(parts[1]) 179 | transcribed_key += '_' + parts[2] + '_' 180 | 181 | if parts[3] == 'downsample': 182 | transcribed_key += 'branch1_' 183 | 184 | if parts[4] == '0': 185 | transcribed_key += 'w' 186 | else: 187 | transcribed_key += gn_trans(parts[5]) 188 | else: 189 | transcribed_key += block2branch(parts[3]) + '_' 190 | 191 | if 'conv' in parts[3]: 192 | transcribed_key += 'w' 193 | else: 194 | transcribed_key += gn_trans(parts[4]) 195 | 196 | new_state_dict[key] = torch.Tensor(state_dict[transcribed_key]) 197 | 198 | # strict=False because we may have extra unitialized layers at this point 199 | self.load_state_dict(new_state_dict, strict=False) 200 | 201 | 202 | def darknetconvlayer(in_channels, out_channels, *args, **kwdargs): 203 | """ 204 | Implements a conv, activation, then batch norm. 205 | Arguments are passed into the conv layer. 206 | """ 207 | return nn.Sequential( 208 | nn.Conv2d(in_channels, out_channels, *args, **kwdargs, bias=False), 209 | nn.BatchNorm2d(out_channels), 210 | # Darknet uses 0.1 here. 211 | # See https://github.com/pjreddie/darknet/blob/680d3bde1924c8ee2d1c1dea54d3e56a05ca9a26/src/activations.h#L39 212 | nn.LeakyReLU(0.1, inplace=True) 213 | ) 214 | 215 | 216 | class DarkNetBlock(nn.Module): 217 | """ Note: channels is the lesser of the two. The output will be expansion * channels. """ 218 | 219 | expansion = 2 220 | 221 | def __init__(self, in_channels, channels): 222 | super().__init__() 223 | 224 | self.conv1 = darknetconvlayer(in_channels, channels, kernel_size=1) 225 | self.conv2 = darknetconvlayer(channels, channels * self.expansion, kernel_size=3, padding=1) 226 | 227 | def forward(self, x): 228 | return self.conv2(self.conv1(x)) + x 229 | 230 | 231 | class DarkNetBackbone(nn.Module): 232 | """ 233 | An implementation of YOLOv3's Darnet53 in 234 | https://pjreddie.com/media/files/papers/YOLOv3.pdf 235 | 236 | This is based off of the implementation of Resnet above. 
237 | """ 238 | 239 | def __init__(self, layers=[1, 2, 8, 8, 4], block=DarkNetBlock): 240 | super().__init__() 241 | 242 | # These will be populated by _make_layer 243 | self.num_base_layers = len(layers) 244 | self.layers = nn.ModuleList() 245 | self.channels = [] 246 | 247 | self._preconv = darknetconvlayer(3, 32, kernel_size=3, padding=1) 248 | self.in_channels = 32 249 | 250 | self._make_layer(block, 32, layers[0]) 251 | self._make_layer(block, 64, layers[1]) 252 | self._make_layer(block, 128, layers[2]) 253 | self._make_layer(block, 256, layers[3]) 254 | self._make_layer(block, 512, layers[4]) 255 | 256 | # This contains every module that should be initialized by loading in pretrained weights. 257 | # Any extra layers added onto this that won't be initialized by init_backbone will not be 258 | # in this list. That way, Yolact::init_weights knows which backbone weights to initialize 259 | # with xavier, and which ones to leave alone. 260 | self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] 261 | 262 | def _make_layer(self, block, channels, num_blocks, stride=2): 263 | """ Here one layer means a string of n blocks. """ 264 | layer_list = [] 265 | 266 | # The downsample layer 267 | layer_list.append( 268 | darknetconvlayer(self.in_channels, channels * block.expansion, 269 | kernel_size=3, padding=1, stride=stride)) 270 | 271 | # Each block inputs channels and outputs channels * expansion 272 | self.in_channels = channels * block.expansion 273 | layer_list += [block(self.in_channels, channels) for _ in range(num_blocks)] 274 | 275 | self.channels.append(self.in_channels) 276 | self.layers.append(nn.Sequential(*layer_list)) 277 | 278 | def forward(self, x): 279 | """ Returns a list of convouts for each layer. """ 280 | 281 | x = self._preconv(x) 282 | 283 | outs = [] 284 | for layer in self.layers: 285 | x = layer(x) 286 | outs.append(x) 287 | 288 | return tuple(outs) 289 | 290 | def add_layer(self, conv_channels=1024, stride=2, depth=1, block=DarkNetBlock): 291 | """ Add a downsample layer to the backbone as per what SSD does. """ 292 | self._make_layer(block, conv_channels // block.expansion, num_blocks=depth, stride=stride) 293 | 294 | def init_backbone(self, path): 295 | """ Initializes the backbone weights for training. """ 296 | # Note: Using strict=False is berry scary. Triple check this. 297 | self.load_state_dict(torch.load(path), strict=False) 298 | 299 | 300 | class VGGBackbone(nn.Module): 301 | """ 302 | Args: 303 | - cfg: A list of layers given as lists. Layers can be either 'M' signifying 304 | a max pooling layer, a number signifying that many feature maps in 305 | a conv layer, or a tuple of 'M' or a number and a kwdargs dict to pass 306 | into the function that creates the layer (e.g. nn.MaxPool2d for 'M'). 307 | - extra_args: A list of lists of arguments to pass into add_layer. 308 | - norm_layers: Layers indices that need to pass through an l2norm layer. 309 | """ 310 | 311 | def __init__(self, cfg, extra_args=[], norm_layers=[]): 312 | super().__init__() 313 | 314 | self.channels = [] 315 | self.layers = nn.ModuleList() 316 | self.in_channels = 3 317 | self.extra_args = list(reversed(extra_args)) # So I can use it as a stack 318 | 319 | # Keeps track of what the corresponding key will be in the state dict of the 320 | # pretrained model. For instance, layers.0.2 for us is 2 for the pretrained 321 | # model but layers.1.1 is 5. 
322 | self.total_layer_count = 0 323 | self.state_dict_lookup = {} 324 | 325 | for idx, layer_cfg in enumerate(cfg): 326 | self._make_layer(layer_cfg) 327 | 328 | self.norms = nn.ModuleList([nn.BatchNorm2d(self.channels[l]) for l in norm_layers]) 329 | self.norm_lookup = {l: idx for idx, l in enumerate(norm_layers)} 330 | 331 | # These modules will be initialized by init_backbone, 332 | # so don't overwrite their initialization later. 333 | self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)] 334 | 335 | def _make_layer(self, cfg): 336 | """ 337 | Each layer is a sequence of conv layers usually preceded by a max pooling. 338 | Adapted from torchvision.models.vgg.make_layers. 339 | """ 340 | 341 | layers = [] 342 | 343 | for v in cfg: 344 | # VGG in SSD requires some special layers, so allow layers to be tuples of 345 | # (, kwdargs dict) 346 | args = None 347 | if isinstance(v, tuple): 348 | args = v[1] 349 | v = v[0] 350 | 351 | # v should be either M or a number 352 | if v == 'M': 353 | # Set default arguments 354 | if args is None: 355 | args = {'kernel_size': 2, 'stride': 2} 356 | 357 | layers.append(nn.MaxPool2d(**args)) 358 | else: 359 | # See the comment in __init__ for an explanation of this 360 | cur_layer_idx = self.total_layer_count + len(layers) 361 | self.state_dict_lookup[cur_layer_idx] = '%d.%d' % (len(self.layers), len(layers)) 362 | 363 | # Set default arguments 364 | if args is None: 365 | args = {'kernel_size': 3, 'padding': 1} 366 | 367 | # Add the layers 368 | layers.append(nn.Conv2d(self.in_channels, v, **args)) 369 | layers.append(nn.ReLU(inplace=True)) 370 | self.in_channels = v 371 | 372 | self.total_layer_count += len(layers) 373 | self.channels.append(self.in_channels) 374 | self.layers.append(nn.Sequential(*layers)) 375 | 376 | def forward(self, x): 377 | """ Returns a list of convouts for each layer. """ 378 | outs = [] 379 | 380 | for idx, layer in enumerate(self.layers): 381 | x = layer(x) 382 | 383 | # Apply an l2norm module to the selected layers 384 | # Note that this differs from the original implemenetation 385 | if idx in self.norm_lookup: 386 | x = self.norms[self.norm_lookup[idx]](x) 387 | outs.append(x) 388 | 389 | return tuple(outs) 390 | 391 | def transform_key(self, k): 392 | """ Transform e.g. features.24.bias to layers.4.1.bias """ 393 | vals = k.split('.') 394 | layerIdx = self.state_dict_lookup[int(vals[0])] 395 | return 'layers.%s.%s' % (layerIdx, vals[1]) 396 | 397 | def init_backbone(self, path): 398 | """ Initializes the backbone weights for training. """ 399 | state_dict = torch.load(path) 400 | state_dict = OrderedDict([(self.transform_key(k), v) for k, v in state_dict.items()]) 401 | 402 | self.load_state_dict(state_dict, strict=False) 403 | 404 | def add_layer(self, conv_channels=128, downsample=2): 405 | """ Add a downsample layer to the backbone as per what SSD does. 
""" 406 | if len(self.extra_args) > 0: 407 | conv_channels, downsample = self.extra_args.pop() 408 | 409 | padding = 1 if downsample > 1 else 0 410 | 411 | layer = nn.Sequential( 412 | nn.Conv2d(self.in_channels, conv_channels, kernel_size=1), 413 | nn.ReLU(inplace=True), 414 | nn.Conv2d(conv_channels, conv_channels * 2, kernel_size=3, stride=downsample, padding=padding), 415 | nn.ReLU(inplace=True) 416 | ) 417 | 418 | self.in_channels = conv_channels * 2 419 | self.channels.append(self.in_channels) 420 | self.layers.append(layer) 421 | 422 | 423 | def construct_backbone(cfg): 424 | """ Constructs a backbone given a backbone config object (see config.py). """ 425 | backbone = cfg.type(*cfg.args) 426 | 427 | # Add downsampling layers until we reach the number we need 428 | num_layers = max(cfg.selected_layers) + 1 429 | 430 | while len(backbone.layers) < num_layers: 431 | backbone.add_layer() 432 | 433 | return backbone 434 | -------------------------------------------------------------------------------- /convert-onnx/yolact.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from itertools import product 6 | from math import sqrt 7 | from typing import List 8 | from collections import defaultdict 9 | from config import cfg, mask_type 10 | from backbone import construct_backbone 11 | 12 | # This is required for Pytorch 1.0.1 on Windows to initialize Cuda on some driver versions. 13 | # See the bug report here: https://github.com/pytorch/pytorch/issues/17108 14 | device = 'cpu' 15 | if torch.cuda.is_available(): 16 | torch.cuda.current_device() 17 | device = 'cuda' 18 | 19 | # As of March 10, 2019, Pytorch DataParallel still doesn't support JIT Script Modules 20 | # use_jit = torch.cuda.device_count() <= 1 21 | # if not use_jit: 22 | # print('Multiple GPUs detected! Turning off JIT.') 23 | # 24 | # ScriptModuleWrapper = torch.jit.ScriptModule if use_jit else nn.Module 25 | # script_method_wrapper = torch.jit.script_method if use_jit else lambda fn, _rcn=None: fn 26 | 27 | class Concat(nn.Module): 28 | def __init__(self, nets, extra_params): 29 | super().__init__() 30 | 31 | self.nets = nn.ModuleList(nets) 32 | self.extra_params = extra_params 33 | 34 | def forward(self, x): 35 | # Concat each along the channel dimension 36 | return torch.cat([net(x) for net in self.nets], dim=1, **self.extra_params) 37 | 38 | class InterpolateModule(nn.Module): 39 | """ 40 | This is a module version of F.interpolate (rip nn.Upsampling). 41 | Any arguments you give it just get passed along for the ride. 42 | """ 43 | 44 | def __init__(self, *args, **kwdargs): 45 | super().__init__() 46 | 47 | self.args = args 48 | self.kwdargs = kwdargs 49 | 50 | def forward(self, x): 51 | # return F.interpolate(x, *self.args, **self.kwdargs) 52 | return F.interpolate(x, size=(int(x.shape[2] * self.kwdargs['scale_factor']), int(x.shape[3] * self.kwdargs['scale_factor'])), mode=self.kwdargs['mode'], align_corners=self.kwdargs['align_corners']) 53 | 54 | def make_net(in_channels, conf, include_last_relu=True): 55 | def make_layer(layer_cfg): 56 | nonlocal in_channels 57 | 58 | # Possible patterns: 59 | # ( 256, 3, {}) -> conv 60 | # ( 256,-2, {}) -> deconv 61 | # (None,-2, {}) -> bilinear interpolate 62 | # ('cat',[],{}) -> concat the subnetworks in the list 63 | # 64 | # You know it would have probably been simpler just to adopt a 'c' 'd' 'u' naming scheme. 
65 | # Whatever, it's too late now. 66 | if isinstance(layer_cfg[0], str): 67 | layer_name = layer_cfg[0] 68 | 69 | if layer_name == 'cat': 70 | nets = [make_net(in_channels, x) for x in layer_cfg[1]] 71 | layer = Concat([net[0] for net in nets], layer_cfg[2]) 72 | num_channels = sum([net[1] for net in nets]) 73 | else: 74 | num_channels = layer_cfg[0] 75 | kernel_size = layer_cfg[1] 76 | 77 | if kernel_size > 0: 78 | layer = nn.Conv2d(in_channels, num_channels, kernel_size, **layer_cfg[2]) 79 | else: 80 | if num_channels is None: 81 | layer = InterpolateModule(scale_factor=-kernel_size, mode='bilinear', align_corners=False, **layer_cfg[2]) 82 | # layer = nn.Upsample(scale_factor=-kernel_size, mode='bilinear', align_corners=False) 83 | else: 84 | layer = nn.ConvTranspose2d(in_channels, num_channels, -kernel_size, **layer_cfg[2]) 85 | 86 | in_channels = num_channels if num_channels is not None else in_channels 87 | 88 | # Don't return a ReLU layer if we're doing an upsample. This probably doesn't affect anything 89 | # output-wise, but there's no need to go through a ReLU here. 90 | # Commented out for backwards compatibility with previous models 91 | # if num_channels is None: 92 | # return [layer] 93 | # else: 94 | return [layer, nn.ReLU(inplace=True)] 95 | 96 | # Use sum to concat together all the component layer lists 97 | net = sum([make_layer(x) for x in conf], []) 98 | if not include_last_relu: 99 | net = net[:-1] 100 | 101 | return nn.Sequential(*(net)), in_channels 102 | 103 | prior_cache = defaultdict(lambda: None) 104 | 105 | class PredictionModule(nn.Module): 106 | def __init__(self, in_channels, out_channels=1024, aspect_ratios=[[1]], scales=[1], parent=None, index=0): 107 | super().__init__() 108 | 109 | self.num_classes = cfg.num_classes 110 | self.mask_dim = cfg.mask_dim # Defined by Yolact 111 | self.num_priors = sum(len(x)*len(scales) for x in aspect_ratios) 112 | self.parent = [parent] # Don't include this in the state dict 113 | self.index = index 114 | self.num_heads = cfg.num_heads # Defined by Yolact 115 | 116 | if parent is None: 117 | self.upfeature, out_channels = make_net(in_channels, cfg.extra_head_net) 118 | 119 | self.bbox_layer = nn.Conv2d(out_channels, self.num_priors * 4, **cfg.head_layer_params) 120 | self.conf_layer = nn.Conv2d(out_channels, self.num_priors * self.num_classes, **cfg.head_layer_params) 121 | self.mask_layer = nn.Conv2d(out_channels, self.num_priors * self.mask_dim, **cfg.head_layer_params) 122 | 123 | # What is this ugly lambda doing in the middle of all this clean prediction module code? 124 | def make_extra(num_layers): 125 | if num_layers == 0: 126 | return lambda x: x 127 | else: 128 | # Looks more complicated than it is. 
This just creates an array of num_layers alternating conv-relu 129 | return nn.Sequential(*sum([[ 130 | nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), 131 | nn.ReLU(inplace=True) 132 | ] for _ in range(num_layers)], [])) 133 | 134 | self.bbox_extra, self.conf_extra, self.mask_extra = [make_extra(x) for x in cfg.extra_layers] 135 | 136 | self.aspect_ratios = aspect_ratios 137 | self.scales = scales 138 | 139 | self.priors = None 140 | self.last_conv_size = None 141 | self.last_img_size = None 142 | 143 | def forward(self, x): 144 | src = self if self.parent[0] is None else self.parent[0] 145 | x = src.upfeature(x) 146 | 147 | bbox_x = src.bbox_extra(x) 148 | conf_x = src.conf_extra(x) 149 | mask_x = src.mask_extra(x) 150 | 151 | bbox = src.bbox_layer(bbox_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, 4) 152 | conf = src.conf_layer(conf_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.num_classes) 153 | 154 | mask = src.mask_layer(mask_x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.mask_dim) 155 | 156 | mask = torch.tanh(mask) 157 | 158 | # conv_h = x.size(2) 159 | # conv_w = x.size(3) 160 | # priors = self.make_priors(conv_h, conv_w, x.device) 161 | # preds = { 'loc': bbox, 'conf': conf, 'mask': mask, 'priors': priors } 162 | # return preds 163 | return bbox, conf, mask 164 | 165 | def make_priors(self, conv_h, conv_w, device): 166 | """ Note that priors are [x,y,width,height] where (x,y) is the center of the box. """ 167 | global prior_cache 168 | size = (conv_h, conv_w) 169 | 170 | if self.last_img_size != (cfg._tmp_img_w, cfg._tmp_img_h): 171 | prior_data = [] 172 | 173 | # Iteration order is important (it has to sync up with the convout) 174 | for j, i in product(range(conv_h), range(conv_w)): 175 | # +0.5 because priors are in center-size notation 176 | x = (i + 0.5) / conv_w 177 | y = (j + 0.5) / conv_h 178 | 179 | for ars in self.aspect_ratios: 180 | for scale in self.scales: 181 | for ar in ars: 182 | if not cfg.backbone.preapply_sqrt: 183 | ar = sqrt(ar) 184 | 185 | if cfg.backbone.use_pixel_scales: 186 | w = scale * ar / cfg.max_size 187 | h = scale / ar / cfg.max_size 188 | else: 189 | w = scale * ar / conv_w 190 | h = scale / ar / conv_h 191 | 192 | # This is for backward compatability with a bug where I made everything square by accident 193 | if cfg.backbone.use_square_anchors: 194 | h = w 195 | 196 | prior_data += [x, y, w, h] 197 | 198 | self.priors = torch.Tensor(prior_data).view(-1, 4).detach().to(device) 199 | # self.priors = torch.Tensor(prior_data).view(-1, 4).detach() 200 | self.priors.requires_grad = False 201 | self.last_img_size = (cfg._tmp_img_w, cfg._tmp_img_h) 202 | self.last_conv_size = (conv_w, conv_h) 203 | prior_cache[size] = None 204 | elif self.priors.device != device: 205 | # This whole weird situation is so that DataParalell doesn't copy the priors each iteration 206 | if prior_cache[size] is None: 207 | prior_cache[size] = {} 208 | 209 | if device not in prior_cache[size]: 210 | prior_cache[size][device] = self.priors.to(device) 211 | 212 | self.priors = prior_cache[size][device] 213 | 214 | return self.priors 215 | 216 | class FPN(nn.Module): 217 | """ 218 | Implements a general version of the FPN introduced in 219 | https://arxiv.org/pdf/1612.03144.pdf 220 | 221 | Parameters (in cfg.fpn): 222 | - num_features (int): The number of output features in the fpn layers. 223 | - interpolation_mode (str): The mode to pass to F.interpolate. 
224 | - num_downsample (int): The number of downsampled layers to add onto the selected layers. 225 | These extra layers are downsampled from the last selected layer. 226 | 227 | Args: 228 | - in_channels (list): For each conv layer you supply in the forward pass, 229 | how many features will it have? 230 | """ 231 | __constants__ = ['interpolation_mode', 'num_downsample', 'use_conv_downsample', 'relu_pred_layers', 232 | 'lat_layers', 'pred_layers', 'downsample_layers', 'relu_downsample_layers'] 233 | 234 | def __init__(self, in_channels): 235 | super().__init__() 236 | 237 | self.lat_layers = nn.ModuleList([ 238 | nn.Conv2d(x, cfg.fpn.num_features, kernel_size=1) 239 | for x in reversed(in_channels) 240 | ]) 241 | 242 | # This is here for backwards compatability 243 | padding = 1 if cfg.fpn.pad else 0 244 | self.pred_layers = nn.ModuleList([ 245 | nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=padding) 246 | for _ in in_channels 247 | ]) 248 | 249 | if cfg.fpn.use_conv_downsample: 250 | self.downsample_layers = nn.ModuleList([ 251 | nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=1, stride=2) 252 | for _ in range(cfg.fpn.num_downsample) 253 | ]) 254 | 255 | self.interpolation_mode = cfg.fpn.interpolation_mode 256 | self.num_downsample = cfg.fpn.num_downsample 257 | self.use_conv_downsample = cfg.fpn.use_conv_downsample 258 | self.relu_downsample_layers = cfg.fpn.relu_downsample_layers 259 | self.relu_pred_layers = cfg.fpn.relu_pred_layers 260 | 261 | # @script_method_wrapper 262 | def forward(self, convouts:List[torch.Tensor]): 263 | """ 264 | Args: 265 | - convouts (list): A list of convouts for the corresponding layers in in_channels. 266 | Returns: 267 | - A list of FPN convouts in the same order as x with extra downsample layers if requested. 268 | """ 269 | 270 | out = [] 271 | x = torch.zeros(1, device=convouts[0].device) 272 | for i in range(len(convouts)): 273 | out.append(x) 274 | 275 | # For backward compatability, the conv layers are stored in reverse but the input and output is 276 | # given in the correct order. Thus, use j=-i-1 for the input and output and i for the conv layers. 277 | j = len(convouts) 278 | for lat_layer in self.lat_layers: 279 | j -= 1 280 | 281 | if j < len(convouts) - 1: 282 | x = F.interpolate(x, size=(int(convouts[j].shape[2]), int(convouts[j].shape[3])), mode=self.interpolation_mode, align_corners=False) 283 | 284 | x = x + lat_layer(convouts[j]) 285 | out[j] = x 286 | 287 | # This janky second loop is here because TorchScript. 288 | j = len(convouts) 289 | for pred_layer in self.pred_layers: 290 | j -= 1 291 | out[j] = pred_layer(out[j]) 292 | 293 | if self.relu_pred_layers: 294 | F.relu(out[j], inplace=True) 295 | 296 | cur_idx = len(out) 297 | 298 | # In the original paper, this takes care of P6 299 | if self.use_conv_downsample: 300 | for downsample_layer in self.downsample_layers: 301 | out.append(downsample_layer(out[-1])) 302 | else: 303 | for idx in range(self.num_downsample): 304 | # Note: this is an untested alternative to out.append(out[-1][:, :, ::2, ::2]). Thanks TorchScript. 
305 | out.append(nn.functional.max_pool2d(out[-1], 1, stride=2)) 306 | 307 | if self.relu_downsample_layers: 308 | for idx in range(len(out) - cur_idx): 309 | out[idx] = F.relu(out[idx + cur_idx], inplace=False) 310 | 311 | return out 312 | 313 | class FastMaskIoUNet(nn.Module): 314 | 315 | def __init__(self): 316 | super().__init__() 317 | input_channels = 1 318 | last_layer = [(cfg.num_classes-1, 1, {})] 319 | self.maskiou_net, _ = make_net(input_channels, cfg.maskiou_net + last_layer, include_last_relu=True) 320 | 321 | def forward(self, x): 322 | x = self.maskiou_net(x) 323 | maskiou_p = F.max_pool2d(x, kernel_size=x.size()[2:]).squeeze(-1).squeeze(-1) 324 | 325 | return maskiou_p 326 | 327 | 328 | 329 | class Yolact(nn.Module): 330 | """ 331 | 332 | 333 | ██╗ ██╗ ██████╗ ██╗ █████╗ ██████╗████████╗ 334 | ╚██╗ ██╔╝██╔═══██╗██║ ██╔══██╗██╔════╝╚══██╔══╝ 335 | ╚████╔╝ ██║ ██║██║ ███████║██║ ██║ 336 | ╚██╔╝ ██║ ██║██║ ██╔══██║██║ ██║ 337 | ██║ ╚██████╔╝███████╗██║ ██║╚██████╗ ██║ 338 | ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ 339 | 340 | 341 | You can set the arguments by changing them in the backbone config object in config.py. 342 | 343 | Parameters (in cfg.backbone): 344 | - selected_layers: The indices of the conv layers to use for prediction. 345 | - pred_scales: A list with len(selected_layers) containing tuples of scales (see PredictionModule) 346 | - pred_aspect_ratios: A list of lists of aspect ratios with len(selected_layers) (see PredictionModule) 347 | """ 348 | 349 | def __init__(self): 350 | super().__init__() 351 | 352 | self.backbone = construct_backbone(cfg.backbone) 353 | # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early! 354 | if cfg.mask_type == mask_type.direct: 355 | cfg.mask_dim = cfg.mask_size**2 356 | elif cfg.mask_type == mask_type.lincomb: 357 | if cfg.mask_proto_use_grid: 358 | self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file)) 359 | self.num_grids = self.grid.size(0) 360 | else: 361 | self.num_grids = 0 362 | 363 | self.proto_src = cfg.mask_proto_src 364 | 365 | if self.proto_src is None: in_channels = 3 366 | elif cfg.fpn is not None: in_channels = cfg.fpn.num_features 367 | else: in_channels = self.backbone.channels[self.proto_src] 368 | in_channels += self.num_grids 369 | 370 | # The include_last_relu=false here is because we might want to change it to another function 371 | self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False) 372 | 373 | if cfg.mask_proto_bias: 374 | cfg.mask_dim += 1 375 | 376 | 377 | self.selected_layers = cfg.backbone.selected_layers 378 | src_channels = self.backbone.channels 379 | 380 | if cfg.use_maskiou: 381 | self.maskiou_net = FastMaskIoUNet() 382 | 383 | # if cfg.fpn is not None: 384 | # # Some hacky rewiring to accomodate the FPN 385 | # self.fpn = FPN([src_channels[i] for i in self.selected_layers]) 386 | # self.selected_layers = list(range(len(self.selected_layers) + cfg.fpn.num_downsample)) 387 | # src_channels = [cfg.fpn.num_features] * len(self.selected_layers) 388 | self.fpn = FPN([src_channels[i] for i in self.selected_layers]) 389 | self.selected_layers = list(range(len(self.selected_layers) + cfg.fpn.num_downsample)) 390 | src_channels = [cfg.fpn.num_features] * len(self.selected_layers) 391 | 392 | self.prediction_layers = nn.ModuleList() 393 | cfg.num_heads = len(self.selected_layers) 394 | 395 | for idx, layer_idx in enumerate(self.selected_layers): 396 | # If we're sharing prediction module weights, have every 
module's parent be the first one 397 | parent = None 398 | if cfg.share_prediction_module and idx > 0: 399 | parent = self.prediction_layers[0] 400 | 401 | pred = PredictionModule(src_channels[layer_idx], src_channels[layer_idx], 402 | aspect_ratios = cfg.backbone.pred_aspect_ratios[idx], 403 | scales = cfg.backbone.pred_scales[idx], 404 | parent = parent, 405 | index = idx) 406 | self.prediction_layers.append(pred) 407 | 408 | # Extra parameters for the extra losses 409 | if cfg.use_class_existence_loss: 410 | # This comes from the smallest layer selected 411 | # Also note that cfg.num_classes includes background 412 | self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1) 413 | 414 | if cfg.use_semantic_segmentation_loss: 415 | self.semantic_seg_conv = nn.Conv2d(src_channels[0], cfg.num_classes-1, kernel_size=1) 416 | 417 | # # For use in evaluation 418 | # self.detect = Detect(cfg.num_classes, bkg_label=0, top_k=cfg.nms_top_k, 419 | # conf_thresh=cfg.nms_conf_thresh, nms_thresh=cfg.nms_thresh) 420 | 421 | def load_weights(self, path): 422 | """ Loads weights from a compressed save file. """ 423 | state_dict = torch.load(path, map_location=device) 424 | 425 | # For backward compatability, remove these (the new variable is called layers) 426 | for key in list(state_dict.keys()): 427 | if key.startswith('backbone.layer') and not key.startswith('backbone.layers'): 428 | del state_dict[key] 429 | 430 | # Also for backward compatibility with v1.0 weights, do this check 431 | if key.startswith('fpn.downsample_layers.'): 432 | if cfg.fpn is not None and int(key.split('.')[2]) >= cfg.fpn.num_downsample: 433 | del state_dict[key] 434 | self.load_state_dict(state_dict) 435 | 436 | def forward(self, x): 437 | """ The input should be of size [batch_size, 3, img_h, img_w] """ 438 | _, _, img_h, img_w = x.size() 439 | cfg._tmp_img_h = img_h 440 | cfg._tmp_img_w = img_w 441 | 442 | outs = self.backbone(x) 443 | 444 | outs = [outs[i] for i in cfg.backbone.selected_layers] 445 | outs = self.fpn(outs) 446 | 447 | proto_x = x if self.proto_src is None else outs[self.proto_src] 448 | 449 | if self.num_grids > 0: 450 | grids = self.grid.repeat(proto_x.size(0), 1, 1, 1) 451 | proto_x = torch.cat([proto_x, grids], dim=1) 452 | 453 | proto_out = self.proto_net(proto_x) 454 | proto_out = F.relu(proto_out) 455 | 456 | # Move the features last so the multiplication is easy 457 | proto_out = proto_out.permute(0, 2, 3, 1).contiguous() 458 | loc, conf, mask = [], [], [] 459 | for idx, pred_layer in zip(self.selected_layers, self.prediction_layers): 460 | pred_x = outs[idx] 461 | # A hack for the way dataparallel works 462 | if cfg.share_prediction_module and pred_layer is not self.prediction_layers[0]: 463 | pred_layer.parent = [self.prediction_layers[0]] 464 | 465 | p = pred_layer(pred_x) ###loc, conf, mask 466 | loc.append(p[0]) 467 | conf.append(p[1]) 468 | mask.append(p[2]) 469 | loc = torch.cat(loc, -2) 470 | conf = torch.cat(conf, -2) 471 | mask = torch.cat(mask, -2) 472 | conf = F.softmax(conf, -1) 473 | 474 | _, *s = loc.shape 475 | loc = loc.view(s) 476 | _, *s = conf.shape 477 | conf = conf.view(s) 478 | _, *s = mask.shape 479 | mask = mask.view(s) 480 | _, *s = proto_out.shape 481 | proto_out = proto_out.view(s) 482 | return loc, conf, mask, proto_out 483 | -------------------------------------------------------------------------------- /convert-onnx/config.py: -------------------------------------------------------------------------------- 1 | from backbone import ResNetBackbone, 
VGGBackbone, ResNetBackboneGN, DarkNetBackbone 2 | from math import sqrt 3 | import torch 4 | 5 | # for making bounding boxes pretty 6 | COLORS = ((244, 67, 54), 7 | (233, 30, 99), 8 | (156, 39, 176), 9 | (103, 58, 183), 10 | (63, 81, 181), 11 | (33, 150, 243), 12 | (3, 169, 244), 13 | (0, 188, 212), 14 | (0, 150, 136), 15 | (76, 175, 80), 16 | (139, 195, 74), 17 | (205, 220, 57), 18 | (255, 235, 59), 19 | (255, 193, 7), 20 | (255, 152, 0), 21 | (255, 87, 34), 22 | (121, 85, 72), 23 | (158, 158, 158), 24 | (96, 125, 139)) 25 | 26 | # These are in BGR and are for ImageNet 27 | MEANS = (103.94, 116.78, 123.68) 28 | STD = (57.38, 57.12, 58.40) 29 | 30 | COCO_CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 31 | 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 32 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 33 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 34 | 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 35 | 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 36 | 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 37 | 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 38 | 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 39 | 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 40 | 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 41 | 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 42 | 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 43 | 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 44 | 45 | COCO_LABEL_MAP = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 46 | 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 47 | 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 48 | 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, 49 | 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 50 | 46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, 51 | 54: 49, 55: 50, 56: 51, 57: 52, 58: 53, 59: 54, 60: 55, 61: 56, 52 | 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, 53 | 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, 81: 72, 54 | 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80} 55 | 56 | 57 | # ----------------------- CONFIG CLASS ----------------------- # 58 | 59 | class Config(object): 60 | """ 61 | Holds the configuration for anything you want it to. 62 | To get the currently active config, call get_cfg(). 63 | 64 | To use, just do cfg.x instead of cfg['x']. 65 | I made this because doing cfg['x'] all the time is dumb. 66 | """ 67 | 68 | def __init__(self, config_dict): 69 | for key, val in config_dict.items(): 70 | self.__setattr__(key, val) 71 | 72 | def copy(self, new_config_dict={}): 73 | """ 74 | Copies this config into a new config object, making 75 | the changes given by new_config_dict. 76 | """ 77 | 78 | ret = Config(vars(self)) 79 | 80 | for key, val in new_config_dict.items(): 81 | ret.__setattr__(key, val) 82 | 83 | return ret 84 | 85 | def replace(self, new_config_dict): 86 | """ 87 | Copies new_config_dict into this config object. 88 | Note: new_config_dict can also be a config object. 
89 | """ 90 | if isinstance(new_config_dict, Config): 91 | new_config_dict = vars(new_config_dict) 92 | 93 | for key, val in new_config_dict.items(): 94 | self.__setattr__(key, val) 95 | 96 | def print(self): 97 | for k, v in vars(self).items(): 98 | print(k, ' = ', v) 99 | 100 | 101 | # ----------------------- DATASETS ----------------------- # 102 | 103 | dataset_base = Config({ 104 | 'name': 'Base Dataset', 105 | 106 | # Training images and annotations 107 | 'train_images': './data/coco/images/', 108 | 'train_info': 'path_to_annotation_file', 109 | 110 | # Validation images and annotations. 111 | 'valid_images': './data/coco/images/', 112 | 'valid_info': 'path_to_annotation_file', 113 | 114 | # Whether or not to load GT. If this is False, eval.py quantitative evaluation won't work. 115 | 'has_gt': True, 116 | 117 | # A list of names for each of you classes. 118 | 'class_names': COCO_CLASSES, 119 | 120 | # COCO class ids aren't sequential, so this is a bandage fix. If your ids aren't sequential, 121 | # provide a map from category_id -> index in class_names + 1 (the +1 is there because it's 1-indexed). 122 | # If not specified, this just assumes category ids start at 1 and increase sequentially. 123 | 'label_map': None 124 | }) 125 | 126 | coco2014_dataset = dataset_base.copy({ 127 | 'name': 'COCO 2014', 128 | 129 | 'train_info': './data/coco/annotations/instances_train2014.json', 130 | 'valid_info': './data/coco/annotations/instances_val2014.json', 131 | 132 | 'label_map': COCO_LABEL_MAP 133 | }) 134 | 135 | coco2017_dataset = dataset_base.copy({ 136 | 'name': 'COCO 2017', 137 | 138 | 'train_info': './data/coco/annotations/instances_train2017.json', 139 | 'valid_info': './data/coco/annotations/instances_val2017.json', 140 | 141 | 'label_map': COCO_LABEL_MAP 142 | }) 143 | 144 | coco2017_testdev_dataset = dataset_base.copy({ 145 | 'name': 'COCO 2017 Test-Dev', 146 | 147 | 'valid_info': './data/coco/annotations/image_info_test-dev2017.json', 148 | 'has_gt': False, 149 | 150 | 'label_map': COCO_LABEL_MAP 151 | }) 152 | 153 | PASCAL_CLASSES = ("aeroplane", "bicycle", "bird", "boat", "bottle", 154 | "bus", "car", "cat", "chair", "cow", "diningtable", 155 | "dog", "horse", "motorbike", "person", "pottedplant", 156 | "sheep", "sofa", "train", "tvmonitor") 157 | 158 | pascal_sbd_dataset = dataset_base.copy({ 159 | 'name': 'Pascal SBD 2012', 160 | 161 | 'train_images': './data/sbd/img', 162 | 'valid_images': './data/sbd/img', 163 | 164 | 'train_info': './data/sbd/pascal_sbd_train.json', 165 | 'valid_info': './data/sbd/pascal_sbd_val.json', 166 | 167 | 'class_names': PASCAL_CLASSES, 168 | }) 169 | 170 | # ----------------------- TRANSFORMS ----------------------- # 171 | 172 | resnet_transform = Config({ 173 | 'channel_order': 'RGB', 174 | 'normalize': True, 175 | 'subtract_means': False, 176 | 'to_float': False, 177 | }) 178 | 179 | vgg_transform = Config({ 180 | # Note that though vgg is traditionally BGR, 181 | # the channel order of vgg_reducedfc.pth is RGB. 
182 | 'channel_order': 'RGB', 183 | 'normalize': False, 184 | 'subtract_means': True, 185 | 'to_float': False, 186 | }) 187 | 188 | darknet_transform = Config({ 189 | 'channel_order': 'RGB', 190 | 'normalize': False, 191 | 'subtract_means': False, 192 | 'to_float': True, 193 | }) 194 | 195 | # ----------------------- BACKBONES ----------------------- # 196 | 197 | backbone_base = Config({ 198 | 'name': 'Base Backbone', 199 | 'path': 'path/to/pretrained/weights', 200 | 'type': object, 201 | 'args': tuple(), 202 | 'transform': resnet_transform, 203 | 204 | 'selected_layers': list(), 205 | 'pred_scales': list(), 206 | 'pred_aspect_ratios': list(), 207 | 208 | 'use_pixel_scales': False, 209 | 'preapply_sqrt': True, 210 | 'use_square_anchors': False, 211 | }) 212 | 213 | resnet101_backbone = backbone_base.copy({ 214 | 'name': 'ResNet101', 215 | 'path': 'resnet101_reducedfc.pth', 216 | 'type': ResNetBackbone, 217 | 'args': ([3, 4, 23, 3],), 218 | 'transform': resnet_transform, 219 | 220 | 'selected_layers': list(range(2, 8)), 221 | 'pred_scales': [[1]] * 6, 222 | 'pred_aspect_ratios': [[[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]]] * 6, 223 | }) 224 | 225 | resnet101_gn_backbone = backbone_base.copy({ 226 | 'name': 'ResNet101_GN', 227 | 'path': 'R-101-GN.pkl', 228 | 'type': ResNetBackboneGN, 229 | 'args': ([3, 4, 23, 3],), 230 | 'transform': resnet_transform, 231 | 232 | 'selected_layers': list(range(2, 8)), 233 | 'pred_scales': [[1]] * 6, 234 | 'pred_aspect_ratios': [[[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]]] * 6, 235 | }) 236 | 237 | resnet101_dcn_inter3_backbone = resnet101_backbone.copy({ 238 | 'name': 'ResNet101_DCN_Interval3', 239 | 'args': ([3, 4, 23, 3], [0, 4, 23, 3], 3), 240 | }) 241 | 242 | resnet50_backbone = resnet101_backbone.copy({ 243 | 'name': 'ResNet50', 244 | 'path': 'resnet50-19c8e357.pth', 245 | 'type': ResNetBackbone, 246 | 'args': ([3, 4, 6, 3],), 247 | 'transform': resnet_transform, 248 | }) 249 | 250 | resnet50_dcnv2_backbone = resnet50_backbone.copy({ 251 | 'name': 'ResNet50_DCNv2', 252 | 'args': ([3, 4, 6, 3], [0, 4, 6, 3]), 253 | }) 254 | 255 | darknet53_backbone = backbone_base.copy({ 256 | 'name': 'DarkNet53', 257 | 'path': 'darknet53.pth', 258 | 'type': DarkNetBackbone, 259 | 'args': ([1, 2, 8, 8, 4],), 260 | 'transform': darknet_transform, 261 | 262 | 'selected_layers': list(range(3, 9)), 263 | 'pred_scales': [[3.5, 4.95], [3.6, 4.90], [3.3, 4.02], [2.7, 3.10], [2.1, 2.37], [1.8, 1.92]], 264 | 'pred_aspect_ratios': [[[1, sqrt(2), 1 / sqrt(2), sqrt(3), 1 / sqrt(3)][:n], [1]] for n in [3, 5, 5, 5, 3, 3]], 265 | }) 266 | 267 | vgg16_arch = [[64, 64], 268 | ['M', 128, 128], 269 | ['M', 256, 256, 256], 270 | [('M', {'kernel_size': 2, 'stride': 2, 'ceil_mode': True}), 512, 512, 512], 271 | ['M', 512, 512, 512], 272 | [('M', {'kernel_size': 3, 'stride': 1, 'padding': 1}), 273 | (1024, {'kernel_size': 3, 'padding': 6, 'dilation': 6}), 274 | (1024, {'kernel_size': 1})]] 275 | 276 | vgg16_backbone = backbone_base.copy({ 277 | 'name': 'VGG16', 278 | 'path': 'vgg16_reducedfc.pth', 279 | 'type': VGGBackbone, 280 | 'args': (vgg16_arch, [(256, 2), (128, 2), (128, 1), (128, 1)], [3]), 281 | 'transform': vgg_transform, 282 | 283 | 'selected_layers': [3] + list(range(5, 10)), 284 | 'pred_scales': [[5, 4]] * 6, 285 | 'pred_aspect_ratios': [[[1], [1, sqrt(2), 1 / sqrt(2), sqrt(3), 1 / sqrt(3)][:n]] for n in [3, 5, 5, 5, 3, 3]], 286 | }) 287 | 288 | # ----------------------- MASK BRANCH TYPES ----------------------- # 289 | 290 | mask_type = 
Config({ 291 | # Direct produces masks directly as the output of each pred module. 292 | # This is denoted as fc-mask in the paper. 293 | # Parameters: mask_size, use_gt_bboxes 294 | 'direct': 0, 295 | 296 | # Lincomb produces coefficients as the output of each pred module then uses those coefficients 297 | # to linearly combine features from a prototype network to create image-sized masks. 298 | # Parameters: 299 | # - masks_to_train (int): Since we're producing (near) full image masks, it'd take too much 300 | # vram to backprop on every single mask. Thus we select only a subset. 301 | # - mask_proto_src (int): The input layer to the mask prototype generation network. This is an 302 | # index in backbone.layers. Use None to use the image itself instead. 303 | # - mask_proto_net (list): A list of layers in the mask proto network with the last one 304 | # being where the masks are taken from. Each conv layer is in 305 | # the form (num_features, kernel_size, **kwargs). An empty 306 | # list means to use the source for prototype masks. If the 307 | # kernel_size is negative, this creates a deconv layer instead. 308 | # If the kernel_size is negative and the num_features is None, 309 | # this creates a simple bilinear interpolation layer instead. 310 | # - mask_proto_bias (bool): Whether to include an extra coefficient that corresponds to a proto 311 | # mask of all ones. 312 | # - mask_proto_prototype_activation (func): The activation to apply to each prototype mask. 313 | # - mask_proto_mask_activation (func): After summing the prototype masks with the predicted 314 | # coeffs, what activation to apply to the final mask. 315 | # - mask_proto_coeff_activation (func): The activation to apply to the mask coefficients. 316 | # - mask_proto_crop (bool): If True, crop the mask with the predicted bbox during training. 317 | # - mask_proto_crop_expand (float): If cropping, the percent to expand the cropping bbox by 318 | # in each direction. This is to make the model less reliant 319 | # on perfect bbox predictions. 320 | # - mask_proto_loss (str [l1|disj]): If not None, apply an l1 or disjunctive regularization 321 | # loss directly to the prototype masks. 322 | # - mask_proto_binarize_downsampled_gt (bool): Binarize GT after downsampling during training? 323 | # - mask_proto_normalize_mask_loss_by_sqrt_area (bool): Whether to normalize mask loss by sqrt(sum(gt)) 324 | # - mask_proto_reweight_mask_loss (bool): Reweight mask loss such that background is divided by 325 | # #background and foreground is divided by #foreground. 326 | # - mask_proto_grid_file (str): The path to the grid file to use with the next option. 327 | # This should be a numpy.dump file with shape [numgrids, h, w] 328 | # where h and w are w.r.t. the mask_proto_src convout. 329 | # - mask_proto_use_grid (bool): Whether to add extra grid features to the proto_net input. 330 | # - mask_proto_coeff_gate (bool): Add an extra set of sigmoided coefficients that is multiplied 331 | # into the predicted coefficients in order to "gate" them. 332 | # - mask_proto_prototypes_as_features (bool): For each prediction module, downsample the prototypes 333 | # to the convout size of that module and supply the prototypes as input 334 | # in addition to the already supplied backbone features. 335 | # - mask_proto_prototypes_as_features_no_grad (bool): If the above is set, don't backprop gradients 336 | # to the prototypes from the network head.
337 | # - mask_proto_remove_empty_masks (bool): Remove masks that are downsampled to 0 during loss calculations. 338 | # - mask_proto_reweight_coeff (float): The coefficient to multiply the foreground pixels with if reweighting. 339 | # - mask_proto_coeff_diversity_loss (bool): Apply coefficient diversity loss on the coefficients so that the same 340 | # instance has similar coefficients. 341 | # - mask_proto_coeff_diversity_alpha (float): The weight to use for the coefficient diversity loss. 342 | # - mask_proto_normalize_emulate_roi_pooling (bool): Normalize the mask loss to emulate roi pooling's effect on loss. 343 | # - mask_proto_double_loss (bool): Whether to use the old loss in addition to any special new losses. 344 | # - mask_proto_double_loss_alpha (float): The alpha to weight the above loss. 345 | # - mask_proto_split_prototypes_by_head (bool): If true, this will give each prediction head its own prototypes. 346 | # - mask_proto_crop_with_pred_box (bool): Whether to crop with the predicted box or the gt box. 347 | 'lincomb': 1, 348 | }) 349 | 350 | # ----------------------- ACTIVATION FUNCTIONS ----------------------- # 351 | 352 | activation_func = Config({ 353 | 'tanh': torch.tanh, 354 | 'sigmoid': torch.sigmoid, 355 | 'softmax': lambda x: torch.nn.functional.softmax(x, dim=-1), 356 | 'relu': lambda x: torch.nn.functional.relu(x, inplace=True), 357 | 'none': lambda x: x, 358 | }) 359 | 360 | # ----------------------- FPN DEFAULTS ----------------------- # 361 | 362 | fpn_base = Config({ 363 | # The number of features to have in each FPN layer 364 | 'num_features': 256, 365 | 366 | # The upsampling mode used 367 | 'interpolation_mode': 'bilinear', 368 | 369 | # The number of extra layers to be produced by downsampling starting at P5 370 | 'num_downsample': 1, 371 | 372 | # Whether to down sample with a 3x3 stride 2 conv layer instead of just a stride 2 selection 373 | 'use_conv_downsample': False, 374 | 375 | # Whether to pad the pred layers with 1 on each side (I forgot to add this at the start) 376 | # This is just here for backwards compatibility 377 | 'pad': True, 378 | 379 | # Whether to add relu to the downsampled layers. 380 | 'relu_downsample_layers': False, 381 | 382 | # Whether to add relu to the regular layers 383 | 'relu_pred_layers': True, 384 | }) 385 | 386 | # ----------------------- CONFIG DEFAULTS ----------------------- # 387 | 388 | coco_base_config = Config({ 389 | 'dataset': coco2014_dataset, 390 | 'num_classes': 81, # This should include the background class 391 | 392 | 'max_iter': 400000, 393 | 394 | # The maximum number of detections for evaluation 395 | 'max_num_detections': 100, 396 | 397 | # dw' = momentum * dw - lr * (grad + decay * w) 398 | 'lr': 1e-3, 399 | 'momentum': 0.9, 400 | 'decay': 5e-4, 401 | 402 | # For each lr step, what to multiply the lr with 403 | 'gamma': 0.1, 404 | 'lr_steps': (280000, 360000, 400000), 405 | 406 | # Initial learning rate to linearly warmup from (if until > 0) 407 | 'lr_warmup_init': 1e-4, 408 | 409 | # If > 0 then increase the lr linearly from warmup_init to lr each iter for until iters 410 | 'lr_warmup_until': 500, 411 | 412 | # The terms to scale the respective loss by 413 | 'conf_alpha': 1, 414 | 'bbox_alpha': 1.5, 415 | 'mask_alpha': 0.4 / 256 * 140 * 140, # Some funky equation. Don't worry about it.
416 | 417 | # Eval.py sets this if you just want to run YOLACT as a detector 418 | 'eval_mask_branch': True, 419 | 420 | # Top_k examples to consider for NMS 421 | 'nms_top_k': 200, 422 | # Examples with confidence less than this are not considered by NMS 423 | 'nms_conf_thresh': 0.05, 424 | # Boxes with IoU overlap greater than this threshold will be culled during NMS 425 | 'nms_thresh': 0.5, 426 | 427 | # See mask_type for details. 428 | 'mask_type': mask_type.direct, 429 | 'mask_size': 16, 430 | 'masks_to_train': 100, 431 | 'mask_proto_src': None, 432 | 'mask_proto_net': [(256, 3, {}), (256, 3, {})], 433 | 'mask_proto_bias': False, 434 | 'mask_proto_prototype_activation': activation_func.relu, 435 | 'mask_proto_mask_activation': activation_func.sigmoid, 436 | 'mask_proto_coeff_activation': activation_func.tanh, 437 | 'mask_proto_crop': True, 438 | 'mask_proto_crop_expand': 0, 439 | 'mask_proto_loss': None, 440 | 'mask_proto_binarize_downsampled_gt': True, 441 | 'mask_proto_normalize_mask_loss_by_sqrt_area': False, 442 | 'mask_proto_reweight_mask_loss': False, 443 | 'mask_proto_grid_file': 'data/grid.npy', 444 | 'mask_proto_use_grid': False, 445 | 'mask_proto_coeff_gate': False, 446 | 'mask_proto_prototypes_as_features': False, 447 | 'mask_proto_prototypes_as_features_no_grad': False, 448 | 'mask_proto_remove_empty_masks': False, 449 | 'mask_proto_reweight_coeff': 1, 450 | 'mask_proto_coeff_diversity_loss': False, 451 | 'mask_proto_coeff_diversity_alpha': 1, 452 | 'mask_proto_normalize_emulate_roi_pooling': False, 453 | 'mask_proto_double_loss': False, 454 | 'mask_proto_double_loss_alpha': 1, 455 | 'mask_proto_split_prototypes_by_head': False, 456 | 'mask_proto_crop_with_pred_box': False, 457 | 458 | # SSD data augmentation parameters 459 | # Randomize hue, vibrance, etc. 460 | 'augment_photometric_distort': True, 461 | # Have a chance to scale down the image and pad (to emulate smaller detections) 462 | 'augment_expand': True, 463 | # Potentially sample a random crop from the image and put it in a random place 464 | 'augment_random_sample_crop': True, 465 | # Mirror the image with a probability of 1/2 466 | 'augment_random_mirror': True, 467 | # Flip the image vertically with a probability of 1/2 468 | 'augment_random_flip': False, 469 | # With uniform probability, rotate the image [0,90,180,270] degrees 470 | 'augment_random_rot90': False, 471 | 472 | # Discard detections with width and height smaller than this (in absolute width and height) 473 | 'discard_box_width': 4 / 550, 474 | 'discard_box_height': 4 / 550, 475 | 476 | # If using batchnorm anywhere in the backbone, freeze the batchnorm layer during training. 477 | # Note: any additional batch norm layers after the backbone will not be frozen. 478 | 'freeze_bn': False, 479 | 480 | # Set this to a config object if you want an FPN (inherit from fpn_base). See fpn_base for details. 481 | 'fpn': None, 482 | 483 | # Use the same weights for each network head 484 | 'share_prediction_module': False, 485 | 486 | # For hard negative mining, instead of using the negatives that are least confidently background, 487 | # use negatives that are most confidently not background.
488 | 'ohem_use_most_confident': False, 489 | 490 | # Use focal loss as described in https://arxiv.org/pdf/1708.02002.pdf instead of OHEM 491 | 'use_focal_loss': False, 492 | 'focal_loss_alpha': 0.25, 493 | 'focal_loss_gamma': 2, 494 | 495 | # The initial bias toward foreground objects, as specified in the focal loss paper 496 | 'focal_loss_init_pi': 0.01, 497 | 498 | # Keeps track of the average number of examples for each class, and weights the loss for that class accordingly. 499 | 'use_class_balanced_conf': False, 500 | 501 | # Whether to use sigmoid focal loss instead of softmax, all else being the same. 502 | 'use_sigmoid_focal_loss': False, 503 | 504 | # Use class[0] to be the objectness score and class[1:] to be the softmax predicted class. 505 | # Note: at the moment this is only implemented if use_focal_loss is on. 506 | 'use_objectness_score': False, 507 | 508 | # Adds a global pool + fc layer to the smallest selected layer that predicts the existence of each of the 80 classes. 509 | # This branch is only evaluated during training time and is just there for multitask learning. 510 | 'use_class_existence_loss': False, 511 | 'class_existence_alpha': 1, 512 | 513 | # Adds a 1x1 convolution directly to the biggest selected layer that predicts a semantic segmentation for each of the 80 classes. 514 | # This branch is only evaluated during training time and is just there for multitask learning. 515 | 'use_semantic_segmentation_loss': False, 516 | 'semantic_segmentation_alpha': 1, 517 | 518 | # Adds another branch to the network to predict Mask IoU. 519 | 'use_mask_scoring': False, 520 | 'mask_scoring_alpha': 1, 521 | 522 | # Match gt boxes using the Box2Pix change metric instead of the standard IoU metric. 523 | # Note that the threshold you set for iou_threshold should be negative with this setting on. 524 | 'use_change_matching': False, 525 | 526 | # Uses the same network format as mask_proto_net, except this time it's for adding extra head layers before the final 527 | # prediction in prediction modules. If this is None, no extra layers will be added. 528 | 'extra_head_net': None, 529 | 530 | # What params should the final head layers have (the ones that predict box, confidence, and mask coeffs) 531 | 'head_layer_params': {'kernel_size': 3, 'padding': 1}, 532 | 533 | # Add extra layers between the backbone and the network heads 534 | # The order is (bbox, conf, mask) 535 | 'extra_layers': (0, 0, 0), 536 | 537 | # During training, to match detections with gt, first compute the maximum gt IoU for each prior. 538 | # Then, any of those priors whose maximum overlap is over the positive threshold, mark as positive. 539 | # For any priors whose maximum is less than the negative iou threshold, mark them as negative. 540 | # The rest are neutral and not used in calculating the loss. 541 | 'positive_iou_threshold': 0.5, 542 | 'negative_iou_threshold': 0.5, 543 | 544 | # When using ohem, the ratio between positives and negatives (3 means 3 negatives to 1 positive) 545 | 'ohem_negpos_ratio': 3, 546 | 547 | # If less than 1, anchors treated as a negative that have a crowd iou over this threshold with 548 | # the crowd boxes will be treated as a neutral. 549 | 'crowd_iou_threshold': 1, 550 | 551 | # This is filled in at runtime by Yolact's __init__, so don't touch it 552 | 'mask_dim': None, 553 | 554 | # Input image size.
555 | 'max_size': 300, 556 | 557 | # Whether or not to do post processing on the cpu at test time 558 | 'force_cpu_nms': True, 559 | 560 | # Whether to use mask coefficient cosine similarity nms instead of bbox iou nms 561 | 'use_coeff_nms': False, 562 | 563 | # Whether or not to have a separate branch whose sole purpose is to act as the coefficients for coeff_diversity_loss 564 | # Remember to turn on coeff_diversity_loss, or these extra coefficients won't do anything! 565 | # To see their effect, also remember to turn on use_coeff_nms. 566 | 'use_instance_coeff': False, 567 | 'num_instance_coeffs': 64, 568 | 569 | # Whether or not to tie the mask loss / box loss to 0 570 | 'train_masks': True, 571 | 'train_boxes': True, 572 | # If enabled, the gt masks will be cropped using the gt bboxes instead of the predicted ones. 573 | # This speeds up training time considerably but results in much worse mAP at test time. 574 | 'use_gt_bboxes': False, 575 | 576 | # Whether or not to preserve aspect ratio when resizing the image. 577 | # If True, this will resize all images to be max_size^2 pixels in area while keeping aspect ratio. 578 | # If False, all images are resized to max_size x max_size 579 | 'preserve_aspect_ratio': False, 580 | 581 | # Whether or not to use the prediction module (c) from DSSD 582 | 'use_prediction_module': False, 583 | 584 | # Whether or not to use the predicted coordinate scheme from Yolo v2 585 | 'use_yolo_regressors': False, 586 | 587 | # For training, bboxes are considered "positive" if their anchors have a 0.5 IoU overlap 588 | # or greater with a ground truth box. If this is true, instead of using the anchor boxes 589 | # for this IoU computation, the matching function will use the predicted bbox coordinates. 590 | # Don't turn this on if you're not using yolo regressors! 591 | 'use_prediction_matching': False, 592 | 593 | # A list of settings to apply after the specified iteration. Each element of the list should look like 594 | # (iteration, config_dict) where config_dict is a dictionary you'd pass into a config object's init. 595 | 'delayed_settings': [], 596 | 597 | # Use command-line arguments to set this. 598 | 'no_jit': False, 599 | 600 | 'backbone': None, 601 | 'name': 'base_config', 602 | 603 | # Fast Mask Re-scoring Network 604 | # Inspired by Mask Scoring R-CNN (https://arxiv.org/abs/1903.00241) 605 | # Do not crop out the mask with bbox but slide a convnet on the image-size mask, 606 | # then use global pooling to get the final mask score 607 | 'use_maskiou': False, 608 | 609 | # Architecture for the mask iou network. A (num_classes-1, 1, {}) layer is appended to the end.
610 | 'maskiou_net': [], 611 | 612 | # Discard predicted masks whose area is less than this 613 | 'discard_mask_area': -1, 614 | 615 | 'maskiou_alpha': 1.0, 616 | 'rescore_mask': False, 617 | 'rescore_bbox': False, 618 | 'maskious_to_train': -1, 619 | }) 620 | 621 | # ----------------------- YOLACT v1.0 CONFIGS ----------------------- # 622 | 623 | yolact_base_config = coco_base_config.copy({ 624 | 'name': 'yolact_base', 625 | 626 | # Dataset stuff 627 | 'dataset': coco2017_dataset, 628 | 'num_classes': len(coco2017_dataset.class_names) + 1, 629 | 630 | # Image Size 631 | 'max_size': 550, 632 | 633 | # Training params 634 | 'lr_steps': (280000, 600000, 700000, 750000), 635 | 'max_iter': 800000, 636 | 637 | # Backbone Settings 638 | 'backbone': resnet101_backbone.copy({ 639 | 'selected_layers': list(range(1, 4)), 640 | 'use_pixel_scales': True, 641 | 'preapply_sqrt': False, 642 | 'use_square_anchors': True, # This is for backward compatibility with a bug 643 | 644 | 'pred_aspect_ratios': [[[1, 1 / 2, 2]]] * 5, 645 | 'pred_scales': [[24], [48], [96], [192], [384]], 646 | }), 647 | 648 | # FPN Settings 649 | 'fpn': fpn_base.copy({ 650 | 'use_conv_downsample': True, 651 | 'num_downsample': 2, 652 | }), 653 | 654 | # Mask Settings 655 | 'mask_type': mask_type.lincomb, 656 | 'mask_alpha': 6.125, 657 | 'mask_proto_src': 0, 658 | 'mask_proto_net': [(256, 3, {'padding': 1})] * 3 + [(None, -2, {}), (256, 3, {'padding': 1})] + [(32, 1, {})], 659 | 'mask_proto_normalize_emulate_roi_pooling': True, 660 | 661 | # Other stuff 662 | 'share_prediction_module': True, 663 | 'extra_head_net': [(256, 3, {'padding': 1})], 664 | 665 | 'positive_iou_threshold': 0.5, 666 | 'negative_iou_threshold': 0.4, 667 | 668 | 'crowd_iou_threshold': 0.7, 669 | 670 | 'use_semantic_segmentation_loss': True, 671 | }) 672 | 673 | yolact_im400_config = yolact_base_config.copy({ 674 | 'name': 'yolact_im400', 675 | 676 | 'max_size': 400, 677 | 'backbone': yolact_base_config.backbone.copy({ 678 | 'pred_scales': [[int(x[0] / yolact_base_config.max_size * 400)] for x in 679 | yolact_base_config.backbone.pred_scales], 680 | }), 681 | }) 682 | 683 | yolact_im700_config = yolact_base_config.copy({ 684 | 'name': 'yolact_im700', 685 | 686 | 'masks_to_train': 300, 687 | 'max_size': 700, 688 | 'backbone': yolact_base_config.backbone.copy({ 689 | 'pred_scales': [[int(x[0] / yolact_base_config.max_size * 700)] for x in 690 | yolact_base_config.backbone.pred_scales], 691 | }), 692 | }) 693 | 694 | yolact_darknet53_config = yolact_base_config.copy({ 695 | 'name': 'yolact_darknet53', 696 | 697 | 'backbone': darknet53_backbone.copy({ 698 | 'selected_layers': list(range(2, 5)), 699 | 700 | 'pred_scales': yolact_base_config.backbone.pred_scales, 701 | 'pred_aspect_ratios': yolact_base_config.backbone.pred_aspect_ratios, 702 | 'use_pixel_scales': True, 703 | 'preapply_sqrt': False, 704 | 'use_square_anchors': True, # This is for backward compatibility with a bug 705 | }), 706 | }) 707 | 708 | yolact_resnet50_config = yolact_base_config.copy({ 709 | 'name': 'yolact_resnet50', 710 | 711 | 'backbone': resnet50_backbone.copy({ 712 | 'selected_layers': list(range(1, 4)), 713 | 714 | 'pred_scales': yolact_base_config.backbone.pred_scales, 715 | 'pred_aspect_ratios': yolact_base_config.backbone.pred_aspect_ratios, 716 | 'use_pixel_scales': True, 717 | 'preapply_sqrt': False, 718 | 'use_square_anchors': True, # This is for backward compatibility with a bug 719 | }), 720 | }) 721 | 722 | yolact_resnet50_pascal_config = yolact_resnet50_config.copy({ 723 |
'name': None, # Will default to yolact_resnet50_pascal 724 | 725 | # Dataset stuff 726 | 'dataset': pascal_sbd_dataset, 727 | 'num_classes': len(pascal_sbd_dataset.class_names) + 1, 728 | 729 | 'max_iter': 120000, 730 | 'lr_steps': (60000, 100000), 731 | 732 | 'backbone': yolact_resnet50_config.backbone.copy({ 733 | 'pred_scales': [[32], [64], [128], [256], [512]], 734 | 'use_square_anchors': False, 735 | }) 736 | }) 737 | 738 | # ----------------------- YOLACT++ CONFIGS ----------------------- # 739 | 740 | yolact_plus_base_config = yolact_base_config.copy({ 741 | 'name': 'yolact_plus_base', 742 | 743 | 'backbone': resnet101_dcn_inter3_backbone.copy({ 744 | 'selected_layers': list(range(1, 4)), 745 | 746 | 'pred_aspect_ratios': [[[1, 1 / 2, 2]]] * 5, 747 | 'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]], 748 | 'use_pixel_scales': True, 749 | 'preapply_sqrt': False, 750 | 'use_square_anchors': False, 751 | }), 752 | 753 | 'use_maskiou': True, 754 | 'maskiou_net': [(8, 3, {'stride': 2}), (16, 3, {'stride': 2}), (32, 3, {'stride': 2}), (64, 3, {'stride': 2}), 755 | (128, 3, {'stride': 2})], 756 | 'maskiou_alpha': 25, 757 | 'rescore_bbox': False, 758 | 'rescore_mask': True, 759 | 760 | 'discard_mask_area': 5 * 5, 761 | }) 762 | 763 | yolact_plus_resnet50_config = yolact_plus_base_config.copy({ 764 | 'name': 'yolact_plus_resnet50', 765 | 766 | 'backbone': resnet50_dcnv2_backbone.copy({ 767 | 'selected_layers': list(range(1, 4)), 768 | 769 | 'pred_aspect_ratios': [[[1, 1 / 2, 2]]] * 5, 770 | 'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]], 771 | 'use_pixel_scales': True, 772 | 'preapply_sqrt': False, 773 | 'use_square_anchors': False, 774 | }), 775 | }) 776 | 777 | # Default config 778 | cfg = yolact_base_config.copy() 779 | 780 | 781 | def set_cfg(config_name: str): 782 | """ Sets the active config. Works even if cfg is already imported! """ 783 | global cfg 784 | 785 | # Note this is not just an eval because I'm lazy, but also because it can 786 | # be used like ssd300_config.copy({'max_size': 400}) for extreme fine-tuning 787 | cfg.replace(eval(config_name)) 788 | 789 | if cfg.name is None: 790 | cfg.name = config_name.split('_config')[0] 791 | 792 | 793 | def set_dataset(dataset_name: str): 794 | """ Sets the dataset of the current config. """ 795 | cfg.dataset = eval(dataset_name) 796 | 797 | --------------------------------------------------------------------------------
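
A minimal usage sketch of the Config mechanism defined in convert-onnx/config.py above. It assumes the convert-onnx directory is on the Python path so the file imports as `config`; the values in the comments simply restate the defaults shown in the listing and are not part of the repository.

from config import cfg, set_cfg, yolact_base_config

# cfg starts as a copy of yolact_base_config: 550x550 input, ResNet-101 backbone.
print(cfg.name, cfg.max_size)        # yolact_base 550

# Switch the active config by name at runtime (e.g. for the ResNet-50 variant).
set_cfg('yolact_resnet50_config')
print(cfg.backbone.name)             # ResNet50

# Derive a one-off variant without touching the global cfg,
# the same way yolact_im400_config is built from yolact_base_config.
im400 = yolact_base_config.copy({'max_size': 400})
print(im400.max_size)                # 400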