├── vision
│   ├── utils
│   │   ├── __init__.py
│   │   ├── misc.py
│   │   ├── measurements.py
│   │   ├── model_book.py
│   │   ├── box_utils_numpy.py
│   │   └── box_utils.py
│   ├── test
│   │   ├── assets
│   │   │   └── 000138.jpg
│   │   └── test_vgg_ssd.py
│   ├── ssd
│   │   ├── config
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-36.pyc
│   │   │   │   ├── vgg_ssd_config.cpython-36.pyc
│   │   │   │   ├── squeezenet_ssd_config.cpython-36.pyc
│   │   │   │   └── mobilenetv1_ssd_config.cpython-36.pyc
│   │   │   ├── vgg_ssd_config.py
│   │   │   ├── mobilenetv1_ssd_config.py
│   │   │   ├── mobilenetv3_ssd_config.py
│   │   │   └── squeezenet_ssd_config.py
│   │   ├── data_preprocessing.py
│   │   ├── predictor.py
│   │   ├── mobilenetv1_ssd.py
│   │   ├── vgg_ssd.py
│   │   ├── fpn_mobilenetv1_ssd.py
│   │   ├── mobilenet_v2_ssd_lite.py
│   │   ├── mobilenetv1_ssd_lite.py
│   │   ├── squeezenet_ssd_lite.py
│   │   ├── mobilenet_v3_ssd_lite.py
│   │   ├── fpn_ssd.py
│   │   └── ssd.py
│   ├── nn
│   │   ├── scaled_l2_norm.py
│   │   ├── vgg.py
│   │   ├── mobilenet.py
│   │   ├── multibox_loss.py
│   │   ├── alexnet.py
│   │   ├── squeezenet.py
│   │   ├── mobilenet_v2.py
│   │   └── mobilenet_v3.py
│   ├── datasets
│   │   ├── collation.py
│   │   ├── generate_vocdata.py
│   │   ├── open_images.py
│   │   └── voc_dataset.py
│   ├── prunning
│   │   └── prunner.py
│   └── transforms
│       └── transforms.py
├── models
│   └── open-images-model-labels.txt
├── visual_tf_models.py
├── extract_tf_weights.py
├── draw_eval_results.py
├── convert_to_caffe2_models.py
├── README.md
├── run_ssd_example.py
├── translate_tf_mobilenetv1.py
├── run_ssd_live_demo.py
├── run_ssd_live_caffe2.py
├── open_images_downloader.py
├── eval_ssd.py
├── prune_alexnet.py
└── train_ssd.py

/vision/utils/__init__.py:
--------------------------------------------------------------------------------
from .misc import *
--------------------------------------------------------------------------------
/models/open-images-model-labels.txt:
--------------------------------------------------------------------------------
BACKGROUND
Bread
Cake
--------------------------------------------------------------------------------
/vision/test/assets/000138.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shaoshengsong/MobileNetV3-SSD/HEAD/vision/test/assets/000138.jpg
--------------------------------------------------------------------------------
/vision/ssd/config/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shaoshengsong/MobileNetV3-SSD/HEAD/vision/ssd/config/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/vision/ssd/config/__pycache__/vgg_ssd_config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shaoshengsong/MobileNetV3-SSD/HEAD/vision/ssd/config/__pycache__/vgg_ssd_config.cpython-36.pyc
--------------------------------------------------------------------------------
/vision/ssd/config/__pycache__/squeezenet_ssd_config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shaoshengsong/MobileNetV3-SSD/HEAD/vision/ssd/config/__pycache__/squeezenet_ssd_config.cpython-36.pyc
--------------------------------------------------------------------------------
/vision/ssd/config/__pycache__/mobilenetv1_ssd_config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shaoshengsong/MobileNetV3-SSD/HEAD/vision/ssd/config/__pycache__/mobilenetv1_ssd_config.cpython-36.pyc
--------------------------------------------------------------------------------
/visual_tf_models.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.python.platform import gfile
import sys
import time

if len(sys.argv) < 2:
    print("Usage: python visual_tf_models.py <frozen_graph.pb>")
    sys.exit(0)

model_file_name = sys.argv[1]
with tf.Session() as sess:
    with gfile.FastGFile(model_file_name, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        g_in = tf.import_graph_def(graph_def)
    LOGDIR = 'log'
    train_writer = tf.summary.FileWriter(LOGDIR)
    train_writer.add_graph(sess.graph)

while True:
    time.sleep(1000)
--------------------------------------------------------------------------------
/vision/nn/scaled_l2_norm.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch
import torch.nn.functional as F


class ScaledL2Norm(nn.Module):
    def __init__(self, in_channels, initial_scale):
        super(ScaledL2Norm, self).__init__()
        self.in_channels = in_channels
        self.scale = nn.Parameter(torch.Tensor(in_channels))
        self.initial_scale = initial_scale
        self.reset_parameters()

    def forward(self, x):
        return (F.normalize(x, p=2, dim=1)
                * self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3))

    def reset_parameters(self):
        self.scale.data.fill_(self.initial_scale)
--------------------------------------------------------------------------------
/vision/ssd/config/vgg_ssd_config.py:
--------------------------------------------------------------------------------
import numpy as np

from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors


image_size = 300
image_mean = np.array([123, 117, 104])  # RGB layout
image_std = 1.0

iou_threshold = 0.45
center_variance = 0.1
size_variance = 0.2

specs = [
    SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]),
    SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]),
    SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]),
    SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]),
    SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]),
    SSDSpec(1, 300, SSDBoxSizes(264, 315), [2])
]


priors = generate_ssd_priors(specs, image_size)
--------------------------------------------------------------------------------
/vision/ssd/config/mobilenetv1_ssd_config.py:
--------------------------------------------------------------------------------
import numpy as np

from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors


image_size = 300
image_mean = np.array([127, 127, 127])  # RGB layout
image_std = 128.0
iou_threshold = 0.45
center_variance = 0.1
size_variance = 0.2

specs = [
    SSDSpec(19, 16, SSDBoxSizes(60, 105), [2, 3]),
    SSDSpec(10, 32, SSDBoxSizes(105, 150), [2, 3]),
    SSDSpec(5, 64, SSDBoxSizes(150, 195), [2, 3]),
    SSDSpec(3, 100, SSDBoxSizes(195, 240), [2, 3]),
    SSDSpec(2, 150, SSDBoxSizes(240, 285), [2, 3]),
    SSDSpec(1, 300, SSDBoxSizes(285, 330), [2, 3])
]


priors = generate_ssd_priors(specs, image_size)
--------------------------------------------------------------------------------
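A quick way to sanity-check these config files is to count the priors they generate. Assuming the upstream pytorch-ssd convention that SSDSpec is (feature_map_size, shrinkage, box_sizes, aspect_ratios) and that generate_ssd_priors emits two square priors per cell plus one pair per aspect ratio — an assumption about its internals, since box_utils.py is not shown in this listing — the totals can be reproduced in a few lines:

from collections import namedtuple

SSDSpec = namedtuple('SSDSpec', ['feature_map_size', 'shrinkage', 'box_sizes', 'aspect_ratios'])


def num_priors(specs):
    # two square priors per cell, plus one pair (ratio, 1/ratio) per aspect ratio
    return sum(s.feature_map_size ** 2 * (2 + 2 * len(s.aspect_ratios)) for s in specs)

For the MobileNet specs above this gives (19² + 10² + 5² + 3² + 2² + 1²) × 6 = 3000 priors; the VGG specs give 8732, which matches the shape asserted in /vision/test/test_vgg_ssd.py further down.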
/vision/ssd/config/mobilenetv3_ssd_config.py:
--------------------------------------------------------------------------------
import numpy as np

from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors


image_size = 300
image_mean = np.array([127, 127, 127])  # RGB layout
image_std = 128.0
iou_threshold = 0.45
center_variance = 0.1
size_variance = 0.2

specs = [
    SSDSpec(19, 16, SSDBoxSizes(60, 105), [2, 3]),
    SSDSpec(10, 32, SSDBoxSizes(105, 150), [2, 3]),
    SSDSpec(5, 64, SSDBoxSizes(150, 195), [2, 3]),
    SSDSpec(3, 100, SSDBoxSizes(195, 240), [2, 3]),
    SSDSpec(2, 150, SSDBoxSizes(240, 285), [2, 3]),
    SSDSpec(1, 300, SSDBoxSizes(285, 330), [2, 3])
]


priors = generate_ssd_priors(specs, image_size)
--------------------------------------------------------------------------------
/vision/ssd/config/squeezenet_ssd_config.py:
--------------------------------------------------------------------------------
import numpy as np

from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors


image_size = 300
image_mean = np.array([127, 127, 127])  # RGB layout
image_std = 128.0
iou_threshold = 0.45
center_variance = 0.1
size_variance = 0.2

specs = [
    SSDSpec(17, 16, SSDBoxSizes(60, 105), [2, 3]),
    SSDSpec(10, 32, SSDBoxSizes(105, 150), [2, 3]),
    SSDSpec(5, 64, SSDBoxSizes(150, 195), [2, 3]),
    SSDSpec(3, 100, SSDBoxSizes(195, 240), [2, 3]),
    SSDSpec(2, 150, SSDBoxSizes(240, 285), [2, 3]),
    SSDSpec(1, 300, SSDBoxSizes(285, 330), [2, 3])
]


priors = generate_ssd_priors(specs, image_size)
--------------------------------------------------------------------------------
/vision/nn/vgg.py:
--------------------------------------------------------------------------------
import torch.nn as nn


# borrowed from https://github.com/amdegroot/ssd.pytorch/blob/master/ssd.py
def vgg(cfg, batch_norm=False):
    layers = []
    in_channels = 3
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        elif v == 'C':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
    conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
    layers += [pool5, conv6,
               nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
    return layers
--------------------------------------------------------------------------------
/extract_tf_weights.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.python.platform import gfile
from tensorflow.python.framework import tensor_util
import sys
import pickle


def read_weights(frozen_model):
    weights = {}
    with tf.Session() as sess:
        with gfile.FastGFile(frozen_model, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
            for n in graph_def.node:
                if n.op == 'Const':
                    weights[n.name] = tensor_util.MakeNdarray(n.attr['value'].tensor)
                    print("Name:", n.name, "Shape:", weights[n.name].shape)
    return weights


if len(sys.argv) < 3:
    print("Usage: python extract_tf_weights.py <frozen_model.pb> <weights_file.pkl>")
    sys.exit(0)

frozen_model = sys.argv[1]
weights_file = sys.argv[2]

weights = read_weights(frozen_model)
with open(weights_file, "wb") as f:
    pickle.dump(weights, f)
print(f"Saved weights to {weights_file}.")
--------------------------------------------------------------------------------
/vision/datasets/collation.py:
--------------------------------------------------------------------------------
import torch
import numpy as np


def object_detection_collate(batch):
    images = []
    gt_boxes = []
    gt_labels = []
    image_type = type(batch[0][0])
    box_type = type(batch[0][1])
    label_type = type(batch[0][2])
    for image, boxes, labels in batch:
        if image_type is np.ndarray:
            images.append(torch.from_numpy(image))
        elif image_type is torch.Tensor:
            images.append(image)
        else:
            raise TypeError(f"Image should be tensor or np.ndarray, but got {image_type}.")
        if box_type is np.ndarray:
            gt_boxes.append(torch.from_numpy(boxes))
        elif box_type is torch.Tensor:
            gt_boxes.append(boxes)
        else:
            raise TypeError(f"Boxes should be tensor or np.ndarray, but got {box_type}.")
        if label_type is np.ndarray:
            gt_labels.append(torch.from_numpy(labels))
        elif label_type is torch.Tensor:
            gt_labels.append(labels)
        else:
            raise TypeError(f"Labels should be tensor or np.ndarray, but got {label_type}.")
    return torch.stack(images), gt_boxes, gt_labels
--------------------------------------------------------------------------------
/draw_eval_results.py:
--------------------------------------------------------------------------------
import sys
import cv2
import pandas as pd
import os

eval_result_file = sys.argv[1]
image_dir = sys.argv[2]
output_dir = sys.argv[3]
threshold = float(sys.argv[4])

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

r = pd.read_csv(eval_result_file, delimiter=" ", names=["ImageID", "Prob", "x1", "y1", "x2", "y2"])
r['x1'] = r['x1'].astype(int)
r['y1'] = r['y1'].astype(int)
r['x2'] = r['x2'].astype(int)
r['y2'] = r['y2'].astype(int)


for image_id, g in r.groupby('ImageID'):
    image = cv2.imread(os.path.join(image_dir, image_id + ".jpg"))
    for row in g.itertuples():
        if row.Prob < threshold:
            continue
        cv2.rectangle(image, (row.x1, row.y1), (row.x2, row.y2), (255, 255, 0), 4)
        label = f"{row.Prob:.2f}"
        cv2.putText(image, label,
                    (row.x1 + 20, row.y1 + 40),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,  # font scale
                    (255, 0, 255),
                    2)  # line type
    cv2.imwrite(os.path.join(output_dir, image_id + ".jpg"), image)
print(f"Task Done. Processed {r.shape[0]} bounding boxes.")
--------------------------------------------------------------------------------
/vision/utils/misc.py:
--------------------------------------------------------------------------------
import time
import torch


def str2bool(s):
    return s.lower() in ('true', '1')


class Timer:
    def __init__(self):
        self.clock = {}

    def start(self, key="default"):
        self.clock[key] = time.time()

    def end(self, key="default"):
        if key not in self.clock:
            raise Exception(f"{key} is not in the clock.")
        interval = time.time() - self.clock[key]
        del self.clock[key]
        return interval


def save_checkpoint(epoch, net_state_dict, optimizer_state_dict, best_score, checkpoint_path, model_path):
    torch.save({
        'epoch': epoch,
        'model': net_state_dict,
        'optimizer': optimizer_state_dict,
        'best_score': best_score
    }, checkpoint_path)
    torch.save(net_state_dict, model_path)


def load_checkpoint(checkpoint_path):
    return torch.load(checkpoint_path)


def freeze_net_layers(net):
    for param in net.parameters():
        param.requires_grad = False


def store_labels(path, labels):
    with open(path, "w") as f:
        f.write("\n".join(labels))
--------------------------------------------------------------------------------
/vision/utils/measurements.py:
--------------------------------------------------------------------------------
import numpy as np


def compute_average_precision(precision, recall):
    """Compute average precision as defined by the Pascal VOC competition:
    the area under the precision-recall curve. Recall follows the usual
    definition; precision is replaced by its running maximum from the right:
    pascal_precision[i] = typical_precision[i:].max()
    """
    # identical but faster version of new_precision[i] = old_precision[i:].max()
    precision = np.concatenate([[0.0], precision, [0.0]])
    for i in range(len(precision) - 1, 0, -1):
        precision[i - 1] = np.maximum(precision[i - 1], precision[i])

    # find the indexes where the recall value changes
    recall = np.concatenate([[0.0], recall, [1.0]])
    changing_points = np.where(recall[1:] != recall[:-1])[0]

    # compute the area under the curve
    areas = (recall[changing_points + 1] - recall[changing_points]) * precision[changing_points + 1]
    return areas.sum()


def compute_voc2007_average_precision(precision, recall):
    ap = 0.
    for t in np.arange(0., 1.1, 0.1):
        if np.sum(recall >= t) == 0:
            p = 0
        else:
            p = np.max(precision[recall >= t])
        ap = ap + p / 11.
    return ap
--------------------------------------------------------------------------------
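compute_voc2007_average_precision implements the classic 11-point interpolation: precision is sampled at recalls 0.0, 0.1, ..., 1.0 and averaged. A tiny worked example (a sketch for illustration, not part of the repo):

import numpy as np
from vision.utils.measurements import compute_voc2007_average_precision

# A detector that holds precision 1.0 up to recall 0.5, then drops to 0.5.
precision = np.array([1.0, 0.5])
recall = np.array([0.5, 1.0])
# p = 1.0 for t in {0.0, ..., 0.5} (6 points), p = 0.5 for t in {0.6, ..., 1.0} (5 points)
ap = compute_voc2007_average_precision(precision, recall)  # (6 * 1.0 + 5 * 0.5) / 11 ≈ 0.7727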
/vision/test/test_vgg_ssd.py:
--------------------------------------------------------------------------------
from ..ssd.vgg_ssd import create_vgg_ssd

import torch
import tempfile


def test_create_vgg_ssd():
    for num_classes in [2, 10, 21, 100]:
        _ = create_vgg_ssd(num_classes)


def test_forward():
    for num_classes in [2]:
        net = create_vgg_ssd(num_classes)
        net.init()
        net.eval()
        x = torch.randn(2, 3, 300, 300)
        confidences, locations = net.forward(x)
        assert confidences.size() == torch.Size([2, 8732, num_classes])
        assert locations.size() == torch.Size([2, 8732, 4])
        assert confidences.nonzero().size(0) != 0
        assert locations.nonzero().size(0) != 0


def test_save_model():
    net = create_vgg_ssd(10)
    net.init()
    with tempfile.TemporaryFile() as f:
        net.save(f)


def test_save_load_model_consistency():
    net = create_vgg_ssd(20)
    net.init()
    model_path = tempfile.NamedTemporaryFile().name
    net.save(model_path)
    net_copy = create_vgg_ssd(20)
    net_copy.load(model_path)

    net.eval()
    net_copy.eval()

    for _ in range(1):
        x = torch.randn(1, 3, 300, 300)
        confidences1, locations1 = net.forward(x)
        confidences2, locations2 = net_copy.forward(x)
        assert (confidences1 == confidences2).long().sum() == confidences2.numel()
        assert (locations1 == locations2).long().sum() == locations2.numel()
--------------------------------------------------------------------------------
/vision/nn/mobilenet.py:
--------------------------------------------------------------------------------
# borrowed from "https://github.com/marvis/pytorch-mobilenet"

import torch.nn as nn
import torch.nn.functional as F


class MobileNetV1(nn.Module):
    def __init__(self, num_classes=1024):
        super(MobileNetV1, self).__init__()

        def conv_bn(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True)
            )

        def conv_dw(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                nn.ReLU(inplace=True),

                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True),
            )

        self.model = nn.Sequential(
            conv_bn(3, 32, 2),
            conv_dw(32, 64, 1),
            conv_dw(64, 128, 2),
            conv_dw(128, 128, 1),
            conv_dw(128, 256, 2),
            conv_dw(256, 256, 1),
            conv_dw(256, 512, 2),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 1024, 2),
            conv_dw(1024, 1024, 1),
        )
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = self.model(x)
        x = F.avg_pool2d(x, 7)
        x = x.view(-1, 1024)
        x = self.fc(x)
        return x
--------------------------------------------------------------------------------
/vision/ssd/data_preprocessing.py:
--------------------------------------------------------------------------------
from ..transforms.transforms import *


class TrainAugmentation:
    def __init__(self, size, mean=0, std=1.0):
        """
        Args:
            size: the size of the final image.
            mean: mean pixel value per channel.
        """
        self.mean = mean
        self.size = size
        self.augment = Compose([
            ConvertFromInts(),
            PhotometricDistort(),
            Expand(self.mean),
            RandomSampleCrop(),
            RandomMirror(),
            ToPercentCoords(),
            Resize(self.size),
            SubtractMeans(self.mean),
            lambda img, boxes=None, labels=None: (img / std, boxes, labels),
            ToTensor(),
        ])

    def __call__(self, img, boxes, labels):
        """
        Args:
            img: the output of cv.imread in RGB layout.
            boxes: bounding boxes in the form of (x1, y1, x2, y2).
            labels: labels of boxes.
        """
        return self.augment(img, boxes, labels)


class TestTransform:
    def __init__(self, size, mean=0.0, std=1.0):
        self.transform = Compose([
            ToPercentCoords(),
            Resize(size),
            SubtractMeans(mean),
            lambda img, boxes=None, labels=None: (img / std, boxes, labels),
            ToTensor(),
        ])

    def __call__(self, image, boxes, labels):
        return self.transform(image, boxes, labels)


class PredictionTransform:
    def __init__(self, size, mean=0.0, std=1.0):
        self.transform = Compose([
            Resize(size),
            SubtractMeans(mean),
            lambda img, boxes=None, labels=None: (img / std, boxes, labels),
            ToTensor()
        ])

    def __call__(self, image):
        image, _, _ = self.transform(image)
        return image
--------------------------------------------------------------------------------
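The three pipelines differ only in how much augmentation they apply. Wiring TrainAugmentation to the MobileNet config above looks roughly like this (an untested sketch with made-up inputs, not code from the repo):

import numpy as np
from vision.ssd.data_preprocessing import TrainAugmentation

augment = TrainAugmentation(size=300, mean=np.array([127, 127, 127]), std=128.0)
image = np.random.randint(0, 256, (480, 640, 3)).astype(np.uint8)  # HWC image, RGB order
boxes = np.array([[50.0, 60.0, 300.0, 280.0]], dtype=np.float32)   # absolute (x1, y1, x2, y2)
labels = np.array([1], dtype=np.int64)
tensor, boxes, labels = augment(image, boxes, labels)  # CHW float tensor, boxes in percent coords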
/vision/nn/multibox_loss.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F
import torch


from ..utils import box_utils


class MultiboxLoss(nn.Module):
    def __init__(self, priors, iou_threshold, neg_pos_ratio,
                 center_variance, size_variance, device):
        """Implement SSD Multibox Loss.

        Basically, Multibox loss combines classification loss
        and Smooth L1 regression loss.
        """
        super(MultiboxLoss, self).__init__()
        self.iou_threshold = iou_threshold
        self.neg_pos_ratio = neg_pos_ratio
        self.center_variance = center_variance
        self.size_variance = size_variance
        self.priors = priors.to(device)  # .to() is not in-place; keep the returned tensor

    def forward(self, confidence, predicted_locations, labels, gt_locations):
        """Compute classification loss and smooth l1 loss.

        Args:
            confidence (batch_size, num_priors, num_classes): class predictions.
            predicted_locations (batch_size, num_priors, 4): predicted locations.
            labels (batch_size, num_priors): real labels of all the priors.
            gt_locations (batch_size, num_priors, 4): real boxes corresponding to all the priors.
        """
        num_classes = confidence.size(2)
        with torch.no_grad():
            # derived from cross_entropy = -sum(log(p))
            loss = -F.log_softmax(confidence, dim=2)[:, :, 0]
            mask = box_utils.hard_negative_mining(loss, labels, self.neg_pos_ratio)

        confidence = confidence[mask, :]
        classification_loss = F.cross_entropy(confidence.reshape(-1, num_classes), labels[mask], size_average=False)
        pos_mask = labels > 0
        predicted_locations = predicted_locations[pos_mask, :].reshape(-1, 4)
        gt_locations = gt_locations[pos_mask, :].reshape(-1, 4)
        smooth_l1_loss = F.smooth_l1_loss(predicted_locations, gt_locations, size_average=False)
        num_pos = gt_locations.size(0)
        return smooth_l1_loss / num_pos, classification_loss / num_pos
--------------------------------------------------------------------------------
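The loss keeps only the hardest negatives (neg_pos_ratio background priors per positive prior) so the 3000 mostly-background priors do not swamp training. A rough usage sketch — neg_pos_ratio=3 and iou_threshold=0.5 are the conventional SSD settings, assumed here since train_ssd.py is not shown in this listing, and the random tensors only illustrate shapes:

import torch
from vision.nn.multibox_loss import MultiboxLoss
from vision.ssd.config import mobilenetv1_ssd_config as config

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3,
                         center_variance=0.1, size_variance=0.2, device=device)

num_priors = config.priors.size(0)  # 3000 for the MobileNet configs
confidence = torch.randn(2, num_priors, 3, device=device)   # e.g. BACKGROUND, Bread, Cake
locations = torch.randn(2, num_priors, 4, device=device)
labels = torch.randint(0, 3, (2, num_priors), device=device)
gt_locations = torch.randn(2, num_priors, 4, device=device)

regression_loss, classification_loss = criterion(confidence, locations, labels, gt_locations)
loss = regression_loss + classification_loss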
/vision/nn/alexnet.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.utils.model_zoo as model_zoo

# copied from torchvision (https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py).
# The forward function is modified for model pruning.

__all__ = ['AlexNet', 'alexnet']


model_urls = {
    'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',
}


class AlexNet(nn.Module):

    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


def alexnet(pretrained=False, **kwargs):
    r"""AlexNet model architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = AlexNet(**kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['alexnet']))
    return model
--------------------------------------------------------------------------------
/convert_to_caffe2_models.py:
--------------------------------------------------------------------------------
from vision.ssd.vgg_ssd import create_vgg_ssd
from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd
from vision.ssd.mobilenetv1_ssd_lite import create_mobilenetv1_ssd_lite
from vision.ssd.squeezenet_ssd_lite import create_squeezenet_ssd_lite
from vision.ssd.mobilenet_v3_ssd_lite import create_mobilenetv3_ssd_lite
import sys
import torch.onnx
# from caffe2.python.onnx.backend import Caffe2Backend as c2
# import onnx


if len(sys.argv) < 4:
    print('Usage: python convert_to_caffe2_models.py <net type> <model path> <label path>')
    sys.exit(0)
net_type = sys.argv[1]
model_path = sys.argv[2]

label_path = sys.argv[3]

class_names = [name.strip() for name in open(label_path).readlines()]
num_classes = len(class_names)

if net_type == 'vgg16-ssd':
    net = create_vgg_ssd(len(class_names), is_test=True)
elif net_type == 'mb1-ssd':
    net = create_mobilenetv1_ssd(len(class_names), is_test=True)
elif net_type == 'mb1-ssd-lite':
    net = create_mobilenetv1_ssd_lite(len(class_names), is_test=True)
elif net_type == 'sq-ssd-lite':
    net = create_squeezenet_ssd_lite(len(class_names), is_test=True)
elif net_type == 'mb3-ssd-lite':
    net = create_mobilenetv3_ssd_lite(len(class_names), is_test=True)
else:
    print("The net type is wrong. It should be one of vgg16-ssd, mb1-ssd, mb1-ssd-lite, sq-ssd-lite and mb3-ssd-lite.")
    sys.exit(1)
net.load(model_path)
net.eval()

model_path = f"models/{net_type}.onnx"
init_net_path = f"models/{net_type}_init_net.pb"
init_net_txt_path = f"models/{net_type}_init_net.pbtxt"
predict_net_path = f"models/{net_type}_predict_net.pb"
predict_net_txt_path = f"models/{net_type}_predict_net.pbtxt"

dummy_input = torch.randn(1, 3, 300, 300)
torch.onnx.export(net, dummy_input, model_path, verbose=False, output_names=['scores', 'boxes'])
'''
model = onnx.load(model_path)
init_net, predict_net = c2.onnx_graph_to_caffe2_net(model)

print(f"Save the model in binary format to the files {init_net_path} and {predict_net_path}.")

with open(init_net_path, "wb") as fopen:
    fopen.write(init_net.SerializeToString())
with open(predict_net_path, "wb") as fopen:
    fopen.write(predict_net.SerializeToString())

print(f"Save the model in txt format to the files {init_net_txt_path} and {predict_net_txt_path}.")
with open(init_net_txt_path, 'w') as f:
    f.write(str(init_net))

with open(predict_net_txt_path, 'w') as f:
    f.write(str(predict_net))
'''
--------------------------------------------------------------------------------
/vision/utils/model_book.py:
--------------------------------------------------------------------------------
from collections import OrderedDict
import torch.nn as nn


class ModelBook:
    """Maintain the mapping between modules and their paths.

    Example:
        book = ModelBook(model_ft)
        for p, m in book.conv2d_modules():
            print('path:', p, 'num of filters:', m.out_channels)
            assert m is book.get_module(p)
    """

    def __init__(self, model):
        self._model = model
        self._modules = OrderedDict()
        self._paths = OrderedDict()
        path = []
        self._construct(self._model, path)

    def _construct(self, module, path):
        if not module._modules:
            return
        for name, m in module._modules.items():
            cur_path = tuple(path + [name])
            self._paths[m] = cur_path
            self._modules[cur_path] = m
            self._construct(m, path + [name])

    def conv2d_modules(self):
        return self.modules(nn.Conv2d)

    def linear_modules(self):
        return self.modules(nn.Linear)

    def modules(self, module_type=None):
        for p, m in self._modules.items():
            if not module_type or isinstance(m, module_type):
                yield p, m

    def num_of_conv2d_modules(self):
        return self.num_of_modules(nn.Conv2d)

    def num_of_conv2d_filters(self):
        """Return the sum of out_channels of all conv2d layers.

        Here we treat the sub weight with size of [in_channels, h, w] as a single filter.
        """
        num_filters = 0
        for _, m in self.conv2d_modules():
            num_filters += m.out_channels
        return num_filters

    def num_of_linear_modules(self):
        return self.num_of_modules(nn.Linear)

    def num_of_linear_filters(self):
        num_filters = 0
        for _, m in self.linear_modules():
            num_filters += m.out_features
        return num_filters

    def num_of_modules(self, module_type=None):
        num = 0
        for p, m in self._modules.items():
            if not module_type or isinstance(m, module_type):
                num += 1
        return num

    def get_module(self, path):
        return self._modules.get(path)

    def get_path(self, module):
        return self._paths.get(module)

    def update(self, path, module):
        old_module = self._modules[path]
        del self._paths[old_module]
        self._paths[module] = path
        self._modules[path] = module
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MobileNetV3-SSD


MobileNetV3-SSD implementation in PyTorch

For the second version, which includes test results, see https://github.com/shaoshengsong/MobileNetV3-SSD-Compact-Version

If you want to try newer techniques, see https://github.com/shaoshengsong/quarkdet, a lightweight object-detection project covering multiple models.

**Purpose**

Object Detection

Applying MobileNetV3-SSD to object detection.

Environment

OS: Ubuntu 18.04

Python: 3.6

PyTorch: 1.1.0


**Object detection with MobileNetV3-SSD**

**Supports ONNX export**

Code references (this project borrows heavily from the following):


**1. SSD part**


[A PyTorch Implementation of Single Shot MultiBox Detector](https://github.com/amdegroot/ssd.pytorch)

**2. MobileNetV3 part**



[1 mobilenetv3 with pytorch, provides pre-trained model](https://github.com/xiaolai-sqlai/mobilenetv3)


[2 MobileNetV3 in pytorch and ImageNet pretrained models](https://github.com/kuan-wang/pytorch-mobilenet-v3)


[3 Implementing Searching for MobileNetV3 paper using Pytorch](https://github.com/leaderj1001/MobileNetV3-Pytorch)


[4 MobileNetV1, MobileNetV2, VGG based SSD/SSD-lite implementation in Pytorch 1.0 / Pytorch 0.4. Out-of-box support for retraining on Open Images dataset. ONNX and Caffe2 support. Experiment Ideas like CoordConv.
no discernible latency cost](https://github.com/qfgaohao/pytorch-ssd).


Regarding 4: this repo does not keep code compatibility with MobileNetV1, MobileNetV2, and so on; only MobileNetV3 is usable.

**Download the data**

This example uses the Cake and Bread classes because the download is small:
all classes together total 561 GB, while Cake and Bread are only 3.2 GB.

python3 open_images_downloader.py --root /media/santiago/a/data/open_images --class_names "Cake,Bread" --num_workers 20


**Training**

**First training run**

python3 train_ssd.py --dataset_type open_images --datasets /media/santiago/data/open_images --net mb3-ssd-lite --scheduler cosine --lr 0.01 --t_max 100 --validation_epochs 5 --num_epochs 100 --base_net_lr 0.001 --batch_size 5


**Resuming from a previously trained model**

python3 train_ssd.py --dataset_type open_images --datasets /media/santiago/data/open_images --net mb3-ssd-lite --pretrained_ssd models/mb3-ssd-lite-Epoch-99-Loss-2.5194434596402613.pth --scheduler cosine --lr 0.01 --t_max 100 --validation_epochs 5 --num_epochs 200 --base_net_lr 0.001 --batch_size 5



**Testing a single image**

python run_ssd_example.py mb3-ssd-lite models/mb3-ssd-lite-Epoch-99-Loss-2.5194434596402613.pth models/open-images-model-labels.txt /home/santiago/picture/test.jpg

**Live video detection**

python3 run_ssd_live_demo.py mb3-ssd-lite models/mb3-ssd-lite-Epoch-99-Loss-2.5194434596402613.pth models/open-images-model-labels.txt


**Cake and Bread pretrained model**


Link: https://pan.baidu.com/s/1byY1eJk3Hm3CTp-29KirxA

Extraction code: qxwv

**VOC dataset pretrained model**

Link: https://pan.baidu.com/s/1yt_IRY0RcgSxB-YwywoHuA

Extraction code: 2sta
--------------------------------------------------------------------------------
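run_ssd_example.py (below, truncated in this listing) essentially wraps the Predictor class defined next. A minimal inference sketch — assuming the factory signature create_mobilenetv3_ssd_lite_predictor(net, candidate_size=...) carried over from upstream pytorch-ssd, which this dump does not show:

import cv2
from vision.ssd.mobilenet_v3_ssd_lite import create_mobilenetv3_ssd_lite, create_mobilenetv3_ssd_lite_predictor

class_names = [name.strip() for name in open("models/open-images-model-labels.txt").readlines()]
net = create_mobilenetv3_ssd_lite(len(class_names), is_test=True)
net.load("models/mb3-ssd-lite-Epoch-99-Loss-2.5194434596402613.pth")
predictor = create_mobilenetv3_ssd_lite_predictor(net, candidate_size=200)

orig_image = cv2.imread("test.jpg")
image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)  # the transforms expect RGB layout
boxes, labels, probs = predictor.predict(image, 10, 0.4)  # top 10 boxes above probability 0.4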
/vision/ssd/predictor.py:
--------------------------------------------------------------------------------
import torch

from ..utils import box_utils
from .data_preprocessing import PredictionTransform
from ..utils.misc import Timer


class Predictor:
    def __init__(self, net, size, mean=0.0, std=1.0, nms_method=None,
                 iou_threshold=0.45, filter_threshold=0.01, candidate_size=200, sigma=0.5, device=None):
        self.net = net
        self.transform = PredictionTransform(size, mean, std)
        self.iou_threshold = iou_threshold
        self.filter_threshold = filter_threshold
        self.candidate_size = candidate_size
        self.nms_method = nms_method

        self.sigma = sigma
        if device:
            self.device = device
        else:
            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.net.to(self.device)
        self.net.eval()

        self.timer = Timer()

    def predict(self, image, top_k=-1, prob_threshold=None):
        cpu_device = torch.device("cpu")
        height, width, _ = image.shape
        image = self.transform(image)
        images = image.unsqueeze(0)
        images = images.to(self.device)
        with torch.no_grad():
            self.timer.start()
            scores, boxes = self.net.forward(images)
            print("Inference time: ", self.timer.end())
        boxes = boxes[0]
        scores = scores[0]
        if not prob_threshold:
            prob_threshold = self.filter_threshold
        # this version of nms is slower on GPU, so we move data to CPU.
        boxes = boxes.to(cpu_device)
        scores = scores.to(cpu_device)
        picked_box_probs = []
        picked_labels = []
        for class_index in range(1, scores.size(1)):
            probs = scores[:, class_index]
            mask = probs > prob_threshold
            probs = probs[mask]
            if probs.size(0) == 0:
                continue
            subset_boxes = boxes[mask, :]
            box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1)
            box_probs = box_utils.nms(box_probs, self.nms_method,
                                      score_threshold=prob_threshold,
                                      iou_threshold=self.iou_threshold,
                                      sigma=self.sigma,
                                      top_k=top_k,
                                      candidate_size=self.candidate_size)
            picked_box_probs.append(box_probs)
            picked_labels.extend([class_index] * box_probs.size(0))
        if not picked_box_probs:
            return torch.tensor([]), torch.tensor([]), torch.tensor([])
        picked_box_probs = torch.cat(picked_box_probs)
        picked_box_probs[:, 0] *= width
        picked_box_probs[:, 1] *= height
        picked_box_probs[:, 2] *= width
        picked_box_probs[:, 3] *= height
        return picked_box_probs[:, :4], torch.tensor(picked_labels), picked_box_probs[:, 4]
--------------------------------------------------------------------------------
/run_ssd_example.py:
--------------------------------------------------------------------------------
from vision.ssd.vgg_ssd import create_vgg_ssd, create_vgg_ssd_predictor
from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd, create_mobilenetv1_ssd_predictor
from vision.ssd.mobilenetv1_ssd_lite import create_mobilenetv1_ssd_lite, create_mobilenetv1_ssd_lite_predictor
from vision.ssd.squeezenet_ssd_lite import create_squeezenet_ssd_lite, create_squeezenet_ssd_lite_predictor
from vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite, create_mobilenetv2_ssd_lite_predictor
from vision.utils.misc import Timer
import cv2
import sys

from vision.ssd.mobilenet_v3_ssd_lite import create_mobilenetv3_ssd_lite, create_mobilenetv3_ssd_lite_predictor

if len(sys.argv) < 5:
    print('Usage: python run_ssd_example.py