├── README.md ├── data └── VG │ ├── test_list_500.txt │ ├── train_list_500.txt │ └── vg_category_500_labels_index.json ├── datasets ├── __init__.py ├── cocodataset.py ├── vgdataset.py ├── voc07dataset.py └── voc12dataset.py ├── element_wise_layer.py ├── ggnn.py ├── images ├── framework.png └── pipeline.png ├── main.py ├── main_coco.sh ├── main_vg.sh ├── main_voc07.sh ├── main_voc12.sh ├── models.py ├── networks ├── __init__.py ├── __init__.pyc ├── resnet.py └── resnet.pyc ├── semantic.py └── utils ├── __init__.py ├── __init__.pyc ├── cocodataset.pyc ├── load_pretrain_model.py ├── load_pretrain_model.pyc ├── metrics.py ├── metrics.pyc ├── transforms.py └── transforms.pyc /README.md: -------------------------------------------------------------------------------- 1 | # Learning Semantic-Specific Graph Representation for Multi-Label Image Recognition 2 | 3 | Implementation of the paper "[Learning Semantic-Specific Graph Representation for Multi-Label Image Recognition](https://arxiv.org/abs/1908.07325)" (ICCV 2019) by Tianshui Chen, Muxin Xu, Xiaolu Hui, Hefeng Wu, and Liang Lin. 4 | 5 | 6 | ![Pipeline](./images/pipeline.png) 7 | 8 | 9 | ## Environment 10 | Python 2.7 11 | PyTorch 0.4.1 12 | Ubuntu 14.04 LTS 13 | 14 | 15 | 16 | ## Datasets 17 | [Microsoft COCO](http://cocodataset.org/#home) - 80 common object categories 18 | 19 | [Pascal VOC 2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/) - 20 common object categories 20 | 21 | [Pascal VOC 2012](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/) - 20 common object categories 22 | 23 | [VisualGenome](https://visualgenome.org/) - a subset of VG covering the 500 most common object categories 24 | 25 | 26 | ## Models, features, and adjacency matrices 27 | You can download the data files and our best models [here](https://pan.baidu.com/s/1OtPUX3QEbWkk6mYGv9fk1Q). 28 | Password: ep6u 29 | 30 | ## Usage 31 | git clone https://github.com/Mu-xsan/SSGRL.git 32 | 33 | cd SSGRL 34 | 35 | mkdir data (download the required data and put it here) 36 | 37 | ### Run Microsoft COCO 38 | bash main_coco.sh [GPU_id] [Remark for this experiment] 39 | ### Run Pascal VOC 2007 40 | bash main_voc07.sh [GPU_id] [Remark for this experiment] 41 | ### Run Pascal VOC 2012 42 | bash main_voc12.sh [GPU_id] [Remark for this experiment] 43 | ### Run VisualGenome-500 44 | bash main_vg.sh [GPU_id] [Remark for this experiment] 45 | 46 | ## Results 47 | Microsoft COCO: 48 | 49 | |Method| mAP| CP|CR|CF1|OP|OR|OF1| 50 | |---------|-------|-------|---------|-------|-------|---------|-------| 51 | |SSGRL|83.8|89.9|68.5|76.8|91.3|70.8|79.7| 52 | 53 | Pascal VOC 2007: 54 | 55 | | Classes | AP(SSGRL)| AP(pre) | 56 | |-------------|--------|--------| 57 | |aeroplane|99.5|99.7| 58 | |bicycle|97.1|98.4| 59 | |bird|97.6|98.0| 60 | |boat|97.8|97.6| 61 | |bottle|82.6|85.7| 62 | |bus|94.8|96.2| 63 | |car|96.7|98.2| 64 | |cat|98.1|98.8| 65 | |chair|78.0|82.0| 66 | |cow|97.0|98.1| 67 | |diningtable|85.6|89.7| 68 | |dog|97.8|98.8| 69 | |horse|98.3|98.7| 70 | |motorbike|96.4|97.0| 71 | |person|98.8|99.0| 72 | |pottedplant|84.9|86.9| 73 | |sheep|96.5|98.1| 74 | |sofa|79.8|85.8| 75 | |train|98.4|99.0| 76 | |tvmonitor|92.8|93.7| 77 | | mAP | 93.4|95.0| 78 | 79 | Pascal VOC 2012: 80 | 81 | | Classes | AP(SSGRL)| AP(pre) | 82 | |-------------|--------|--------| 83 | |aeroplane|99.5|99.7| 84 | |bicycle|95.1|96.1| 85 | |bird|97.4|97.7| 86 | |boat|96.4|96.5| 87 | |bottle|85.8|86.9| 88 | |bus|94.5|95.8| 89 | |car|93.7|95.0| 90 | |cat|98.9|98.9| 91 | |chair|86.7|88.3| 92 | |cow|96.3|97.6| 93 | |diningtable|84.6|87.4| 94
| |dog|98.9| 99.1| 95 | |horse|98.6|99.2| 96 | |motorbike|96.2|97.3| 97 | |person|98.7|99.0| 98 | |pottedplant|82.2|84.8| 99 | |sheep|98.2|98.3| 100 | |sofa|84.2|85.8| 101 | |train|98.1|99.2| 102 | |tvmonitor|93.5|94.1| 103 | | mAP | 93.9|94.8| 104 | 105 | VisualGenome-500 106 | 107 | | Method | mAP| 108 | |-------------|--------| 109 | |SSGRL|36.6| 110 | 111 | ## Citation 112 | @inproceedings{chen2019learning, 113 | title={Learning semantic-specific graph representation for multi-label image recognition}, 114 | author={Chen, Tianshui and Xu, Muxin and Hui, Xiaolu and Wu, Hefeng and Lin, Liang}, 115 | booktitle={Proceedings of the IEEE International Conference on Computer Vision}, 116 | pages={522--531}, 117 | year={2019} 118 | } 119 | @article{chen2020knowledge, 120 | title={Knowledge-guided multi-label few-shot learning for general image recognition}, 121 | author={Chen, Tianshui and Lin, Liang and Hui, Xiaolu and Chen, Riquan and Wu, Hefeng}, 122 | journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, 123 | year={2022}, 124 | publisher={IEEE} 125 | } 126 | 127 | 128 | ## Contributing 129 | For any questions, feel free to open an issue or contact us (tianshuichen@gmail.com & xumx7@mail2.sysu.edu.cn & huixlu@mail2.sysu.edu.cn) 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/datasets/__init__.py -------------------------------------------------------------------------------- /datasets/cocodataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append('/data1/multi-label/MS-COCO_2014/cocoapi/PythonAPI') 4 | #sys.path.append('/home/chentianshui/xmx/multi-label/cocoapi/PythonAPI') 5 | import torchvision.datasets as dset 6 | import torchvision.transforms as transforms 7 | import torch.utils.data as data 8 | from PIL import Image 9 | import numpy as np 10 | import json 11 | import random 12 | 13 | 14 | class CoCoDataset(data.Dataset): 15 | def __init__(self, image_dir, anno_path, input_transform=None, labels_path=None): 16 | self.coco = dset.CocoDetection(root=image_dir, annFile=anno_path) 17 | with open('./data/coco/category.json','r') as load_category: 18 | self.category_map = json.load(load_category) 19 | self.input_transform = input_transform 20 | self.labels_path = labels_path 21 | 22 | self.labels = [] 23 | if self.labels_path: 24 | self.labels = np.load(self.labels_path).astype(np.float64) 25 | self.labels = (self.labels > 0).astype(np.float64) 26 | else: 27 | l = len(self.coco) 28 | for i in range(l): 29 | item = self.coco[i] 30 | print(i) 31 | categories = self.getCategoryList(item[1]) 32 | label = self.getLabelVector(categories) 33 | self.labels.append(label) 34 | 35 | 36 | def __getitem__(self, index): 37 | input = self.coco[index][0] 38 | if self.input_transform: 39 | input = self.input_transform(input) 40 | return input, self.labels[index] 41 | 42 | 43 | def getCategoryList(self, item): 44 | categories = set() 45 | for t in item: 46 | categories.add(t['category_id']) 47 | return list(categories) 48 | 49 | def getLabelVector(self, categories): 50 | label = np.zeros(80) 51 | label_num = len(categories) 52 | for c in categories: 53 | index = 
self.category_map[str(c)]-1 54 | label[index] = 1.0 / label_num 55 | return label 56 | 57 | 58 | def __len__(self): 59 | return len(self.coco) 60 | -------------------------------------------------------------------------------- /datasets/vgdataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import torchvision.datasets as dset 4 | import torchvision.transforms as transforms 5 | import torch.utils.data as data 6 | from PIL import Image 7 | import numpy as np 8 | import json 9 | import random 10 | import os 11 | #category_info = '../data/VG/vg_category_1000.json' 12 | 13 | class VGDataset(data.Dataset): 14 | def __init__(self, img_dir, img_list, input_transform, label_path): 15 | with open(img_list, 'r') as f: 16 | self.img_names = f.readlines() 17 | with open(label_path, 'r') as f: 18 | self.labels = json.load(f) 19 | 20 | self.input_transform = input_transform 21 | self.img_dir = img_dir 22 | self.num_classes= 500 23 | 24 | def __getitem__(self, index): 25 | name = self.img_names[index][:-1] 26 | input = Image.open(os.path.join(self.img_dir, name)).convert('RGB') 27 | #b, g, r = input.split() 28 | #input = Image.merge("RGB", (r, g, b)) 29 | if self.input_transform: 30 | input = self.input_transform(input) 31 | label = np.zeros(self.num_classes).astype(np.float32) 32 | label[self.labels[name]] = 1.0 33 | return input, label 34 | 35 | def __len__(self): 36 | return len(self.img_names) 37 | 38 | -------------------------------------------------------------------------------- /datasets/voc07dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import torchvision.datasets as dset 4 | import torchvision.transforms as transforms 5 | import torch.utils.data as data 6 | from PIL import Image 7 | import numpy as np 8 | import json 9 | import random 10 | from xml.dom.minidom import parse 11 | import xml.dom.minidom 12 | import os 13 | category_info = {'aeroplane':0, 'bicycle':1, 'bird':2, 'boat':3, 'bottle':4, 14 | 'bus':5, 'car':6, 'cat':7, 'chair':8, 'cow':9, 15 | 'diningtable':10, 'dog':11, 'horse':12, 'motorbike':13, 'person':14, 16 | 'pottedplant':15, 'sheep':16, 'sofa':17, 'train':18, 'tvmonitor':19} 17 | 18 | class Voc07Dataset(data.Dataset): 19 | def __init__(self, img_dir='./data/VOCdevkit/VOC2007/JPEGImages', anno_path='./data/VOCdevkit/VOC2007/Main/trainval.txt', input_transform=None, labels_path='./data/VOCdevkit/VOC2007/Annotations'): 20 | self.img_names = [] 21 | with open(anno_path, 'r') as f: 22 | self.img_names = f.readlines() 23 | self.img_dir = img_dir 24 | 25 | self.labels = [] 26 | for name in self.img_names: 27 | label_file = os.path.join(labels_path,name[:-1]+'.xml') 28 | label_vector = np.zeros(20) 29 | DOMTree = xml.dom.minidom.parse(label_file) 30 | root = DOMTree.documentElement 31 | objects = root.getElementsByTagName('object') 32 | for obj in objects: 33 | if (obj.getElementsByTagName('difficult')[0].firstChild.data) == '1': 34 | continue 35 | tag = obj.getElementsByTagName('name')[0].firstChild.data.lower() 36 | label_vector[int(category_info[tag])] = 1.0 37 | self.labels.append(label_vector) 38 | self.labels = np.array(self.labels).astype(np.float32) 39 | self.input_transform = input_transform 40 | def __getitem__(self, index): 41 | name = self.img_names[index][:-1]+'.jpg' 42 | input = Image.open(os.path.join(self.img_dir, name)).convert('RGB') 43 | 44 | if self.input_transform: 45 | input = self.input_transform(input) 
46 | return input, self.labels[index] 47 | 48 | def __len__(self): 49 | return len(self.img_names) 50 | -------------------------------------------------------------------------------- /datasets/voc12dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import torchvision.datasets as dset 4 | import torchvision.transforms as transforms 5 | import torch.utils.data as data 6 | from PIL import Image 7 | import numpy as np 8 | import json 9 | import random 10 | from xml.dom.minidom import parse 11 | import xml.dom.minidom 12 | import os 13 | category_info = {'aeroplane':0, 'bicycle':1, 'bird':2, 'boat':3, 'bottle':4, 14 | 'bus':5, 'car':6, 'cat':7, 'chair':8, 'cow':9, 15 | 'diningtable':10, 'dog':11, 'horse':12, 'motorbike':13, 'person':14, 16 | 'pottedplant':15, 'sheep':16, 'sofa':17, 'train':18, 'tvmonitor':19} 17 | 18 | class Voc12Dataset(data.Dataset): 19 | def __init__(self, img_dir='./data/VOCdevkit/VOC2012/JPEGImages', anno_path='./data/VOCdevkit/VOC2012/Main/trainval.txt', input_transform=None, labels_path='./data/VOCdevkit/VOC2012/Annotations'): 20 | self.img_names = [] 21 | with open(anno_path, 'r') as f: 22 | self.img_names = f.readlines() 23 | self.img_dir = img_dir 24 | self.labels = [] 25 | if labels_path == './': 26 | # no ground truth of test data of voc12, just a placeholder 27 | self.labels = np.ones((len(self.img_names),20)) 28 | else: 29 | for name in self.img_names: 30 | label_file = os.path.join(labels_path,name[:-1]+'.xml') 31 | label_vector = np.zeros(20) 32 | DOMTree = xml.dom.minidom.parse(label_file) 33 | root = DOMTree.documentElement 34 | objects = root.getElementsByTagName('object') 35 | for obj in objects: 36 | if (obj.getElementsByTagName('difficult')[0].firstChild.data) == '1': 37 | continue 38 | tag = obj.getElementsByTagName('name')[0].firstChild.data.lower() 39 | label_vector[int(category_info[tag])] = 1.0 40 | self.labels.append(label_vector) 41 | self.labels = np.array(self.labels).astype(np.float32) 42 | self.input_transform = input_transform 43 | def __getitem__(self, index): 44 | name = self.img_names[index][:-1]+'.jpg' 45 | input = Image.open(os.path.join(self.img_dir, name)).convert('RGB') 46 | 47 | if self.input_transform: 48 | input = self.input_transform(input) 49 | return input, self.labels[index] 50 | 51 | def __len__(self): 52 | return len(self.img_names) 53 | -------------------------------------------------------------------------------- /element_wise_layer.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn.parameter import Parameter 5 | import torch.nn.functional as F 6 | import torch.nn as nn 7 | 8 | class Element_Wise_Layer(nn.Module): 9 | def __init__(self, in_features, out_features, bias=True): 10 | super(Element_Wise_Layer, self).__init__() 11 | self.in_features = in_features 12 | self.out_features = out_features 13 | self.weight = Parameter(torch.Tensor(in_features, out_features)) 14 | if bias: 15 | self.bias = Parameter(torch.Tensor(in_features)) 16 | else: 17 | self.register_parameter('bias', None) 18 | self.reset_parameters() 19 | 20 | def reset_parameters(self): 21 | stdv = 1. 
/ math.sqrt(self.weight.size(1)) 22 | for i in range(self.in_features): 23 | self.weight[i].data.uniform_(-stdv, stdv) 24 | if self.bias is not None: 25 | for i in range(self.in_features): 26 | self.bias[i].data.uniform_(-stdv, stdv) 27 | 28 | 29 | def forward(self, input): 30 | #print('input_size: {}'.format(input.size())) 31 | #(class_num, feature_dim) 32 | #print('weight size: {}'.format(self.weight.size())) 33 | x = input * self.weight 34 | #(class_num, 1) 35 | x = torch.sum(x,2) 36 | #print('after reducing(sum): {}'.format(x.size())) 37 | if self.bias is not None: 38 | x = x + self.bias 39 | return x 40 | 41 | def extra_repr(self): 42 | return 'in_features={}, out_features={}, bias={}'.format( 43 | self.in_features, self.out_features, self.bias is not None) 44 | -------------------------------------------------------------------------------- /ggnn.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | import numpy as np 8 | 9 | class GGNN(nn.Module): 10 | def __init__(self, input_dim, time_step, in_matrix,out_matrix): 11 | super(GGNN, self).__init__() 12 | self.input_dim = input_dim 13 | self.time_step = time_step 14 | self._in_matrix = in_matrix 15 | self._out_matrix = out_matrix 16 | 17 | self.fc_eq3_w = nn.Linear(2*input_dim, input_dim) 18 | self.fc_eq3_u = nn.Linear(input_dim, input_dim) 19 | self.fc_eq4_w = nn.Linear(2*input_dim, input_dim) 20 | self.fc_eq4_u = nn.Linear(input_dim, input_dim) 21 | self.fc_eq5_w = nn.Linear(2*input_dim, input_dim) 22 | self.fc_eq5_u = nn.Linear(input_dim, input_dim) 23 | 24 | def forward(self, input): 25 | batch_size = input.size()[0] 26 | input = input.view(-1, self.input_dim) 27 | node_num = self._in_matrix.size()[0] 28 | batch_aog_nodes = input.view(batch_size, node_num, self.input_dim) 29 | batch_in_matrix = self._in_matrix.repeat(batch_size, 1).view(batch_size, node_num, -1) 30 | batch_out_matrix = self._out_matrix.repeat(batch_size, 1).view(batch_size, node_num, -1) 31 | for t in xrange(self.time_step): 32 | # eq(2) 33 | av = torch.cat((torch.bmm(batch_in_matrix, batch_aog_nodes), torch.bmm(batch_out_matrix, batch_aog_nodes)), 2) 34 | av = av.view(batch_size * node_num, -1) 35 | 36 | flatten_aog_nodes = batch_aog_nodes.view(batch_size * node_num, -1) 37 | 38 | # eq(3) 39 | zv = torch.sigmoid(self.fc_eq3_w(av) + self.fc_eq3_u(flatten_aog_nodes)) 40 | 41 | # eq(4) 42 | rv = torch.sigmoid(self.fc_eq4_w(av) + self.fc_eq3_u(flatten_aog_nodes)) 43 | 44 | #eq(5) 45 | hv = torch.tanh(self.fc_eq5_w(av) + self.fc_eq5_u(rv * flatten_aog_nodes)) 46 | 47 | flatten_aog_nodes = (1 - zv) * flatten_aog_nodes + zv * hv 48 | batch_aog_nodes = flatten_aog_nodes.view(batch_size, node_num, -1) 49 | return batch_aog_nodes 50 | 51 | 52 | -------------------------------------------------------------------------------- /images/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/images/framework.png -------------------------------------------------------------------------------- /images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/images/pipeline.png 
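A minimal shape check for the GGNN module above (illustrative only, not part of the repository): the random adjacency matrix, toy dimensions, and variable names are assumptions, and it presumes the Python 2.7 / PyTorch 0.4.1 environment from the README, since forward() uses xrange. In the real pipeline the in/out matrices are built from the -graph_file .npy and moved to the GPU by models.SSGRL.load_matrix().

import torch
from ggnn import GGNN

num_nodes, feat_dim, batch = 5, 8, 2              # toy sizes; SSGRL uses num_classes nodes of dim 2048
adj = torch.rand(num_nodes, num_nodes)            # stand-in for the label adjacency matrix
ggnn = GGNN(input_dim=feat_dim, time_step=3,
            in_matrix=adj, out_matrix=adj.t())
nodes = torch.rand(batch, num_nodes, feat_dim)    # one feature vector per label node
out = ggnn(nodes)
print(out.size())                                 # torch.Size([2, 5, 8]): propagation keeps the node shape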
-------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os,sys 3 | import shutil 4 | import time,pickle 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.nn.parallel 10 | import torch.backends.cudnn as cudnn 11 | import torch.optim 12 | import torch.utils.data 13 | import torchvision.transforms as transforms 14 | import torchvision.models as models 15 | from torch.utils.data import DataLoader 16 | 17 | from utils.transforms import get_train_test_set 18 | from networks.resnet import resnet101 19 | from utils.load_pretrain_model import load_pretrain_model 20 | from utils.metrics import voc12_mAP 21 | from models import SSGRL 22 | 23 | global best_prec1 24 | best_prec1 = 0 25 | 26 | def arg_parse(): 27 | parser = argparse.ArgumentParser(description='PyTorch multi label Training') 28 | parser.add_argument('dataset', metavar='DATASET', 29 | help='path to train dataset') 30 | parser.add_argument('train_data', metavar='DIR', 31 | help='path to train dataset') 32 | parser.add_argument('test_data', metavar='DIR', 33 | help='path to test dataset') 34 | parser.add_argument('trainlist', metavar='DIR', 35 | help='path to train list') 36 | parser.add_argument('testlist', metavar='DIR', 37 | help='path to test list') 38 | parser.add_argument('-pm','--pretrain_model', default='', type=str, metavar='PATH', 39 | help='path to latest pretrained_model (default: none)') 40 | parser.add_argument('-train_label', default='', type=str, metavar='PATH', 41 | help='path to train label (default: none)') 42 | parser.add_argument('-graph_file', default='', type=str, metavar='PATH', 43 | help='path to graph (default: none)') 44 | parser.add_argument('-word_file', default='', type=str, metavar='PATH', 45 | help='path to word feature') 46 | parser.add_argument('-test_label', default='', type=str, metavar='PATH', 47 | help='path to test label (default: none)') 48 | parser.add_argument('--print_freq', '-p', default=100, type=int, metavar='N', 49 | help='number of print_freq (default: 100)') 50 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 51 | help='number of data loading workers (default: 4)') 52 | parser.add_argument('--epochs', default=90, type=int, metavar='N', 53 | help='number of total epochs to run') 54 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 55 | help='manual epoch number (useful on restarts)') 56 | parser.add_argument('--step_epoch', default=30, type=int, metavar='N', 57 | help='decend the lr in epoch number') 58 | parser.add_argument('-b', '--batch-size', default=256, type=int, 59 | metavar='N', help='mini-batch size (default: 256)') 60 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 61 | metavar='LR', help='initial learning rate') 62 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 63 | help='momentum') 64 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, 65 | metavar='N', help='print frequency (default: 10)') 66 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 67 | help='path to latest checkpoint (default: none)') 68 | parser.add_argument('--pretrained', dest='pretrained', type=int,default=0, 69 | help='use pre-trained model') 70 | parser.add_argument('--crop_size', dest='crop_size',default=224, type=int, 71 | help='crop size') 72 | 
parser.add_argument('--scale_size', dest = 'scale_size',default=448, type=int, 73 | help='the size of the rescale image') 74 | parser.add_argument('--evaluate', dest='evaluate', action='store_true', 75 | help='evaluate model on validation set') 76 | parser.add_argument('--post', dest='post', type=str,default='', 77 | help='postname of save model') 78 | parser.add_argument('--num_classes', '-n', default=80, type=int, metavar='N', 79 | help='number of classes (default: 80)') 80 | args = parser.parse_args() 81 | return args 82 | 83 | def print_args(args): 84 | print "==========================================" 85 | print "========== CONFIG =============" 86 | print "==========================================" 87 | for arg,content in args.__dict__.items(): 88 | print "{}:{}".format(arg,content) 89 | print "\n" 90 | 91 | 92 | def main(): 93 | global best_prec1 94 | args = arg_parse() 95 | print_args(args) 96 | 97 | # Create dataloader 98 | print "==> Creating dataloader..." 99 | train_data_dir = args.train_data 100 | test_data_dir = args.test_data 101 | train_list = args.trainlist 102 | test_list = args.testlist 103 | train_label = args.train_label 104 | test_label = args.test_label 105 | train_loader,test_loader = get_train_test_set(train_data_dir,test_data_dir,train_list,test_list,train_label, test_label,args) 106 | 107 | # load the network 108 | print "==> Loading the network ..." 109 | 110 | model = SSGRL(image_feature_dim=2048, 111 | output_dim=2048, time_step=3, 112 | adjacency_matrix=args.graph_file, 113 | word_features=args.word_file, 114 | num_classes=args.num_classes) 115 | 116 | if args.pretrained: 117 | model = load_pretrain_model(model,args) 118 | model.cuda() 119 | 120 | criterion = nn.BCEWithLogitsLoss(reduce=True, size_average=True).cuda() 121 | for p in model.resnet_101.parameters(): 122 | p.requires_grad=False 123 | for p in model.resnet_101.layer4.parameters(): 124 | p.requires_grad=True 125 | optimizer = torch.optim.Adam(filter(lambda p : p.requires_grad,model.parameters()), lr=args.lr) 126 | 127 | if args.resume: 128 | if os.path.isfile(args.resume): 129 | print("=> loading checkpoint '{}'".format(args.resume)) 130 | checkpoint = torch.load(args.resume) 131 | args.start_epoch = checkpoint['epoch'] 132 | best_prec1 = checkpoint['best_mAP'] 133 | model.load_state_dict(checkpoint['state_dict']) 134 | print("=> loaded checkpoint '{}' (epoch {})" 135 | .format(args.resume, checkpoint['epoch'])) 136 | else: 137 | print("=> no checkpoint found at '{}'".format(args.resume)) 138 | 139 | cudnn.benchmark = True 140 | 141 | if args.evaluate: 142 | with torch.no_grad(): 143 | validate(test_loader, model, criterion, 0, args) 144 | return 145 | 146 | for epoch in range(args.start_epoch,args.epochs): 147 | train(train_loader, model, criterion, optimizer, epoch, args) 148 | 149 | # evaluate on validation set 150 | with torch.no_grad(): 151 | mAP = validate(test_loader, model, criterion, epoch, args) 152 | # remember best prec@1 and save checkpoint 153 | is_best = mAP > best_prec1 154 | best_prec1 = max(mAP, best_prec1) 155 | save_checkpoint({ 156 | 'epoch': epoch + 1, 157 | 'state_dict': model.state_dict(), 158 | 'best_mAP': mAP, 159 | }, is_best,args) 160 | 161 | def train(train_loader, model, criterion, optimizer, epoch, args): 162 | batch_time = AverageMeter() 163 | data_time = AverageMeter() 164 | losses = AverageMeter() 165 | top1 = AverageMeter() 166 | top5 = AverageMeter() 167 | 168 | model.train() 169 | end = time.time() 170 | model.resnet_101.eval() 171 | 
model.resnet_101.layer4.train() 172 | for i, (input, target) in enumerate(train_loader): 173 | # measure data loading time 174 | data_time.update(time.time() - end) 175 | target = torch.tensor(target).cuda(async=True) 176 | input_var = torch.tensor(input).cuda() 177 | # compute output 178 | 179 | t1 = time.time() 180 | output = model(input_var) 181 | target = target.float() 182 | target = target.cuda(async=True) 183 | target_var = torch.autograd.Variable(target) 184 | loss = criterion(output, target_var) 185 | losses.update(loss.data[0], input.size(0)) 186 | 187 | # compute gradient and do SGD step 188 | optimizer.zero_grad() 189 | loss.backward() 190 | optimizer.step() 191 | 192 | # measure elapsed time 193 | batch_time.update(time.time() - end) 194 | end = time.time() 195 | 196 | if i % args.print_freq == 0: 197 | print('Epoch: [{0}][{1}/{2}]\t' 198 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 199 | 'Loss {loss.val:.4f} ({loss.avg:.4f})'.format( 200 | epoch, i, len(train_loader), batch_time=batch_time, 201 | loss=losses)) 202 | 203 | def validate(val_loader, model, criterion,epoch,args): 204 | batch_time = AverageMeter() 205 | losses = AverageMeter() 206 | top1 = AverageMeter() 207 | top5 = AverageMeter() 208 | 209 | # switch to evaluate mode 210 | model.eval() 211 | end = time.time() 212 | x=[] 213 | for i, (input, target) in enumerate(val_loader): 214 | target = torch.tensor(target).cuda(async=True) 215 | input_var = torch.tensor(input).cuda() 216 | output = model(input_var) 217 | target = target.float() 218 | target = target.cuda(async=True) 219 | target_var = torch.autograd.Variable(target) 220 | loss = criterion(output, target_var) 221 | losses.update(loss.data[0],input.size(0)) 222 | 223 | mask = (target > 0).float() 224 | v = torch.cat((output, mask),1) 225 | x.append(v) 226 | 227 | # measure elapsed time 228 | batch_time.update(time.time() - end) 229 | end = time.time() 230 | 231 | if i % args.print_freq == 0: 232 | print('Test: [{0}/{1}]\t' 233 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 234 | 'Loss {loss.val:.4f} ({loss.avg:.4f})'.format( 235 | i, len(val_loader), batch_time=batch_time, loss=losses)) 236 | x = torch.cat(x,0) 237 | x = x.cpu().detach().numpy() 238 | print(x.shape) 239 | np.savetxt(args.post+'_score', x) 240 | mAP=voc12_mAP(args.post+'_score', args.num_classes) 241 | print(' * mAP {mAP:.3f}'.format(mAP=mAP)) 242 | return mAP 243 | 244 | def save_checkpoint(state, is_best, args,filename='checkpoint.pth.tar'): 245 | filename = 'checkpoint_{}.pth.tar'.format(args.post) 246 | torch.save(state, filename) 247 | if is_best: 248 | shutil.copyfile(filename, 'model_best_{}.pth.tar'.format(args.post)) 249 | 250 | class AverageMeter(object): 251 | """Computes and stores the average and current value""" 252 | def __init__(self): 253 | self.reset() 254 | 255 | def reset(self): 256 | self.val = 0 257 | self.avg = 0 258 | self.sum = 0 259 | self.count = 0 260 | 261 | def update(self, val, n=1): 262 | self.val = val 263 | self.sum += val * n 264 | self.count += n 265 | self.avg = self.sum / self.count 266 | 267 | 268 | 269 | 270 | def accuracy(output, target, topk=(1,)): 271 | """Computes the precision@k for the specified values of k""" 272 | maxk = max(topk) 273 | batch_size = target.size(0) 274 | 275 | _, pred = output.topk(maxk, 1, True, True) 276 | pred = pred.t() 277 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 278 | 279 | res = [] 280 | for k in topk: 281 | correct_k = correct[:k].view(-1).float().sum(0) 282 | res.append(correct_k.mul_(100.0 
/ batch_size)) 283 | return res 284 | 285 | 286 | if __name__=="__main__": 287 | main() 288 | 289 | -------------------------------------------------------------------------------- /main_coco.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #LOG="log/ResNet101-baseline-448-Adam-1e-5-bs16.txt" 4 | LOG="log/finetune-81.8-`date +'%Y-%m-%d_%H-%M-%S'`" 5 | #exec &> >(tee -a "$LOG") 6 | 7 | # usage: 8 | # ./main.sh [post(any content to record the conducted experiment)] 9 | #LOG="log/bcnn.`date +'%Y-%m-%d_%H-%M-%S'`" 10 | #exec &> >(tee -a "$LOG") 11 | dataset='COCO' 12 | train_data_dir='./data/coco/train2014' 13 | train_list='./data/coco/annotations/instances_train2014.json' 14 | test_data_dir='./data/coco/val2014' 15 | test_list='./data/coco/annotations/instances_val2014.json' 16 | train_label='./data/coco/train_label_vectors.npy' 17 | test_label='./data/coco/val_label_vectors.npy' 18 | 19 | graph_file='./data/coco/prob_train.npy' 20 | word_file='./data/coco/vectors.npy' 21 | #word_file='../data/coco_random_shuffle.npy' 22 | batch_size=4 23 | epochs=200 24 | learning_rate=1e-5 25 | momentum=0.9 26 | weight_decay=0 27 | 28 | pretrained=1 29 | pretrain_model='./pretrain_model/resnet101.pth.tar' 30 | #input parameter 31 | crop_size=576 32 | scale_size=640 33 | 34 | #number of data loading workers 35 | workers=2 36 | #manual epoch number (useful on restarts) 37 | start_epoch=0 38 | #epoch number to decend lr 39 | step_epoch=1516541 40 | #print frequency (default: 10) 41 | print_freq=500 42 | #path to latest checkpoint (default: none) 43 | #resume="model_best_vgg_pretrain_bk.pth.tar" 44 | #resume="backup/86.26.pth.tar" 45 | #evaluate mode 46 | evaluate=false 47 | extra_cmd="" 48 | if $evaluate 49 | then 50 | extra_cmd="$extra_cmd --evaluate" 51 | fi 52 | # resume is not none 53 | if [ -n "$resume" ]; 54 | then 55 | extra_cmd="$extra_cmd --resume $resume" 56 | fi 57 | 58 | 59 | # use single gpu (eg,gpu 0) to trian: 60 | # CUDA_VISIBLE_DEVICES=0 61 | # use multiple gpu (eg,gpu 0 and 1) to train 62 | # CUDA_VISIBLE_DEVICES=0,1 63 | CUDA_VISIBLE_DEVICES=$1 python main.py \ 64 | ${dataset} \ 65 | ${train_data_dir} \ 66 | ${test_data_dir} \ 67 | ${train_list} \ 68 | ${test_list} \ 69 | -b ${batch_size} \ 70 | -train_label ${train_label} \ 71 | -test_label ${test_label} \ 72 | -graph_file ${graph_file} \ 73 | -word_file ${word_file} \ 74 | -j ${workers} \ 75 | --epochs ${epochs} \ 76 | --start-epoch ${start_epoch} \ 77 | --batch-size ${batch_size} \ 78 | --learning-rate ${learning_rate} \ 79 | --momentum ${momentum} \ 80 | --weight-decay ${weight_decay} \ 81 | --crop_size ${crop_size} \ 82 | --scale_size ${scale_size} \ 83 | --step_epoch ${step_epoch} \ 84 | --print_freq ${print_freq} \ 85 | --pretrained ${pretrained} \ 86 | --pretrain_model ${pretrain_model} \ 87 | --post $2\ 88 | ${extra_cmd} 89 | -------------------------------------------------------------------------------- /main_vg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #LOG="log/ResNet101-baseline-448-Adam-1e-5-bs16.txt" 4 | LOG="log/finetune-81.8-`date +'%Y-%m-%d_%H-%M-%S'`" 5 | #exec &> >(tee -a "$LOG") 6 | 7 | # usage: 8 | # ./main.sh [post(any content to record the conducted experiment)] 9 | #LOG="log/bcnn.`date +'%Y-%m-%d_%H-%M-%S'`" 10 | #exec &> >(tee -a "$LOG") 11 | dataset='VG' 12 | train_data_dir='./data/VG/VG_100K' 13 | train_list='./data/VG/train_list_500.txt' 14 | test_data_dir='./data/VG/VG_100K' 15 | 
test_list='./data/VG/test_list_500.txt' 16 | train_label='./data/VG/vg_category_500_labels_index.json' 17 | test_label='./data/VG/vg_category_500_labels_index.json' 18 | 19 | graph_file='./data/VG/graph_500_norm.npy' 20 | word_file='./data/VG/vg_500_vector.npy' 21 | #word_file='../data/coco_random_shuffle.npy' 22 | batch_size=4 23 | epochs=200 24 | learning_rate=1e-5 25 | momentum=0.9 26 | weight_decay=0 27 | num_classes=500 28 | pretrained=1 29 | pretrain_model='./pretrain_model/resnet101.pth.tar' 30 | #input parameter 31 | crop_size=576 32 | scale_size=640 33 | 34 | #number of data loading workers 35 | workers=2 36 | #manual epoch number (useful on restarts) 37 | start_epoch=0 38 | #epoch number to decend lr 39 | step_epoch=1516541 40 | #print frequency (default: 10) 41 | print_freq=500 42 | #path to latest checkpoint (default: none) 43 | #resume="model_best_vgg_pretrain_bk.pth.tar" 44 | #resume="backup/86.26.pth.tar" 45 | #evaluate mode 46 | evaluate=false 47 | extra_cmd="" 48 | if $evaluate 49 | then 50 | extra_cmd="$extra_cmd --evaluate" 51 | fi 52 | # resume is not none 53 | if [ -n "$resume" ]; 54 | then 55 | extra_cmd="$extra_cmd --resume $resume" 56 | fi 57 | 58 | 59 | # use single gpu (eg,gpu 0) to trian: 60 | # CUDA_VISIBLE_DEVICES=0 61 | # use multiple gpu (eg,gpu 0 and 1) to train 62 | # CUDA_VISIBLE_DEVICES=0,1 63 | CUDA_VISIBLE_DEVICES=$1 python main.py \ 64 | ${dataset} \ 65 | ${train_data_dir} \ 66 | ${test_data_dir} \ 67 | ${train_list} \ 68 | ${test_list} \ 69 | -b ${batch_size} \ 70 | -train_label ${train_label} \ 71 | -test_label ${test_label} \ 72 | -graph_file ${graph_file} \ 73 | -word_file ${word_file} \ 74 | -j ${workers} \ 75 | --epochs ${epochs} \ 76 | --start-epoch ${start_epoch} \ 77 | --batch-size ${batch_size} \ 78 | --learning-rate ${learning_rate} \ 79 | --momentum ${momentum} \ 80 | --weight-decay ${weight_decay} \ 81 | --crop_size ${crop_size} \ 82 | --scale_size ${scale_size} \ 83 | --step_epoch ${step_epoch} \ 84 | --print_freq ${print_freq} \ 85 | --pretrained ${pretrained} \ 86 | --pretrain_model ${pretrain_model} \ 87 | --num_classes ${num_classes} \ 88 | --post $2\ 89 | ${extra_cmd} 90 | -------------------------------------------------------------------------------- /main_voc07.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #LOG="log/ResNet101-baseline-448-Adam-1e-5-bs16.txt" 4 | LOG="log/finetune-81.8-`date +'%Y-%m-%d_%H-%M-%S'`" 5 | #exec &> >(tee -a "$LOG") 6 | 7 | # usage: 8 | # ./main.sh [post(any content to record the conducted experiment)] 9 | #LOG="log/bcnn.`date +'%Y-%m-%d_%H-%M-%S'`" 10 | #exec &> >(tee -a "$LOG") 11 | dataset='VOC2007' 12 | train_data_dir='./data/VOCdevkit/VOC2007/JPEGImages' 13 | train_list='./data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt' 14 | test_data_dir='./data/VOCdevkit/VOC2007/JPEGImages' 15 | test_list='./data/VOCdevkit/VOC2007/ImageSets/Main/test.txt' 16 | train_label='./data/VOCdevkit/VOC2007/Annotations' 17 | test_label='./data/VOCdevkit/VOC2007/Annotations' 18 | 19 | graph_file='./data/VOCdevkit/VOC2007/prob_trainval.npy' 20 | word_file='./data/VOCdevkit/VOC2007/voc07_vector.npy' 21 | batch_size=4 22 | epochs=200 23 | learning_rate=1e-5 24 | momentum=0.9 25 | weight_decay=0 26 | num_classes=20 27 | pretrained=1 28 | pretrain_model='./pretrain_model/resnet101.pth.tar' 29 | #input parameter 30 | crop_size=576 31 | scale_size=640 32 | 33 | #number of data loading workers 34 | workers=2 35 | #manual epoch number (useful on restarts) 36 | 
start_epoch=0 37 | #epoch number to decend lr 38 | step_epoch=1516541 39 | #print frequency (default: 10) 40 | print_freq=500 41 | #path to latest checkpoint (default: none) 42 | #resume="model_best_vgg_pretrain_bk.pth.tar" 43 | #resume="backup/86.26.pth.tar" 44 | #evaluate mode 45 | evaluate=false 46 | extra_cmd="" 47 | if $evaluate 48 | then 49 | extra_cmd="$extra_cmd --evaluate" 50 | fi 51 | # resume is not none 52 | if [ -n "$resume" ]; 53 | then 54 | extra_cmd="$extra_cmd --resume $resume" 55 | fi 56 | 57 | 58 | # use single gpu (eg,gpu 0) to trian: 59 | # CUDA_VISIBLE_DEVICES=0 60 | # use multiple gpu (eg,gpu 0 and 1) to train 61 | # CUDA_VISIBLE_DEVICES=0,1 62 | CUDA_VISIBLE_DEVICES=$1 python main.py \ 63 | ${dataset} \ 64 | ${train_data_dir} \ 65 | ${test_data_dir} \ 66 | ${train_list} \ 67 | ${test_list} \ 68 | -b ${batch_size} \ 69 | -train_label ${train_label} \ 70 | -test_label ${test_label} \ 71 | -graph_file ${graph_file} \ 72 | -word_file ${word_file} \ 73 | -j ${workers} \ 74 | --epochs ${epochs} \ 75 | --start-epoch ${start_epoch} \ 76 | --batch-size ${batch_size} \ 77 | --learning-rate ${learning_rate} \ 78 | --momentum ${momentum} \ 79 | --weight-decay ${weight_decay} \ 80 | --crop_size ${crop_size} \ 81 | --scale_size ${scale_size} \ 82 | --step_epoch ${step_epoch} \ 83 | --print_freq ${print_freq} \ 84 | --pretrained ${pretrained} \ 85 | --pretrain_model ${pretrain_model} \ 86 | --num_classes ${num_classes} \ 87 | --post $2\ 88 | ${extra_cmd} 89 | -------------------------------------------------------------------------------- /main_voc12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #LOG="log/ResNet101-baseline-448-Adam-1e-5-bs16.txt" 4 | LOG="log/finetune-81.8-`date +'%Y-%m-%d_%H-%M-%S'`" 5 | #exec &> >(tee -a "$LOG") 6 | 7 | # usage: 8 | # ./main.sh [post(any content to record the conducted experiment)] 9 | #LOG="log/bcnn.`date +'%Y-%m-%d_%H-%M-%S'`" 10 | #exec &> >(tee -a "$LOG") 11 | dataset='VOC2012' 12 | train_data_dir='./data/VOCdevkit/VOC2012/JPEGImages' 13 | train_list='./data/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt' 14 | test_data_dir='./data/VOCdevkit/VOC2012/JPEGImages' 15 | test_list='./data/VOCdevkit/VOC2012/ImageSets/Main/test.txt' 16 | 17 | train_label='./data/VOCdevkit/VOC2012/Annotations' 18 | test_label='./' 19 | 20 | graph_file='./data/VOCdevkit/VOC2012/prob_trainval.npy' 21 | word_file='./data/VOCdevkit/VOC2012/voc12_vector.npy' 22 | batch_size=4 23 | epochs=200 24 | learning_rate=1e-5 25 | momentum=0.9 26 | weight_decay=0 27 | num_classes=20 28 | pretrained=1 29 | pretrain_model='./pretrain_model/resnet101.pth.tar' 30 | #input parameter 31 | crop_size=576 32 | scale_size=640 33 | 34 | #number of data loading workers 35 | workers=2 36 | #manual epoch number (useful on restarts) 37 | start_epoch=0 38 | #epoch number to decend lr 39 | step_epoch=1516541 40 | #print frequency (default: 10) 41 | print_freq=500 42 | #path to latest checkpoint (default: none) 43 | #resume="model_best_vgg_pretrain_bk.pth.tar" 44 | #resume="backup/86.26.pth.tar" 45 | #evaluate mode 46 | evaluate=false 47 | extra_cmd="" 48 | if $evaluate 49 | then 50 | extra_cmd="$extra_cmd --evaluate" 51 | fi 52 | # resume is not none 53 | if [ -n "$resume" ]; 54 | then 55 | extra_cmd="$extra_cmd --resume $resume" 56 | fi 57 | 58 | 59 | # use single gpu (eg,gpu 0) to trian: 60 | # CUDA_VISIBLE_DEVICES=0 61 | # use multiple gpu (eg,gpu 0 and 1) to train 62 | # CUDA_VISIBLE_DEVICES=0,1 63 | CUDA_VISIBLE_DEVICES=$1 python 
main.py \ 64 | ${dataset} \ 65 | ${train_data_dir} \ 66 | ${test_data_dir} \ 67 | ${train_list} \ 68 | ${test_list} \ 69 | -b ${batch_size} \ 70 | -train_label ${train_label} \ 71 | -test_label ${test_label} \ 72 | -graph_file ${graph_file} \ 73 | -word_file ${word_file} \ 74 | -j ${workers} \ 75 | --epochs ${epochs} \ 76 | --start-epoch ${start_epoch} \ 77 | --batch-size ${batch_size} \ 78 | --learning-rate ${learning_rate} \ 79 | --momentum ${momentum} \ 80 | --weight-decay ${weight_decay} \ 81 | --crop_size ${crop_size} \ 82 | --scale_size ${scale_size} \ 83 | --step_epoch ${step_epoch} \ 84 | --print_freq ${print_freq} \ 85 | --pretrained ${pretrained} \ 86 | --pretrain_model ${pretrain_model} \ 87 | --num_classes ${num_classes} \ 88 | --post $2\ 89 | ${extra_cmd} 90 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from torch.autograd import Variable 5 | import torch.nn.functional as F 6 | 7 | from networks.resnet import resnet101 8 | from semantic import semantic 9 | from ggnn import GGNN 10 | from element_wise_layer import Element_Wise_Layer 11 | 12 | class SSGRL(nn.Module): 13 | def __init__(self, image_feature_dim, output_dim, time_step, 14 | adjacency_matrix, word_features, num_classes=80, word_feature_dim = 300): 15 | super(SSGRL, self).__init__() 16 | self.resnet_101 = resnet101() 17 | 18 | self.num_classes = num_classes 19 | self.word_feature_dim = word_feature_dim 20 | self.image_feature_dim = image_feature_dim 21 | 22 | self.word_semantic = semantic(num_classes= self.num_classes, 23 | image_feature_dim = self.image_feature_dim, 24 | word_feature_dim=self.word_feature_dim) 25 | 26 | self.word_features = word_features 27 | self._word_features = self.load_features() 28 | self.adjacency_matrix = adjacency_matrix 29 | self._in_matrix, self._out_matrix = self.load_matrix() 30 | self.time_step = time_step 31 | 32 | self.graph_net = GGNN(input_dim=self.image_feature_dim, 33 | time_step=self.time_step, 34 | in_matrix=self._in_matrix, 35 | out_matrix=self._out_matrix) 36 | 37 | self.output_dim = output_dim 38 | self.fc_output = nn.Linear(2*self.image_feature_dim, self.output_dim) 39 | self.classifiers = Element_Wise_Layer(self.num_classes, self.output_dim) 40 | 41 | def forward(self, x): 42 | batch_size = x.size()[0] 43 | img_feature_map = self.resnet_101(x) 44 | graph_net_input = self.word_semantic(batch_size, 45 | img_feature_map, 46 | torch.tensor(self._word_features).cuda()) 47 | graph_net_feature = self.graph_net(graph_net_input) 48 | 49 | output = torch.cat((graph_net_feature.view(batch_size*self.num_classes,-1), graph_net_input.view(-1, self.image_feature_dim)), 1) 50 | output = self.fc_output(output) 51 | output = torch.tanh(output) 52 | output = output.contiguous().view(batch_size, self.num_classes, self.output_dim) 53 | result = self.classifiers(output) 54 | return result 55 | 56 | def load_features(self): 57 | return Variable(torch.from_numpy(np.load(self.word_features).astype(np.float32))).cuda() 58 | 59 | def load_matrix(self): 60 | mat = np.load(self.adjacency_matrix) 61 | _in_matrix, _out_matrix = mat.astype(np.float32), mat.T.astype(np.float32) 62 | _in_matrix = Variable(torch.from_numpy(_in_matrix), requires_grad=False).cuda() 63 | _out_matrix = Variable(torch.from_numpy(_out_matrix), requires_grad=False).cuda() 64 | return _in_matrix, _out_matrix 65 | 
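SSGRL above loads its two auxiliary inputs with np.load: load_matrix() expects a (num_classes, num_classes) adjacency matrix and load_features() a (num_classes, 300) array of word embeddings (word_feature_dim defaults to 300). The sketch below only creates placeholder files of the right shapes; the file names are hypothetical, and the actual files are the ones linked in the README.

import numpy as np

num_classes = 80                                                                        # e.g. MS-COCO
np.save('graph_demo.npy', np.random.rand(num_classes, num_classes).astype(np.float32)) # adjacency matrix
np.save('word_demo.npy', np.random.rand(num_classes, 300).astype(np.float32))          # word features

# These paths would then be passed as -graph_file / -word_file (see the main_*.sh
# scripts), ending up in SSGRL(adjacency_matrix=..., word_features=...).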
-------------------------------------------------------------------------------- /networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/networks/__init__.py -------------------------------------------------------------------------------- /networks/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/networks/__init__.pyc -------------------------------------------------------------------------------- /networks/resnet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import math 3 | import torch.utils.model_zoo as model_zoo 4 | import torch.nn.functional as F 5 | 6 | 7 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 8 | 'resnet152'] 9 | 10 | 11 | model_urls = { 12 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 13 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 14 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 15 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 16 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 17 | } 18 | 19 | 20 | def conv3x3(in_planes, out_planes, stride=1): 21 | """3x3 convolution with padding""" 22 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 23 | padding=1, bias=False) 24 | 25 | 26 | class BasicBlock(nn.Module): 27 | expansion = 1 28 | 29 | def __init__(self, inplanes, planes, stride=1, downsample=None): 30 | super(BasicBlock, self).__init__() 31 | self.conv1 = conv3x3(inplanes, planes, stride) 32 | self.bn1 = nn.BatchNorm2d(planes) 33 | self.relu = nn.ReLU(inplace=True) 34 | self.conv2 = conv3x3(planes, planes) 35 | self.bn2 = nn.BatchNorm2d(planes) 36 | self.downsample = downsample 37 | self.stride = stride 38 | 39 | def forward(self, x): 40 | residual = x 41 | 42 | out = self.conv1(x) 43 | out = self.bn1(out) 44 | out = self.relu(out) 45 | 46 | out = self.conv2(out) 47 | out = self.bn2(out) 48 | 49 | if self.downsample is not None: 50 | residual = self.downsample(x) 51 | 52 | out += residual 53 | out = self.relu(out) 54 | 55 | return out 56 | 57 | 58 | class Bottleneck(nn.Module): 59 | expansion = 4 60 | 61 | def __init__(self, inplanes, planes, stride=1, downsample=None): 62 | super(Bottleneck, self).__init__() 63 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 64 | self.bn1 = nn.BatchNorm2d(planes) 65 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 66 | padding=1, bias=False) 67 | self.bn2 = nn.BatchNorm2d(planes) 68 | self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False) 69 | self.bn3 = nn.BatchNorm2d(planes * self.expansion) 70 | self.relu = nn.ReLU(inplace=True) 71 | self.downsample = downsample 72 | self.stride = stride 73 | 74 | def forward(self, x): 75 | residual = x 76 | 77 | out = self.conv1(x) 78 | out = self.bn1(out) 79 | out = self.relu(out) 80 | 81 | out = self.conv2(out) 82 | out = self.bn2(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv3(out) 86 | out = self.bn3(out) 87 | 88 | if self.downsample is not None: 89 | residual = self.downsample(x) 90 | 91 | out += residual 92 | out = self.relu(out) 93 | 94 | return out 95 | 96 | 97 | class 
ResNet(nn.Module): 98 | 99 | def __init__(self, block, layers, num_classes=80, avg_pool_kernel_size = 7): 100 | self.inplanes = 64 101 | super(ResNet, self).__init__() 102 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 103 | bias=False) 104 | self.bn1 = nn.BatchNorm2d(64) 105 | self.relu = nn.ReLU(inplace=True) 106 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 107 | self.layer1 = self._make_layer(block, 64, layers[0]) 108 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 109 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 110 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 111 | 112 | self.avgpool1 = nn.AvgPool2d(2 , stride=2) 113 | #self.avgpool2 = nn.AvgPool2d(avg_pool_kernel_size, stride=1) 114 | #self.fc = nn.Linear(8192, num_classes) 115 | #self.fc = nn.Linear(512 * block.expansion, num_classes) 116 | 117 | for m in self.modules(): 118 | if isinstance(m, nn.Conv2d): 119 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 120 | elif isinstance(m, nn.BatchNorm2d): 121 | nn.init.constant_(m.weight, 1) 122 | nn.init.constant_(m.bias, 0) 123 | 124 | def _make_layer(self, block, planes, blocks, stride=1): 125 | downsample = None 126 | if stride != 1 or self.inplanes != planes * block.expansion: 127 | downsample = nn.Sequential( 128 | nn.Conv2d(self.inplanes, planes * block.expansion, 129 | kernel_size=1, stride=stride, bias=False), 130 | nn.BatchNorm2d(planes * block.expansion), 131 | ) 132 | 133 | layers = [] 134 | layers.append(block(self.inplanes, planes, stride, downsample)) 135 | self.inplanes = planes * block.expansion 136 | for i in range(1, blocks): 137 | layers.append(block(self.inplanes, planes)) 138 | 139 | return nn.Sequential(*layers) 140 | 141 | def forward(self, x): 142 | x = self.conv1(x) 143 | x = self.bn1(x) 144 | x = self.relu(x) 145 | x = self.maxpool(x) 146 | 147 | x = self.layer1(x) 148 | x = self.layer2(x) 149 | x = self.layer3(x) 150 | x = self.layer4(x) 151 | x = self.avgpool1(x) 152 | #print(x.shape) 153 | #x = self.avgpool2(x) 154 | #x = x.view(x.size(0), -1) 155 | #x = self.fc(x) 156 | 157 | return x 158 | 159 | 160 | def resnet18(pretrained=False, **kwargs): 161 | """Constructs a ResNet-18 model. 162 | 163 | Args: 164 | pretrained (bool): If True, returns a model pre-trained on ImageNet 165 | """ 166 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 167 | if pretrained: 168 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) 169 | return model 170 | 171 | 172 | def resnet34(pretrained=False, **kwargs): 173 | """Constructs a ResNet-34 model. 174 | 175 | Args: 176 | pretrained (bool): If True, returns a model pre-trained on ImageNet 177 | """ 178 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 179 | if pretrained: 180 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) 181 | return model 182 | 183 | 184 | def resnet50(pretrained=False, **kwargs): 185 | """Constructs a ResNet-50 model. 186 | 187 | Args: 188 | pretrained (bool): If True, returns a model pre-trained on ImageNet 189 | """ 190 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 191 | if pretrained: 192 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) 193 | return model 194 | 195 | 196 | def resnet101(pretrained=False, **kwargs): 197 | """Constructs a ResNet-101 model. 
198 | 199 | Args: 200 | pretrained (bool): If True, returns a model pre-trained on ImageNet 201 | """ 202 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 203 | #if pretrained: 204 | #model.load_state_dict(model_zoo.load_url(model_urls['resnet101'])) 205 | return model 206 | 207 | 208 | def resnet152(pretrained=False, **kwargs): 209 | """Constructs a ResNet-152 model. 210 | 211 | Args: 212 | pretrained (bool): If True, returns a model pre-trained on ImageNet 213 | """ 214 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 215 | if pretrained: 216 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) 217 | return model 218 | -------------------------------------------------------------------------------- /networks/resnet.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/networks/resnet.pyc -------------------------------------------------------------------------------- /semantic.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from torch.autograd import Variable 5 | import torch 6 | 7 | class semantic(nn.Module): 8 | def __init__(self, num_classes, image_feature_dim, word_feature_dim, intermediary_dim=1024): 9 | super(semantic, self).__init__() 10 | self.num_classes = num_classes 11 | self.image_feature_dim = image_feature_dim 12 | self.word_feature_dim = word_feature_dim 13 | self.intermediary_dim = intermediary_dim 14 | self.fc_1 = nn.Linear(self.image_feature_dim, self.intermediary_dim, bias=False) 15 | self.fc_2 = nn.Linear(self.word_feature_dim, self.intermediary_dim, bias=False) 16 | self.fc_3 = nn.Linear(self.intermediary_dim, self.intermediary_dim) 17 | self.fc_a = nn.Linear(self.intermediary_dim, 1) 18 | 19 | def forward(self,batch_size, img_feature_map, word_features): 20 | convsize = img_feature_map.size()[3] 21 | 22 | img_feature_map = torch.transpose(torch.transpose(img_feature_map, 1, 2),2,3) 23 | f_wh_feature = img_feature_map.contiguous().view(batch_size*convsize*convsize, -1) 24 | f_wh_feature = self.fc_1(f_wh_feature).view(batch_size*convsize*convsize, 1, -1).repeat(1, self.num_classes, 1) 25 | 26 | f_wd_feature = self.fc_2(word_features).view(1, self.num_classes, 1024).repeat(batch_size*convsize*convsize,1,1) 27 | lb_feature = self.fc_3(torch.tanh(f_wh_feature*f_wd_feature).view(-1,1024)) 28 | coefficient = self.fc_a(lb_feature) 29 | coefficient = torch.transpose(torch.transpose(coefficient.view(batch_size, convsize, convsize, self.num_classes),2,3),1,2).view(batch_size, self.num_classes, -1) 30 | 31 | coefficient = F.softmax(coefficient, dim=2) 32 | coefficient = coefficient.view(batch_size, self.num_classes, convsize, convsize) 33 | coefficient = torch.transpose(torch.transpose(coefficient,1,2),2,3) 34 | coefficient = coefficient.view(batch_size, convsize, convsize, self.num_classes, 1).repeat(1,1,1,1,self.image_feature_dim) 35 | img_feature_map = img_feature_map.view(batch_size, convsize, convsize, 1, self.image_feature_dim).repeat(1, 1, 1, self.num_classes, 1)* coefficient 36 | graph_net_input = torch.sum(torch.sum(img_feature_map,1) ,1) 37 | return graph_net_input 38 | 39 | 40 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/utils/__init__.py -------------------------------------------------------------------------------- /utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/utils/__init__.pyc -------------------------------------------------------------------------------- /utils/cocodataset.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/utils/cocodataset.pyc -------------------------------------------------------------------------------- /utils/load_pretrain_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import os 4 | import gc 5 | 6 | def load_pretrain_model(model, args): 7 | model_dict = model.resnet_101.state_dict() 8 | print('loading pretrained model from imagenet:') 9 | resnet_pretrained = torch.load(args.pretrain_model) 10 | pretrain_dict = {k:v for k, v in resnet_pretrained.items() if not k.startswith('fc')} 11 | model_dict.update(pretrain_dict) 12 | model.resnet_101.load_state_dict(model_dict) 13 | del resnet_pretrained 14 | del pretrain_dict 15 | gc.collect() 16 | return model 17 | -------------------------------------------------------------------------------- /utils/load_pretrain_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/utils/load_pretrain_model.pyc -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy as np 3 | def voc12_mAP(imagessetfile, num): 4 | with open(imagessetfile, 'r') as f: 5 | lines = f.readlines() 6 | 7 | seg = np.array([x.strip().split(' ') for x in lines]).astype(float) 8 | gt_label = seg[:,num:].astype(np.int32) 9 | num_target = np.sum(gt_label, axis=1, keepdims = True) 10 | threshold = 1 / (num_target+1e-6) 11 | 12 | predict_result = seg[:,0:num] > threshold 13 | 14 | 15 | sample_num = len(gt_label) 16 | class_num = num 17 | tp = np.zeros(sample_num) 18 | fp = np.zeros(sample_num) 19 | aps = [] 20 | per_class_recall = [] 21 | 22 | for class_id in range(class_num): 23 | confidence = seg[:,class_id] 24 | sorted_ind = np.argsort(-confidence) 25 | sorted_scores = np.sort(-confidence) 26 | sorted_label = [gt_label[x][class_id] for x in sorted_ind] 27 | 28 | for i in range(sample_num): 29 | tp[i] = (sorted_label[i]>0) 30 | fp[i] = (sorted_label[i]<=0) 31 | true_num = 0 32 | true_num = sum(tp) 33 | fp = np.cumsum(fp) 34 | tp = np.cumsum(tp) 35 | rec = tp / float(true_num) 36 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 37 | ap = voc_ap(rec, prec, true_num) 38 | aps += [ap] 39 | 40 | np.set_printoptions(precision=3, suppress=True) 41 | mAP = np.mean(aps) 42 | return mAP 43 | 44 | def voc_ap(rec, prec,true_num): 45 | mrec = np.concatenate(([0.], rec, [1.])) 46 | mpre = np.concatenate(([0.], prec, [0.])) 47 | for i in range(mpre.size - 1, 0, -1): 48 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 49 | i = np.where(mrec[1:] != mrec[:-1])[0] 50 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 51 | return ap 52 | 
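voc12_mAP above reads a plain-text score file in which each row holds the per-class scores followed by the binary ground-truth vector for one image; validate() in main.py writes exactly this layout with np.savetxt. A toy, hand-made example run from the repository root (the file name and numbers are illustrative assumptions):

import numpy as np
from utils.metrics import voc12_mAP

num_classes = 3
scores = np.array([[0.9, 0.2, 0.1],
                   [0.1, 0.8, 0.7]])               # predicted scores for two images
gt = np.array([[1.0, 0.0, 0.0],
               [0.0, 1.0, 1.0]])                   # ground-truth label masks
np.savetxt('demo_score', np.concatenate([scores, gt], axis=1))
print(voc12_mAP('demo_score', num_classes))        # -> 1.0, since every positive is ranked first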
-------------------------------------------------------------------------------- /utils/metrics.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/utils/metrics.pyc -------------------------------------------------------------------------------- /utils/transforms.py: -------------------------------------------------------------------------------- 1 | from datasets.vgdataset import VGDataset 2 | from datasets.cocodataset import CoCoDataset 3 | from datasets.voc07dataset import Voc07Dataset 4 | from datasets.voc12dataset import Voc12Dataset 5 | import torchvision.transforms as transforms 6 | from torch.utils.data import DataLoader 7 | def get_train_test_set(train_dir, test_dir, train_anno, test_anno, train_label=None, test_label=None,args = None): 8 | print('You will perform multi-scale on images for scale 640') 9 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 10 | scale_size = args.scale_size 11 | crop_size = args.crop_size 12 | 13 | train_data_transform = transforms.Compose([transforms.Resize((scale_size, scale_size)), 14 | transforms.RandomChoice([transforms.RandomCrop(640), 15 | transforms.RandomCrop(576), 16 | transforms.RandomCrop(512), 17 | transforms.RandomCrop(384), 18 | transforms.RandomCrop(320)]), 19 | transforms.Resize((crop_size, crop_size)), 20 | transforms.ToTensor(), 21 | normalize]) 22 | 23 | test_data_transform = transforms.Compose([transforms.Resize((scale_size, scale_size)), 24 | transforms.CenterCrop(crop_size), 25 | transforms.ToTensor(), 26 | normalize]) 27 | 28 | 29 | if args.dataset == 'COCO': 30 | train_set = CoCoDataset(train_dir, train_anno, train_data_transform, train_label) 31 | test_set = CoCoDataset(test_dir, test_anno, test_data_transform, test_label) 32 | elif args.dataset == 'VG': 33 | train_set = VGDataset(train_dir, train_anno, train_data_transform, train_label) 34 | test_set = VGDataset(test_dir, test_anno, test_data_transform, test_label) 35 | elif args.dataset == 'VOC2007': 36 | train_set = Voc07Dataset(train_dir, train_anno, train_data_transform, train_label) 37 | test_set = Voc07Dataset(test_dir, test_anno, test_data_transform, test_label) 38 | elif args.dataset == 'VOC2012': 39 | train_set = Voc12Dataset(train_dir, train_anno, train_data_transform, train_label) 40 | test_set = Voc12Dataset(test_dir, test_anno, test_data_transform, test_label) 41 | else: 42 | print('%s Dataset Not Found'%args.dataset) 43 | exit(1) 44 | train_loader = DataLoader(dataset=train_set, 45 | num_workers=args.workers, 46 | batch_size=args.batch_size, 47 | shuffle = True) 48 | test_loader = DataLoader(dataset=test_set, 49 | num_workers=args.workers, 50 | batch_size=args.batch_size, 51 | shuffle = False) 52 | return train_loader, test_loader 53 | -------------------------------------------------------------------------------- /utils/transforms.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/SSGRL/ea47ccb2cf55ff37c5a91fc5a6974bdbc9ab6679/utils/transforms.pyc --------------------------------------------------------------------------------
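For reference, get_train_test_set in utils/transforms.py is the single entry point main.py uses to build both data loaders. A minimal sketch of calling it directly for VOC 2007 (paths mirror main_voc07.sh; it assumes the dataset has already been placed under ./data and uses argparse.Namespace as a stand-in for the parsed command-line arguments):

from argparse import Namespace
from utils.transforms import get_train_test_set

args = Namespace(dataset='VOC2007', scale_size=640, crop_size=576,
                 batch_size=4, workers=2)
train_loader, test_loader = get_train_test_set(
    './data/VOCdevkit/VOC2007/JPEGImages',                   # train_data_dir
    './data/VOCdevkit/VOC2007/JPEGImages',                   # test_data_dir
    './data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt',  # train_list
    './data/VOCdevkit/VOC2007/ImageSets/Main/test.txt',      # test_list
    './data/VOCdevkit/VOC2007/Annotations',                  # train_label (XML annotations)
    './data/VOCdevkit/VOC2007/Annotations',                  # test_label
    args)
for images, labels in train_loader:
    print(images.size(), labels.shape)                       # (4, 3, 576, 576) and (4, 20)
    break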