├── .gitignore
├── LICENSE
├── README.md
├── accuracy.png
├── checkpoint
│   └── checkpoints.txt
├── data
│   └── data.txt
├── dataset.py
├── log
│   └── logs.txt
├── model.py
├── preprocess.py
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# VS Code project settings
.vscode/

# data file
data/*.pkl

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Kim Seonghyeon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# relation-networks-pytorch
Relation Networks (https://arxiv.org/abs/1706.01427) for CLEVR, implemented in PyTorch

Requirements:
* Python 3.6
* PyTorch
* torchvision
* Pillow
* nltk
* tqdm

To train:

1. Download and extract the CLEVR v1.0 dataset from http://cs.stanford.edu/people/jcjohns/clevr/
2. Preprocess the question data
```
python preprocess.py [CLEVR directory]
```
3. Run train.py
```
python train.py [CLEVR directory]
```

# Reproduction status

Finally reproduced this, thanks to [@mesnico](https://github.com/mesnico)! (https://github.com/mesnico/RelationNetworks-CLEVR) The crucial configurations for reproducing the result are a large batch size (640), doubling the learning rate every 20 epochs until it reaches the maximum learning rate, and inverting the questions, that is, feeding the question words into the LSTM in reverse order.

Accuracy plot

![Accuracy plot](accuracy.png)
--------------------------------------------------------------------------------
/accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosinality/relation-networks-pytorch/78f8ef5c5d832c8403b76c0ee657f9536be94c0b/accuracy.png
--------------------------------------------------------------------------------
/checkpoint/checkpoints.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosinality/relation-networks-pytorch/78f8ef5c5d832c8403b76c0ee657f9536be94c0b/checkpoint/checkpoints.txt
--------------------------------------------------------------------------------
/data/data.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosinality/relation-networks-pytorch/78f8ef5c5d832c8403b76c0ee657f9536be94c0b/data/data.txt
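A note on the learning-rate recipe in the README above: it amounts to starting at 5e-6 and doubling the rate every 20 epochs until it reaches 5e-4, then holding it there, while "inverting the questions" is simply `question[::-1]` in dataset.py below. A minimal sketch of the capped doubling schedule, using the hyperparameter values from train.py (the helper name `lr_at_epoch` is illustrative and not part of the repository):

```
lr, lr_max, lr_gamma, lr_step = 5e-6, 5e-4, 2, 20  # values from train.py


def lr_at_epoch(epoch):
    # Double the base rate once per completed lr_step epochs, capped at lr_max.
    return min(lr * lr_gamma ** (epoch // lr_step), lr_max)


for epoch in (0, 20, 40, 120, 140):
    print(epoch, lr_at_epoch(epoch))
# -> 5e-06, 1e-05, 2e-05, 0.00032, 0.0005 (capped at lr_max)
```

train.py below gets roughly the same effect with `StepLR(optimizer, step_size=20, gamma=2)` plus a guard that stops stepping once the rate reaches `lr_max`.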
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
import os
import pickle

import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms


resize = transforms.Resize([128, 128])

transform = transforms.Compose([
    transforms.Pad(8),
    transforms.RandomCrop([128, 128]),
    transforms.RandomRotation(2.8),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5],
                         std=[0.5, 0.5, 0.5])
])

eval_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5],
                         std=[0.5, 0.5, 0.5])
])

category = {'0': 'count',
            '1': 'count',
            '2': 'count',
            '3': 'count',
            '4': 'count',
            '5': 'count',
            '6': 'count',
            '7': 'count',
            '8': 'count',
            '9': 'count',
            '10': 'count',
            'blue': 'color',
            'brown': 'color',
            'cyan': 'color',
            'yellow': 'color',
            'gray': 'color',
            'green': 'color',
            'purple': 'color',
            'red': 'color',
            'rubber': 'material',
            'metal': 'material',
            'large': 'size',
            'small': 'size',
            'cylinder': 'shape',
            'cube': 'shape',
            'sphere': 'shape',
            'no': 'exist',
            'yes': 'exist'}


class CLEVR(Dataset):
    def __init__(self, root, split='train', transform=None,
                 reverse_question=False, use_preprocessed=False):
        with open(f'data/{split}.pkl', 'rb') as f:
            self.data = pickle.load(f)

        with open('data/dic.pkl', 'rb') as f:
            self.dic = pickle.load(f)
            self.answer_class = {v: k for k, v in self.dic['answer_dic'].items()}

        self.transform = transform
        self.root = root
        self.split = split
        self.reverse_question = reverse_question
        self.use_preprocessed = use_preprocessed

    def __getitem__(self, index):
        imgfile, question, answer, _ = self.data[index]

        if self.use_preprocessed is False:
            img = Image.open(os.path.join(self.root, 'images',
                                          self.split, imgfile)).convert('RGB')
            img = resize(img)

        else:
            img = Image.open(os.path.join(self.root, 'images',
                                          self.split + '_preprocessed',
                                          imgfile)).convert('RGB')

        answer_class = category[self.answer_class[answer]]

        if self.transform is not None:
            img = self.transform(img)

        else:
            img = eval_transform(img)

        if self.reverse_question:
            question = question[::-1]

        return img, question, len(question), answer, answer_class

    def __len__(self):
        return len(self.data)


def collate_data(batch):
    images, lengths, answers, answer_classes = [], [], [], []
    batch_size = len(batch)

    max_len = max(map(lambda x: len(x[1]), batch))

    questions = np.zeros((batch_size, max_len), dtype=np.int64)
    sort_by_len = sorted(batch, key=lambda x: len(x[1]), reverse=True)

    for i, b in enumerate(sort_by_len):
        image, question, length, answer, class_ = b
        images.append(image)
        length = len(question)
        questions[i, :length] = question
        lengths.append(length)
        answers.append(answer)
        answer_classes.append(class_)

    return torch.stack(images), torch.from_numpy(questions), \
        lengths, torch.LongTensor(answers), answer_classes
--------------------------------------------------------------------------------
/log/logs.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosinality/relation-networks-pytorch/78f8ef5c5d832c8403b76c0ee657f9536be94c0b/log/logs.txt
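Before the model definition, it is worth spelling out what dataset.py hands to it: `collate_data` zero-pads every question to the longest one in the batch and sorts the batch by question length in descending order, which is exactly the layout that `pack_padded_sequence` in model.py expects. A hedged usage sketch (the CLEVR root path and batch size are placeholders, and it assumes preprocess.py has already produced data/train.pkl, data/dic.pkl and the *_preprocessed image folders):

```
from torch.utils.data import DataLoader

from dataset import CLEVR, collate_data, transform

train_set = CLEVR('/path/to/CLEVR_v1.0', split='train', transform=transform,
                  reverse_question=True, use_preprocessed=True)
loader = DataLoader(train_set, batch_size=64, shuffle=True,
                    collate_fn=collate_data)

image, question, q_len, answer, answer_class = next(iter(loader))
# image:        FloatTensor of shape [64, 3, 128, 128]
# question:     LongTensor of shape [64, max_len], zero-padded, longest first
# q_len:        plain Python list of the true question lengths (descending)
# answer:       LongTensor of shape [64]
# answer_class: list of category names such as 'count' or 'color'
```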
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch
from torch import nn
from torch.nn.init import kaiming_uniform_, normal_
import torch.nn.functional as F


class RelationNetworks(nn.Module):
    def __init__(
        self,
        n_vocab,
        conv_hidden=24,
        embed_hidden=32,
        lstm_hidden=128,
        mlp_hidden=256,
        classes=29,
    ):
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(3, conv_hidden, [3, 3], 2, 1, bias=False),
            nn.BatchNorm2d(conv_hidden),
            nn.ReLU(),
            nn.Conv2d(conv_hidden, conv_hidden, [3, 3], 2, 1, bias=False),
            nn.BatchNorm2d(conv_hidden),
            nn.ReLU(),
            nn.Conv2d(conv_hidden, conv_hidden, [3, 3], 2, 1, bias=False),
            nn.BatchNorm2d(conv_hidden),
            nn.ReLU(),
            nn.Conv2d(conv_hidden, conv_hidden, [3, 3], 2, 1, bias=False),
            nn.BatchNorm2d(conv_hidden),
            nn.ReLU(),
        )

        self.embed = nn.Embedding(n_vocab, embed_hidden)
        self.lstm = nn.LSTM(embed_hidden, lstm_hidden, batch_first=True)

        self.n_concat = conv_hidden * 2 + lstm_hidden + 2 * 2

        self.g = nn.Sequential(
            nn.Linear(self.n_concat, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU(),
        )

        self.f = nn.Sequential(
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(mlp_hidden, classes),
        )

        self.conv_hidden = conv_hidden
        self.lstm_hidden = lstm_hidden
        self.mlp_hidden = mlp_hidden

        coords = torch.linspace(-4, 4, 8)
        x = coords.unsqueeze(0).repeat(8, 1)
        y = coords.unsqueeze(1).repeat(1, 8)
        coords = torch.stack([x, y]).unsqueeze(0)
        self.register_buffer('coords', coords)

    def forward(self, image, question, question_len):
        conv = self.conv(image)
        batch_size, n_channel, conv_h, conv_w = conv.size()
        n_pair = conv_h * conv_w

        embed = self.embed(question)
        embed_pack = nn.utils.rnn.pack_padded_sequence(
            embed, question_len, batch_first=True
        )
        _, (h, c) = self.lstm(embed_pack)
        h_tile = h.permute(1, 0, 2).expand(
            batch_size, n_pair * n_pair, self.lstm_hidden
        )

        conv = torch.cat([conv, self.coords.expand(batch_size, 2, conv_h, conv_w)], 1)
        n_channel += 2
        conv_tr = conv.view(batch_size, n_channel, -1).permute(0, 2, 1)
        conv1 = conv_tr.unsqueeze(1).expand(batch_size, n_pair, n_pair, n_channel)
        conv2 = conv_tr.unsqueeze(2).expand(batch_size, n_pair, n_pair, n_channel)
        conv1 = conv1.contiguous().view(-1, n_pair * n_pair, n_channel)
        conv2 = conv2.contiguous().view(-1, n_pair * n_pair, n_channel)

        concat_vec = torch.cat([conv1, conv2, h_tile], 2).view(-1, self.n_concat)
        g = self.g(concat_vec)
        g = g.view(-1, n_pair * n_pair, self.mlp_hidden).sum(1).squeeze()

        f = self.f(g)

        return f
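forward() above is the Relation Network idea in a few lines: four stride-2 convolutions turn the 128×128 image into an 8×8 map, i.e. 64 "objects" of 24 channels each; two coordinate channels are appended; every ordered pair of objects is concatenated with the 128-dimensional LSTM question code (2·(24 + 2) + 128 = 180 = `self.n_concat`), scored by `g`, summed over all 64×64 pairs, and classified by `f`. A standalone sketch of the same broadcasting trick on toy sizes (all names here are illustrative, not taken from model.py):

```
import torch

B, C, H, W = 2, 6, 3, 3            # toy batch, channels (incl. coords), grid
n_obj = H * W                      # 9 objects instead of model.py's 64
feats = torch.randn(B, C, H, W)
q = torch.randn(B, 8)              # stand-in for the LSTM question encoding

objs = feats.view(B, C, -1).permute(0, 2, 1)          # [B, n_obj, C]
o_a = objs.unsqueeze(1).expand(B, n_obj, n_obj, C)    # like conv1: broadcast over rows
o_b = objs.unsqueeze(2).expand(B, n_obj, n_obj, C)    # like conv2: broadcast over columns
q_tile = q.unsqueeze(1).expand(B, n_obj * n_obj, 8)   # question tiled once per pair

pairs = torch.cat([o_a.reshape(B, n_obj * n_obj, C),
                   o_b.reshape(B, n_obj * n_obj, C),
                   q_tile], 2)
print(pairs.shape)  # torch.Size([2, 81, 20]): every ordered pair plus the question
```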
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
import os
import sys
import json
import pickle

import nltk
import tqdm
from torchvision import transforms
from PIL import Image


def process_question(root, split, word_dic=None, answer_dic=None):
    if word_dic is None:
        word_dic = {}

    if answer_dic is None:
        answer_dic = {}

    with open(os.path.join(root, 'questions', f'CLEVR_{split}_questions.json')) as f:
        data = json.load(f)

    result = []
    word_index = 1
    answer_index = 0

    for question in tqdm.tqdm(data['questions']):
        words = nltk.word_tokenize(question['question'])
        question_token = []

        for word in words:
            try:
                question_token.append(word_dic[word])

            except:
                question_token.append(word_index)
                word_dic[word] = word_index
                word_index += 1

        answer_word = question['answer']

        try:
            answer = answer_dic[answer_word]

        except:
            answer = answer_index
            answer_dic[answer_word] = answer_index
            answer_index += 1

        result.append(
            (
                question['image_filename'],
                question_token,
                answer,
                question['question_family_index'],
            )
        )

    with open(f'data/{split}.pkl', 'wb') as f:
        pickle.dump(result, f)

    return word_dic, answer_dic


resize = transforms.Resize([128, 128])


def process_image(path, output_dir):
    images = os.listdir(path)

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    for imgfile in tqdm.tqdm(images):
        img = Image.open(os.path.join(path, imgfile)).convert('RGB')
        img = resize(img)
        img.save(os.path.join(output_dir, imgfile))


if __name__ == '__main__':
    root = sys.argv[1]

    word_dic, answer_dic = process_question(root, 'train')
    process_question(root, 'val', word_dic, answer_dic)

    with open('data/dic.pkl', 'wb') as f:
        pickle.dump({'word_dic': word_dic, 'answer_dic': answer_dic}, f)

    process_image(
        os.path.join(sys.argv[1], 'images/train'),
        os.path.join(sys.argv[1], 'images/train_preprocessed'),
    )
    process_image(
        os.path.join(sys.argv[1], 'images/val'),
        os.path.join(sys.argv[1], 'images/val_preprocessed'),
    )
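Two details of process_question above matter downstream: word indices start at 1, so index 0 stays free as the padding value that collate_data writes, which is why train.py sizes the embedding as `len(word_dic) + 1`; and each pickled entry is an `(image_filename, question_token, answer, question_family_index)` tuple. A small inspection sketch of the files it writes (the example filename in the comment is illustrative):

```
import pickle

# Run this after `python preprocess.py [CLEVR directory]` has created the files.
with open('data/dic.pkl', 'rb') as f:
    dic = pickle.load(f)

with open('data/train.pkl', 'rb') as f:
    train = pickle.load(f)

imgfile, tokens, answer, family = train[0]
print(len(dic['word_dic']), len(dic['answer_dic']))  # vocabulary size, answer count
print(imgfile)     # e.g. 'CLEVR_train_000000.png'
print(tokens[:5])  # word ids, all >= 1 (0 is left for padding)
print(answer)      # index into dic['answer_dic']
print(family)      # CLEVR question_family_index
```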
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import sys
import pickle
from collections import Counter

import torch
from torch import nn
from torch import optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from tqdm import tqdm

from dataset import CLEVR, collate_data, transform
from model import RelationNetworks

batch_size = 640
lr = 5e-6
lr_max = 5e-4
lr_gamma = 2
lr_step = 20
clip_norm = 50
reverse_question = True
weight_decay = 1e-4
n_epoch = 500
n_worker = 9
data_parallel = True


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def train(epoch):
    train_set = DataLoader(
        CLEVR(
            sys.argv[1],
            transform=transform,
            reverse_question=reverse_question,
            use_preprocessed=True,
        ),
        batch_size=batch_size,
        num_workers=n_worker,
        shuffle=True,
        collate_fn=collate_data,
    )

    dataset = iter(train_set)
    pbar = tqdm(dataset)
    moving_loss = 0

    relnet.train(True)
    for i, (image, question, q_len, answer, _) in enumerate(pbar):
        image, question, q_len, answer = (
            image.to(device),
            question.to(device),
            torch.tensor(q_len),
            answer.to(device),
        )

        relnet.zero_grad()
        output = relnet(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()
        nn.utils.clip_grad_norm_(relnet.parameters(), clip_norm)
        optimizer.step()

        correct = output.data.cpu().numpy().argmax(1) == answer.data.cpu().numpy()
        correct = correct.sum() / batch_size

        if moving_loss == 0:
            moving_loss = correct

        else:
            moving_loss = moving_loss * 0.99 + correct * 0.01

        pbar.set_description(
            'Epoch: {}; Loss: {:.5f}; Acc: {:.5f}; LR: {:.6f}'.format(
                epoch + 1,
                loss.detach().item(),
                moving_loss,
                optimizer.param_groups[0]['lr'],
            )
        )


def valid(epoch):
    valid_set = DataLoader(
        CLEVR(
            sys.argv[1],
            'val',
            transform=None,
            reverse_question=reverse_question,
            use_preprocessed=True,
        ),
        batch_size=batch_size // 2,
        num_workers=4,
        collate_fn=collate_data,
    )
    dataset = iter(valid_set)

    relnet.eval()
    class_correct = Counter()
    class_total = Counter()

    with torch.no_grad():
        for image, question, q_len, answer, answer_class in tqdm(dataset):
            image, question, q_len = (
                image.to(device),
                question.to(device),
                torch.tensor(q_len),
            )

            output = relnet(image, question, q_len)
            correct = output.data.cpu().numpy().argmax(1) == answer.numpy()
            for c, class_ in zip(correct, answer_class):
                if c:
                    class_correct[class_] += 1
                class_total[class_] += 1

    class_correct['total'] = sum(class_correct.values())
    class_total['total'] = sum(class_total.values())

    with open('log/log_{}.txt'.format(str(epoch + 1).zfill(3)), 'w') as w:
        for k, v in class_total.items():
            w.write('{}: {:.5f}\n'.format(k, class_correct[k] / v))

    print('Avg Acc: {:.5f}'.format(class_correct['total'] / class_total['total']))


if __name__ == '__main__':
    with open('data/dic.pkl', 'rb') as f:
        dic = pickle.load(f)

    n_words = len(dic['word_dic']) + 1
    n_answers = len(dic['answer_dic'])

    relnet = RelationNetworks(n_words)
    if data_parallel:
        relnet = nn.DataParallel(relnet)
    relnet = relnet.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(relnet.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = StepLR(optimizer, step_size=lr_step, gamma=lr_gamma)

    for epoch in range(n_epoch):
        if scheduler.get_lr()[0] < lr_max:
            scheduler.step()

        train(epoch)
        valid(epoch)

        with open(
            'checkpoint/checkpoint_{}.model'.format(str(epoch + 1).zfill(3)), 'wb'
        ) as f:
            torch.save(relnet.state_dict(), f)
--------------------------------------------------------------------------------
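train.py writes one state dict per epoch into checkpoint/. When `data_parallel` is enabled, nn.DataParallel prefixes every parameter name with `module.`, so loading such a checkpoint into a bare RelationNetworks needs the prefix stripped first. A hedged loading sketch (the checkpoint filename is illustrative):

```
import pickle

import torch

from model import RelationNetworks

with open('data/dic.pkl', 'rb') as f:
    dic = pickle.load(f)

relnet = RelationNetworks(len(dic['word_dic']) + 1)

state = torch.load('checkpoint/checkpoint_010.model', map_location='cpu')
# Strip the 'module.' prefix that nn.DataParallel adds to parameter names.
state = {k[len('module.'):] if k.startswith('module.') else k: v
         for k, v in state.items()}
relnet.load_state_dict(state)
relnet.eval()
```

From there, the same pattern as valid() above — preprocessed images, zero-padded questions, and their true lengths — produces predictions with `relnet(image, question, q_len)`.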